# Overpost.py — from repo "sunstroke" @ commit eceadcf2f0e4150b131b14e7c3c9553f1169b87e
 1from html.parser import HTMLParser
 2from datetime import datetime
 3from re import compile
 4import os
 5import feedparser
 6from dotenv import load_dotenv
 7load_dotenv()
 8
 9RSS_URL = os.getenv("RSS_URL") or os.path.join(".", "rss.xml")
10N_LINKS_TO_REMOVE = os.getenv("N_LINKS_TO_REMOVE") or 2
11REGEX_DATE = compile("\(([\d\.]*)\)")
12
def add_or_update(dictionary, key, value):
    """Append value to the list stored at key, creating the list on first use."""
    dictionary.setdefault(key, []).append(value)
18
class PostParser(HTMLParser):
    """Accumulates the anchors of a post body.

    Maps a cleaned-up anchor text (underscores to spaces, everything after the
    first " - " dropped) to the list of href URLs seen under that text.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.links = {}          # {anchor-text key: [href, ...]}
        self.prev_tag = None     # tag open before the current one (1-level memory)
        self.current_tag = None  # tag we are currently inside
        self.current_link = None # href of the most recent <a> carrying one

    def handle_starttag(self, tag, attrs):
        # <br> is a void separator; tracking it would clobber the
        # "currently inside <a>" state, so ignore it entirely.
        if tag == "br":
            return
        self.prev_tag, self.current_tag = self.current_tag, tag
        if tag == "a":
            # Last href attribute wins, matching attribute order in the markup.
            for name, value in attrs:
                if name == "href":
                    self.current_link = value

    def handle_endtag(self, tag):
        # One-level "pop": restore whatever tag was open before the last start tag.
        self.current_tag = self.prev_tag

    def handle_data(self, data):
        # Only text inside an <a> matters: "Title_-_01" becomes the key "Title"
        # and the pending href is filed under it.
        if self.current_tag == "a":
            key = data.replace("_", " ").split(" - ")[0]
            self.links.setdefault(key, []).append(self.current_link)

    def get_links(self):
        """Return a shallow copy of the accumulated {text: [urls]} mapping."""
        return self.links.copy()
48    
def parse_html(html):
    """Feed the post's HTML body through PostParser and return its link mapping."""
    post_parser = PostParser()
    post_parser.feed(html)
    return post_parser.get_links()
53
def dict_pop(d):
    """Remove the first (insertion-order) item of d and return it as (key, value)."""
    first_key = next(iter(d))
    return first_key, d.pop(first_key)
56
def dict_pop_first_n(d, n):
    """Remove the first n insertion-order items of d, returning them as
    (key, value) pairs.

    If d holds fewer than n items, all of them are removed and returned:
    the original comprehension exhausted the dict and died with a
    RuntimeError (StopIteration escaping a comprehension). Negative n
    removes nothing, matching the original range(n) behavior.
    """
    # Snapshot the keys first: popping while iterating the dict would raise.
    first_keys = list(d)[:max(n, 0)]
    return [(key, d.pop(key)) for key in first_keys]
59
def parse_entry(entry):
    """Turn one feed entry (one day's post) into a (date, links) pair.

    The date comes from a "(dd.mm.yyyy)" marker in the entry title; the links
    come from the entry's HTML body, with the first N_LINKS_TO_REMOVE links
    dropped in place before returning.
    """
    # NOTE(review): "turbo_content" is a feed-specific field — confirm the
    # source feed actually provides it.
    day_string = REGEX_DATE.findall(entry.title)[0]
    link_map = parse_html(entry.turbo_content)
    # Discard the leading links (presumably boilerplate — TODO confirm).
    dict_pop_first_n(link_map, int(N_LINKS_TO_REMOVE))
    return datetime.strptime(day_string, "%d.%m.%Y"), link_map
66
def get_links(rss_url):
    """Parse the RSS feed at rss_url into a list of (date, links) per entry."""
    feed = feedparser.parse(rss_url)
    return list(map(parse_entry, feed.entries))
70
def get_newspaper(prefix="", index=0):
    """Return the {title: [urls]} mapping for one day of the feed.

    prefix keeps only titles starting with it; index selects which feed
    entry to use. Prints a notice and returns {} when the feed has no
    entry at that index.
    """
    per_day = get_links(RSS_URL)
    try:
        _, day_links = per_day[index]
    except IndexError:
        print("Empty feed.")
        return {}
    filtered = {}
    for title, urls in day_links.items():
        if title.startswith(prefix):
            filtered[title] = urls
    return filtered
79
if __name__ == "__main__":
    # Demo run: print the latest entry's links whose titles start with "Il Sole".
    newspaper_links = get_newspaper("Il Sole")
    print(newspaper_links)