all repos — sunstroke @ 733a0a23988fb074c93e6c398d8b9142ee180b29

Sole.py (view raw)

 1import feedparser
 2from html.parser import HTMLParser
 3from datetime import datetime
 4from re import compile
 5
 6N_LINKS_TO_REMOVE = 2
 7REGEX_DATE = compile("\(([\d\.]*)\)")
 8OVERPOST_URL = "https://overpost.biz/e-books/quotidiani/rss.xml"
 9
def add_or_update(dictionary, key, value):
    """Append *value* to the list stored under *key*, creating the list if needed.

    The dictionary is modified in place and nothing is returned.
    """
    # setdefault only inserts the fresh list when the key is missing.
    bucket = dictionary.setdefault(key, [])
    bucket.append(value)
15
class PostParser(HTMLParser):
    """Collect the hyperlinks of an HTML fragment, grouped by link text.

    For every piece of text sitting directly inside an ``<a href=...>``
    element, underscores are replaced by spaces and everything from the first
    " - " onwards is dropped; the result is the key under which the anchor's
    ``href`` is stored. Several anchors with the same key accumulate in one
    list, in document order.

    Open elements are tracked on a stack, so text following a closed anchor
    (even one that contained nested or unclosed elements) is never mistaken
    for link text. Anchors without an ``href`` are ignored rather than being
    attributed to the previous anchor's URL.
    """

    def __init__(self):
        super().__init__()
        self.links = {}            # cleaned link text -> list of hrefs
        self.current_link = None   # href of the most recently opened <a>
        self._open_tags = []       # currently open elements, innermost last

    @property
    def current_tag(self):
        """Innermost open tag, or None when no element is open."""
        return self._open_tags[-1] if self._open_tags else None

    @property
    def prev_tag(self):
        """Tag enclosing the innermost open one, or None if there is none."""
        return self._open_tags[-2] if len(self._open_tags) > 1 else None

    def handle_starttag(self, tag, attrs):
        """Record *tag* as open; for anchors, remember their href."""
        if tag == "br":
            # Line breaks have no content and no end tag: never track them.
            return
        self._open_tags.append(tag)
        if tag == "a":
            # Reset on every anchor so a missing href cannot reuse a stale URL.
            # dict() keeps the last value when the attribute is repeated.
            self.current_link = dict(attrs).get("href")

    def handle_endtag(self, tag):
        """Close *tag*, along with any unclosed elements nested inside it."""
        if tag not in self._open_tags:
            # Stray end tag, e.g. the one HTMLParser emits for "<br/>".
            return
        while self._open_tags.pop() != tag:
            pass

    def handle_data(self, data):
        """Store the current href under the cleaned text of its anchor."""
        if self.current_tag != "a" or self.current_link is None:
            return
        key = data.replace("_", " ").split(" - ")[0]
        self.links.setdefault(key, []).append(self.current_link)

    def get_links(self):
        """Return a shallow copy of the collected ``{text: [href, ...]}`` map."""
        return self.links.copy()
45    
def parse_html(html):
    """Run *html* through a fresh PostParser and return the links it collected.

    The result is whatever PostParser.get_links() yields: a dictionary mapping
    cleaned link text to the list of hrefs found under it.
    """
    post_parser = PostParser()
    post_parser.feed(html)
    collected = post_parser.get_links()
    return collected
50
def remove_first(d):
    """Remove the first item of *d* (iteration order) and return it.

    Returns a ``(key, value)`` tuple. An empty dictionary makes ``next`` raise
    StopIteration, which is propagated to the caller.
    """
    first_key = next(iter(d))
    first_value = d.pop(first_key)
    return first_key, first_value
53
def remove_first_n(d, n):
    """Remove up to the first *n* items of *d*, in iteration order, in place.

    If *d* holds fewer than *n* items it is simply emptied instead of raising,
    so posts with very few links do not abort the caller. A non-positive *n*
    removes nothing. Returns None.
    """
    # Snapshot the keys first: a dict must not change size while iterated.
    # max() keeps a negative n from being read as "all but the last |n|".
    for key in list(d)[:max(n, 0)]:
        del d[key]
57
def parse_entry(entry): # entry = day
    """Turn one feed entry into a ``(date, links)`` pair.

    The date is the first REGEX_DATE match in ``entry.title``, parsed as
    day.month.year. The links come from parse_html applied to
    ``entry.turbo_content``, minus the first N_LINKS_TO_REMOVE groups.

    Raises IndexError when the title holds no match and ValueError when the
    matched text is not a valid "%d.%m.%Y" date.
    """
    raw_date = REGEX_DATE.findall(entry.title)[0]
    day_links = parse_html(entry.turbo_content)

    # Drop the leading link groups before handing the dictionary back.
    remove_first_n(day_links, N_LINKS_TO_REMOVE)
    parsed_date = datetime.strptime(raw_date, "%d.%m.%Y")
    return (parsed_date, day_links)
64
def get_links(rss_url):
    """Parse the feed at *rss_url* and return one parse_entry result per entry.

    The list follows the order of ``feed.entries`` as returned by feedparser.
    """
    parsed_feed = feedparser.parse(rss_url)
    per_day = []
    for day_entry in parsed_feed.entries:
        per_day.append(parse_entry(day_entry))
    return per_day
68
def get_sole():
    """Return the "Il Sole 24 Ore" link groups of one day of the feed.

    Reads the feed at OVERPOST_URL and keeps, from the selected entry, only
    the link groups whose name starts with "Il Sole 24 Ore".
    """
    days = get_links(OVERPOST_URL)
    # NOTE(review): index 1 selects the *second* feed entry although the
    # original local was named "today" - confirm this offset is intended.
    selected_day = days[1]
    day_links = selected_day[1]  # (date, links) pair: take the links
    sole_links = {}
    for title, urls in day_links.items():
        if title.startswith("Il Sole 24 Ore"):
            sole_links[title] = urls
    return sole_links
73
# NOTE(review): rebinds OVERPOST_URL at import time, so get_sole() reads this
# local file rather than the remote feed defined near the top of the module.
# Looks like a debugging leftover - confirm before relying on it.
OVERPOST_URL = r"/home/marco/Documenti/overpost/rss.xml"

if __name__ == "__main__":
    # Script entry point: print the selected "Il Sole 24 Ore" links.
    sole_links = get_sole()
    print(sole_links)