Sole.py (view raw)
1import feedparser
2from html.parser import HTMLParser
3from datetime import datetime
4from re import compile
5
# Number of leading link groups dropped from every entry by parse_entry().
N_LINKS_TO_REMOVE = 2
# Captures the run of digits and dots found between parentheses,
# e.g. "01.02.2021" in "(01.02.2021)".
# Written as a raw string: "\(", "\d" and "\." are invalid escapes in a
# normal literal and trigger a warning on recent Python versions.
REGEX_DATE = compile(r"\(([\d\.]*)\)")
# Address of the online RSS feed.
OVERPOST_URL = "https://overpost.biz/e-books/quotidiani/rss.xml"
9
def add_or_update(dictionary, key, value):
    """Append *value* to the list stored under *key*, creating it if needed.

    Args:
        dictionary: Mapping from keys to lists; modified in place.
        key: Entry to extend.
        value: Item appended to the entry's list.
    """
    dictionary.setdefault(key, []).append(value)
15
class PostParser(HTMLParser):
    """Collect the hyperlinks of an HTML fragment, grouped by link text.

    While the markup is fed in, ``links`` is filled with one entry per link
    title; each entry is the list of ``href`` values seen under that title.
    The title is the text read inside an ``<a>``, with underscores replaced
    by spaces and everything from the first ``" - "`` onwards discarded.

    NOTE(review): only a single level of tag history is kept (``prev_tag``),
    so after nested or never-closed elements ``current_tag`` may not match
    the element actually enclosing the text.
    """

    def __init__(self):
        super().__init__()
        # title -> list of hrefs, in the order they were encountered
        self.links = {}
        # tag that was current before the most recent start tag
        self.prev_tag = None
        # tag the parser currently considers itself inside of
        self.current_tag = None
        # href of the latest <a> that carried one; it is NOT reset when an
        # <a> without href is opened
        self.current_link = None

    def handle_starttag(self, tag, attrs):
        """Record the opened tag and, for ``<a>``, remember its ``href``."""
        if tag == "br":
            # <br> start tags do not take part in the tag tracking.
            return
        self.prev_tag, self.current_tag = self.current_tag, tag
        if tag != "a":
            return
        hrefs = [attr_value for attr_name, attr_value in attrs
                 if attr_name == "href"]
        if hrefs:
            # With a repeated attribute the last occurrence wins.
            self.current_link = hrefs[-1]

    def handle_endtag(self, tag):
        """Go back to the tag that was current before the last start tag."""
        self.current_tag = self.prev_tag

    def handle_data(self, data):
        """Store *data* as a link title when the current tag is ``<a>``."""
        if self.current_tag != "a":
            return
        title, _, _ = data.replace("_", " ").partition(" - ")
        self.links.setdefault(title, []).append(self.current_link)

    def get_links(self):
        """Return a shallow copy of the title -> hrefs mapping."""
        return dict(self.links)
45
def parse_html(html):
    """Extract the links contained in an HTML fragment.

    Args:
        html: Markup to scan, as a string.

    Returns:
        The mapping built by ``PostParser``: link title -> list of hrefs.
    """
    parser = PostParser()
    parser.feed(html)
    # feed() may keep a trailing chunk buffered while waiting for more
    # input; close() forces it through so no text is silently dropped.
    parser.close()
    return parser.get_links()
50
def remove_first(d):
    """Remove the first key of *d* (in iteration order) and return it.

    Args:
        d: Dictionary to modify in place.

    Returns:
        A ``(key, value)`` tuple for the removed item.

    Raises:
        StopIteration: If *d* is empty.
    """
    first_key = next(iter(d))
    first_value = d.pop(first_key)
    return first_key, first_value
53
def remove_first_n(d, n):
    """Remove up to *n* leading items (in iteration order) from *d*.

    When *d* holds fewer than *n* items it is simply emptied instead of
    letting a ``StopIteration`` escape to the caller.

    Args:
        d: Dictionary to modify in place.
        n: Maximum number of items to drop; values <= 0 remove nothing.
    """
    # Bound the loop by the current size so a short dict cannot raise.
    for _ in range(min(n, len(d))):
        del d[next(iter(d))]
57
def parse_entry(entry):  # entry = day
    """Turn one feed entry into a ``(date, links)`` pair.

    Args:
        entry: Feed entry exposing ``title`` and ``turbo_content``.

    Returns:
        A tuple ``(datetime, dict)``: the date found between parentheses in
        the title, and the title -> hrefs mapping parsed from the entry's
        HTML with the first ``N_LINKS_TO_REMOVE`` keys dropped.

    Raises:
        IndexError: If the title contains no parenthesised date.
        ValueError: If the captured text is not a ``dd.mm.YYYY`` date.
    """
    date_matches = REGEX_DATE.findall(entry.title)
    raw_date = date_matches[0]

    links_by_title = parse_html(entry.turbo_content)
    remove_first_n(links_by_title, N_LINKS_TO_REMOVE)

    day = datetime.strptime(raw_date, "%d.%m.%Y")
    return (day, links_by_title)
64
def get_links(rss_url):
    """Parse a feed and return one ``(date, links)`` pair per entry.

    Args:
        rss_url: Feed location, passed as-is to ``feedparser.parse``.

    Returns:
        A list of ``parse_entry`` results, in feed order.
    """
    parsed_feed = feedparser.parse(rss_url)
    days = []
    for feed_entry in parsed_feed.entries:
        days.append(parse_entry(feed_entry))
    return days
68
def get_sole():
    """Return the "Il Sole 24 Ore" links of one feed entry.

    Returns:
        A dict title -> hrefs restricted to the titles starting with
        ``"Il Sole 24 Ore"``.

    Raises:
        IndexError: If the feed has fewer than two entries.
    """
    days = get_links(OVERPOST_URL)
    # NOTE(review): index 1 selects the second feed entry, not the first --
    # confirm this is the intended day.
    _, links_by_title = days[1]
    sole_links = {}
    for title, hrefs in links_by_title.items():
        if title.startswith("Il Sole 24 Ore"):
            sole_links[title] = hrefs
    return sole_links
73
if __name__ == "__main__":
    # Development override: when run as a script, read a local copy of the
    # feed. It lives inside the guard so that merely importing this module
    # does not replace the feed address with a machine-specific path.
    OVERPOST_URL = r"/home/marco/Documenti/overpost/rss.xml"
    print(get_sole())