Overpost.py (view raw)
1from html.parser import HTMLParser
2from datetime import datetime
3from re import compile
4import os
5import feedparser
6from dotenv import load_dotenv
7load_dotenv()
8
# Where the feed is read from; falls back to ./rss.xml when RSS_URL is unset
# or empty.
RSS_URL = os.getenv("RSS_URL") or os.path.join(".", "rss.xml")
# How many leading link titles are discarded from every entry. This is a str
# when it comes from the environment and the int 2 otherwise, so it is
# converted with int() where it is used.
N_LINKS_TO_REMOVE = os.getenv("N_LINKS_TO_REMOVE") or 2
# Captures the run of digits and dots found between parentheses, e.g. the
# "31.12.2023" in "(31.12.2023)". Written as a raw string because "\(" and
# "\d" are invalid escape sequences in an ordinary string literal.
REGEX_DATE = compile(r"\(([\d\.]*)\)")
12
def add_or_update(dictionary, key, value):
    """Append *value* to the list stored under *key*, creating it if needed.

    Args:
        dictionary: Mapping from keys to lists; modified in place.
        key: Entry whose list is extended.
        value: Item appended to that list.
    """
    dictionary.setdefault(key, []).append(value)
18
class PostParser(HTMLParser):
    """Collect the links of an HTML fragment, grouped by their visible text.

    After markup has been fed, ``links`` maps each link title to the list of
    ``href`` values seen with that title, in document order. The title is
    the text found directly inside an ``<a>`` element, with underscores
    turned into spaces and everything from the first " - " onwards removed.
    Anchors without an ``href`` attribute are ignored.
    """

    # Elements that never have an end tag. Tracking them as "open" would hide
    # the enclosing <a> from handle_data for the rest of the anchor.
    _VOID_TAGS = frozenset({
        "area", "base", "br", "col", "embed", "hr", "img", "input",
        "link", "meta", "source", "track", "wbr",
    })

    def __init__(self):
        super().__init__()
        self.links = {}           # title -> list of hrefs
        self.prev_tag = None      # parent of current_tag, None at top level
        self.current_tag = None   # innermost element that is still open
        self.current_link = None  # href of the latest <a>; None if it had none
        self._open_tags = []      # stack of the currently open elements

    def _sync_tags(self):
        """Refresh ``current_tag`` and ``prev_tag`` from the open-tag stack."""
        stack = self._open_tags
        self.current_tag = stack[-1] if stack else None
        self.prev_tag = stack[-2] if len(stack) > 1 else None

    def handle_starttag(self, tag, attrs):
        """Record *tag* as open and, for anchors, remember their target."""
        if tag in self._VOID_TAGS:
            return
        self._open_tags.append(tag)
        self._sync_tags()
        if tag == "a":
            # Overwrite unconditionally so that an anchor without href cannot
            # inherit the target of the previous link.
            self.current_link = dict(attrs).get("href")

    def handle_endtag(self, tag):
        """Close *tag* together with any children left unclosed inside it."""
        if tag not in self._open_tags:
            # Stray end tag, or the implicit one HTMLParser emits for "<br/>".
            return
        while self._open_tags.pop() != tag:
            pass
        self._sync_tags()

    def handle_data(self, data):
        """Store text found directly inside an anchor that has an href."""
        if self.current_tag != "a" or self.current_link is None:
            return
        key = data.replace("_", " ").split(" - ")[0]
        self.links.setdefault(key, []).append(self.current_link)

    def get_links(self):
        """Return a copy of the collected links.

        Both the dict and its lists are copied, so the caller may mutate the
        result without affecting the parser.
        """
        return {title: list(urls) for title, urls in self.links.items()}
48
def parse_html(html):
    """Return the links found in *html*, grouped by link title.

    Args:
        html: Markup to scan, as a string.

    Returns:
        The dict produced by ``PostParser.get_links()`` after the whole
        document has been processed.
    """
    parser = PostParser()
    parser.feed(html)
    # feed() may hold back trailing text while waiting for more input;
    # close() tells the parser the document is complete.
    parser.close()
    return parser.get_links()
53
def dict_pop(d):
    """Remove and return the first ``(key, value)`` pair of *d*.

    "First" means first in iteration order, unlike ``dict.popitem`` which
    removes the last pair.

    Raises:
        KeyError: If *d* is empty.
    """
    try:
        first_key = next(iter(d))
    except StopIteration:
        # A leaked StopIteration would silently end any generator this is
        # called from, so report the empty dict the way dict.popitem does.
        raise KeyError("dict_pop(): dictionary is empty") from None
    return first_key, d.pop(first_key)
56
def dict_pop_first_n(d, n):
    """Remove and return up to the first *n* ``(key, value)`` pairs of *d*.

    Args:
        d: Dictionary to shrink; modified in place.
        n: Number of leading pairs to remove. Values of zero or less remove
            nothing.

    Returns:
        The removed pairs in iteration order. Fewer than *n* pairs are
        returned when *d* is shorter than that.
    """
    # Snapshot the keys first: the dict cannot be resized while iterating it.
    first_keys = list(d)[:max(n, 0)]
    return [(key, d.pop(key)) for key in first_keys]
59
def parse_entry(entry):  # entry = day
    """Turn one feed entry (one day) into a ``(date, links)`` pair.

    Args:
        entry: Feed entry exposing ``title``, which contains the date as
            ``(dd.mm.yyyy)``, and ``turbo_content``, the HTML body.

    Returns:
        A tuple of the entry date as a ``datetime`` and the dict returned by
        ``parse_html`` for the body, minus its first ``N_LINKS_TO_REMOVE``
        titles.

    Raises:
        ValueError: If the title holds no parenthesised date, or the date
            does not match ``%d.%m.%Y``.
    """
    found = REGEX_DATE.search(entry.title)
    if found is None:
        raise ValueError(f"no date found in entry title: {entry.title!r}")
    links = parse_html(entry.turbo_content)

    # N_LINKS_TO_REMOVE is a str when it was read from the environment.
    dict_pop_first_n(links, int(N_LINKS_TO_REMOVE))
    return (datetime.strptime(found.group(1), "%d.%m.%Y"), links)
66
def get_links(rss_url):
    """Parse the feed at *rss_url* and return one result per entry.

    Each element is whatever ``parse_entry`` returns for the corresponding
    entry, in feed order.
    """
    parsed_feed = feedparser.parse(rss_url)
    return list(map(parse_entry, parsed_feed.entries))
70
def get_newspaper(prefix="", index=0):
    """Return the links of one feed entry whose title starts with *prefix*.

    The feed at ``RSS_URL`` is parsed on every call.

    Args:
        prefix: Only titles beginning with this string are kept; the default
            empty string keeps them all.
        index: Position of the entry within the feed.

    Returns:
        Dict of title -> links for the selected entry. When the feed has no
        entry at *index*, "Empty feed." is printed and an empty dict is
        returned.
    """
    entries = get_links(RSS_URL)
    try:
        _, day_links = entries[index]
    except IndexError:
        print("Empty feed.")
        return {}

    matching = {}
    for title, urls in day_links.items():
        if title.startswith(prefix):
            matching[title] = urls
    return matching
79
if __name__ == "__main__":
    # Run as a script: show the first entry's links whose title starts with
    # "Il Sole".
    sole_links = get_newspaper("Il Sole")
    print(sole_links)