Overpost.py
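"""Collect daily newspaper links from an RSS feed.

Each feed entry is one day: its HTML body is parsed into a dict mapping a
newspaper name to the list of link URLs found for it, after the first
N_LINKS_TO_REMOVE links of the day are dropped.
"""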
from html.parser import HTMLParser
from datetime import datetime
from re import compile
import os
import feedparser
from dotenv import load_dotenv
from MyResolver import get  # local helper used to fetch the remote RSS feed
load_dotenv()

RSS_URL = os.getenv("RSS_URL") or os.path.join(".", "rss.xml")  # remote feed URL or local fallback file
N_LINKS_TO_REMOVE = os.getenv("N_LINKS_TO_REMOVE") or 2  # number of leading links to drop per entry
REGEX_DATE = compile(r"\(([\d\.]*)\)")  # date between parentheses in the entry title, e.g. "(31.12.2023)"

def add_or_update(dictionary, key, value):
    # Append value to the list stored under key, creating the list on first use.
    try:
        dictionary[key].append(value)
    except KeyError:
        dictionary[key] = [value]

class PostParser(HTMLParser):
    # Collects the <a> links of a post as {newspaper name: [url, url, ...]}.
    def __init__(self):
        super().__init__()
        self.links = {}
        self.prev_tag = None
        self.current_tag = None
        self.current_link = None

    def handle_starttag(self, tag, attrs):
        if tag == "br":  # line breaks don't change the current context
            return
        self.prev_tag = self.current_tag
        self.current_tag = tag
        if tag == "a":
            for at in attrs:
                if at[0] == "href":
                    self.current_link = at[1]

    def handle_endtag(self, tag):
        self.current_tag = self.prev_tag

    def handle_data(self, data):
        if self.current_tag == "a":
            # The anchor text is split on " - ": the first part is the newspaper name.
            key = data.replace("_", " ").split(" - ")[0]
            value = self.current_link
            add_or_update(self.links, key, value)

    def get_links(self):
        return self.links.copy()

def parse_html(html):
    parser = PostParser()
    parser.feed(html)
    return parser.get_links()

def dict_pop(d):
    # Pop the first (oldest-inserted) key of d and return the (key, value) pair.
    return (k := next(iter(d)), d.pop(k))

def dict_pop_first_n(d, n):
    # Pop the first n entries of d in insertion order; d is modified in place.
    return [dict_pop(d) for _ in range(n)]

def parse_entry(entry):  # entry = day
    date = REGEX_DATE.findall(entry.title)[0]
    links = parse_html(entry.turbo_content)

    # Discard the first N_LINKS_TO_REMOVE links of the day.
    dict_pop_first_n(links, int(N_LINKS_TO_REMOVE))
    return (datetime.strptime(date, "%d.%m.%Y"), links)

def handle_url(url):
    # Remote feeds are fetched through MyResolver.get; local paths go straight to feedparser.
    if url.startswith("http"):
        return get(url)
    else:
        return url

def get_links(rss_url):
    feed = feedparser.parse(handle_url(rss_url))
    return [parse_entry(entry) for entry in feed.entries]

def get_newspaper(prefix="", index=0):
    # Return the links of the index-th day, keeping only names that start with prefix.
    all_links = get_links(RSS_URL)
    try:
        daily = all_links[index][1]
    except IndexError:
        print("Empty feed.")
        return {}
    return {k: v for k, v in daily.items() if k.startswith(prefix)}

if __name__ == "__main__":
    print(get_newspaper("Il Sole"))
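
# Example invocation (assuming a reachable feed URL, or a local rss.xml next to the script):
#   RSS_URL=./rss.xml python Overpost.py
# This prints a dict mapping newspaper names starting with "Il Sole" to their lists of links.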