Overpost.py (view raw)
1from html.parser import HTMLParser
2from datetime import datetime
3from re import compile
4import os
5import feedparser
6from MyResolver import get
7
# Where the feed is read from: taken from the RSS_URL environment variable,
# falling back to "rss.xml" in the current directory.
RSS_URL = os.getenv("RSS_URL") or os.path.join(".", "rss.xml")
# How many leading links are dropped from every entry.  This is a str when it
# comes from the environment and the int 2 otherwise, so convert with int()
# before using it as a number.
N_LINKS_TO_REMOVE = os.getenv("N_LINKS_TO_REMOVE") or 2
# Captures the run of digits and dots found between parentheses, e.g.
# "01.02.2023" in "(01.02.2023)".  Written as a raw string so the backslashes
# reach the regex engine untouched instead of being treated as (invalid)
# string escape sequences.
REGEX_DATE = compile(r"\(([\d\.]*)\)")
11
def add_or_update(dictionary, key, value):
    """Append *value* to the list stored under *key* in *dictionary*.

    The list is created on first use, so a missing key ends up mapped to
    ``[value]``.  The dictionary is modified in place; nothing is returned.
    """
    dictionary.setdefault(key, []).append(value)
17
class PostParser(HTMLParser):
    """Collect the links of an HTML fragment, grouped by anchor text.

    After HTML has been fed, ``links`` maps a name to the list of ``href``
    values of every ``<a>`` whose text produced that name.  The name is the
    anchor text with underscores replaced by spaces, cut at the first
    ``" - "``.

    Only the current and the previous tag are remembered (there is no full
    tag stack), so text wrapped in another element inside an anchor, e.g.
    ``<a><b>text</b></a>``, is not recorded.
    """

    def __init__(self):
        super().__init__()
        self.links = {}
        self.prev_tag = None
        self.current_tag = None
        self.current_link = None

    def handle_starttag(self, tag, attrs):
        """Track the opened tag and remember the ``href`` of an ``<a>``."""
        if tag == "br":
            # Line breaks carry no content and must not disturb tag tracking.
            return
        self.prev_tag = self.current_tag
        self.current_tag = tag
        if tag == "a":
            # Start from None so an anchor without href cannot inherit the
            # link of the anchor that came before it.  If href is repeated,
            # the last occurrence wins.
            self.current_link = None
            for name, value in attrs:
                if name == "href":
                    self.current_link = value

    def handle_endtag(self, tag):
        """Step back to the tag that was open before the current one."""
        if tag == "br":
            # Mirror handle_starttag: a self-closing <br/> triggers both
            # callbacks and must be ignored by both.
            return
        # NOTE(review): this restores a single remembered tag, not a stack,
        # so deeper nesting is not tracked accurately.
        self.current_tag = self.prev_tag

    def handle_data(self, data):
        """Record text found directly inside an anchor that has a link."""
        if self.current_tag != "a" or self.current_link is None:
            return
        key = data.replace("_", " ").split(" - ")[0]
        self.links.setdefault(key, []).append(self.current_link)

    def get_links(self):
        """Return a shallow copy of the collected name -> links mapping."""
        return self.links.copy()
47
def parse_html(html):
    """Parse *html* with a fresh PostParser and return the links it collected.

    Returns the mapping produced by ``PostParser.get_links()``.
    """
    parser = PostParser()
    parser.feed(html)
    # feed() only processes complete elements and keeps incomplete trailing
    # data buffered; close() forces that remainder to be processed as well.
    parser.close()
    return parser.get_links()
52
def dict_pop(d):
    """Remove the first item of *d* and return it as a ``(key, value)`` pair.

    "First" is the first key yielded when iterating *d*.  An empty mapping
    makes ``next()`` raise StopIteration, which is propagated to the caller.
    """
    first_key = next(iter(d))
    first_value = d.pop(first_key)
    return first_key, first_value
55
def dict_pop_first_n(d, n):
    """Remove up to the first *n* items of *d* and return them.

    Items are taken in the iteration order of *d* and returned as a list of
    ``(key, value)`` pairs.  When *d* holds fewer than *n* items, everything
    it holds is removed and returned instead of failing half-way through.
    A non-positive *n* removes nothing.
    """
    # Snapshot the keys first: the dict cannot be mutated while iterating it.
    # max() keeps a negative n from turning into a "drop from the end" slice.
    first_keys = list(d)[:max(n, 0)]
    return [(key, d.pop(key)) for key in first_keys]
58
def parse_entry(entry):  # entry = day
    """Turn one feed entry into a ``(date, links)`` pair.

    The date is the first parenthesised match of REGEX_DATE in
    ``entry.title``, parsed as ``%d.%m.%Y``.  The links come from
    ``entry.turbo_content`` with the first N_LINKS_TO_REMOVE items removed.

    Raises:
        IndexError: if the title contains no parenthesised date.
        ValueError: if the matched text is not a ``%d.%m.%Y`` date.
    """
    raw_date = REGEX_DATE.findall(entry.title)[0]
    day_links = parse_html(entry.turbo_content)

    # Drop the leading links in place; the popped items are not needed.
    dict_pop_first_n(day_links, int(N_LINKS_TO_REMOVE))
    parsed_date = datetime.strptime(raw_date, "%d.%m.%Y")
    return (parsed_date, day_links)
65
def handle_url(url):
    """Return the text behind *url*, or *url* itself when it is not remote.

    A value starting with ``"http"`` is fetched with ``get`` and the
    ``.text`` of the response is returned; anything else is handed back
    unchanged.
    """
    if not url.startswith("http"):
        return url
    response = get(url)
    return response.text
71
def get_links(rss_url):
    """Parse the feed at *rss_url* and return one parsed result per entry.

    The value is first resolved with ``handle_url`` and then handed to
    ``feedparser.parse``; every entry of the resulting feed goes through
    ``parse_entry``, preserving feed order.
    """
    source = handle_url(rss_url)
    parsed_feed = feedparser.parse(source)
    return list(map(parse_entry, parsed_feed.entries))
75
def get_newspaper(prefix="", index=0):
    """Return the links of one feed entry whose names start with *prefix*.

    Args:
        prefix: only names starting with this string are kept; the default
            empty string keeps everything.
        index: position of the entry within the parsed feed (default: first).

    Returns:
        A dict of name -> links for the selected entry.  When there is no
        entry at *index*, ``"Empty feed."`` is printed and ``{}`` is returned.
    """
    parsed_days = get_links(RSS_URL)
    try:
        daily = parsed_days[index][1]
    except IndexError:
        print("Empty feed.")
        return {}

    matching = {}
    for name, urls in daily.items():
        if name.startswith(prefix):
            matching[name] = urls
    return matching
84
if __name__ == "__main__":
    # Run as a script: print the links whose name starts with "Il Sole".
    newspaper = get_newspaper("Il Sole")
    print(newspaper)