all repos — sunstroke @ dc852443b5738a07c869b612ca3ed6db9d3c3777

Overpost.py (view raw)

 1from html.parser import HTMLParser
 2from datetime import datetime
 3from re import compile
 4import os
 5import feedparser
 6from MyResolver import get
 7
 8RSS_URL = os.getenv("RSS_URL") or os.path.join(".", "rss.xml")
 9N_LINKS_TO_REMOVE = os.getenv("N_LINKS_TO_REMOVE") or 2
10REGEX_DATE = compile("\(([\d\.]*)\)")
11
def add_or_update(dictionary, key, value):
    """Append *value* to the list stored under *key*, creating the list if needed.

    The dictionary is modified in place; nothing is returned.
    """
    bucket = dictionary.setdefault(key, [])
    bucket.append(value)
17
class PostParser(HTMLParser):
    """Collect the anchors of an HTML fragment as ``{link text: [href, ...]}``.

    The text found directly inside an ``<a>`` element is normalised
    (underscores become spaces, everything from the first " - " on is
    dropped) and used as the key. Anchors whose text normalises to the same
    key accumulate their hrefs in document order.

    Open elements are tracked on a stack, so closing a nested element (or the
    anchor itself) restores the enclosing context instead of leaving the
    parser stuck "inside" an anchor. Void elements such as ``<br>`` or
    ``<img>`` never hold content and are ignored, whether written as
    ``<br>`` or ``<br/>``.
    """

    # Elements that have no content and no closing tag; they must not affect
    # the "which element are we in" state.
    _VOID_TAGS = frozenset({
        "area", "base", "br", "col", "embed", "hr", "img",
        "input", "link", "meta", "source", "track", "wbr",
    })

    def __init__(self):
        super().__init__()
        self.links = {}
        self._open_tags = []
        # Kept in sync with the stack for code that reads these attributes:
        # current_tag is the innermost open element, prev_tag its parent.
        self.prev_tag = None
        self.current_tag = None
        self.current_link = None

    def _sync_tags(self):
        """Refresh ``current_tag``/``prev_tag`` from the open-element stack."""
        stack = self._open_tags
        self.current_tag = stack[-1] if stack else None
        self.prev_tag = stack[-2] if len(stack) > 1 else None

    def handle_starttag(self, tag, attrs):
        """Push a newly opened element; remember the href of anchors."""
        if tag in self._VOID_TAGS:
            return
        self._open_tags.append(tag)
        self._sync_tags()
        if tag == "a":
            # Reset for every anchor so one without an href does not inherit
            # the previous anchor's URL. With duplicate href attributes the
            # last one wins.
            self.current_link = dict(attrs).get("href")

    def handle_endtag(self, tag):
        """Pop the closed element, together with any unclosed children."""
        # Self-closing void tags ("<br/>") arrive as a start+end pair, so the
        # end half must be ignored just like the start half. Stray end tags
        # with no matching open element are ignored as well.
        if tag in self._VOID_TAGS or tag not in self._open_tags:
            return
        while self._open_tags.pop() != tag:
            pass
        self._sync_tags()

    def handle_data(self, data):
        """Record text that sits directly inside an anchor."""
        if self.current_tag != "a":
            return
        key = data.replace("_", " ").split(" - ")[0]
        self.links.setdefault(key, []).append(self.current_link)

    def get_links(self):
        """Return a copy of the collected links (the lists are copied too)."""
        return {key: list(urls) for key, urls in self.links.items()}
47    
def parse_html(html):
    """Run *html* through a fresh PostParser and return the links it collected."""
    post_parser = PostParser()
    post_parser.feed(html)
    collected = post_parser.get_links()
    return collected
52
def dict_pop(d):
    """Remove the first item of *d* (in iteration order) and return ``(key, value)``.

    Raises StopIteration when *d* is empty.
    """
    first_key = next(iter(d))
    first_value = d.pop(first_key)
    return first_key, first_value
55
def dict_pop_first_n(d, n):
    """Remove up to *n* leading items of *d* and return them as ``(key, value)`` pairs.

    Items are taken in the dictionary's iteration order. When *d* holds fewer
    than *n* items, everything is removed and the shorter list is returned
    instead of raising. A non-positive *n* removes nothing.
    """
    # Snapshot the keys first: the dict must not change size while iterated.
    # max() guards against a negative n turning the slice into "all but last".
    leading_keys = list(d)[:max(n, 0)]
    return [(key, d.pop(key)) for key in leading_keys]
58
def parse_entry(entry):  # entry = day
    """Turn one feed entry into ``(date, links)``.

    The date is the first parenthesised digits-and-dots group in the entry
    title, parsed as day.month.year. The links come from the entry's
    ``turbo_content`` HTML, minus the first N_LINKS_TO_REMOVE of them.
    """
    date_text = REGEX_DATE.findall(entry.title)[0]
    day_links = parse_html(entry.turbo_content)

    # The popped leading links are discarded.
    # NOTE(review): presumably they are not newspaper links - confirm against the feed.
    skip_count = int(N_LINKS_TO_REMOVE)
    dict_pop_first_n(day_links, skip_count)

    day = datetime.strptime(date_text, "%d.%m.%Y")
    return (day, day_links)
65
def handle_url(url):
    """Return the downloaded body for a web URL, or *url* itself otherwise.

    Args:
        url: an ``http://`` / ``https://`` URL, or any other string
            (for example a local file path).

    Returns:
        ``get(url).text`` for web URLs; every other value is returned
        unchanged so the caller can pass it on as-is.
    """
    # Match the complete scheme, not just the letters "http": a local file
    # named e.g. "http_dump.xml" must not be sent to the resolver.
    if url.startswith(("http://", "https://")):
        return get(url).text
    return url
71
def get_links(rss_url):
    """Parse the feed at *rss_url* and return one ``parse_entry`` result per entry.

    The argument goes through ``handle_url`` first; whatever that returns is
    handed to ``feedparser.parse``.
    """
    source = handle_url(rss_url)
    feed = feedparser.parse(source)
    parsed_days = []
    for entry in feed.entries:
        parsed_days.append(parse_entry(entry))
    return parsed_days
75
def get_newspaper(prefix="", index=0):
    """Return the links of one feed entry whose name starts with *prefix*.

    Args:
        prefix: only link names starting with this string are kept; the
            default empty string keeps all of them.
        index: position of the entry within the parsed feed.

    Returns:
        A ``{name: [links]}`` dictionary, or an empty one (after printing a
        notice) when the feed has no entry at *index*.
    """
    entries = get_links(RSS_URL)
    try:
        selected = entries[index]
        daily = selected[1]
    except IndexError:
        print("Empty feed.")
        return {}

    matching = {}
    for name, urls in daily.items():
        if name.startswith(prefix):
            matching[name] = urls
    return matching
84
if __name__ == "__main__":
    # Manual smoke run: print the links whose name starts with "Il Sole".
    sole_links = get_newspaper("Il Sole")
    print(sole_links)