sunstroke @ 19ce085fa872db7f4f15e230524889ac5322f8a5

Overpost.py
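
"""Parse an RSS feed (remote URL or local XML file) of daily posts into
dictionaries mapping a newspaper name to the links published for that day."""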

from html.parser import HTMLParser
from datetime import datetime
from re import compile
import os
import feedparser
from dotenv import load_dotenv
from MyResolver import get

load_dotenv()
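
# Configuration: RSS_URL may be a remote feed URL or a local XML file
# (defaults to ./rss.xml); N_LINKS_TO_REMOVE is the number of leading entries
# discarded from each day's parsed links (defaults to 2).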
RSS_URL = os.getenv("RSS_URL") or os.path.join(".", "rss.xml")
N_LINKS_TO_REMOVE = os.getenv("N_LINKS_TO_REMOVE") or 2
REGEX_DATE = compile(r"\(([\d.]*)\)")
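
# Append `value` to the list stored under `key`, creating the list on first use.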
def add_or_update(dictionary, key, value):
    try:
        dictionary[key].append(value)
    except KeyError:
        dictionary[key] = [value]
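
# Collects the href of every <a> tag in a post's HTML, grouped by the link
# text before " - " (with underscores normalized to spaces).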
class PostParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.links = {}
        self.prev_tag = None
        self.current_tag = None
        self.current_link = None

    def handle_starttag(self, tag, attrs):
        if tag == "br":
            # Line breaks do not change which tag the following text belongs to.
            return
        self.prev_tag = self.current_tag
        self.current_tag = tag
        if tag == "a":
            for at in attrs:
                if at[0] == "href":
                    self.current_link = at[1]

    def handle_endtag(self, tag):
        # Only one level of nesting is tracked: closing a tag restores the previous one.
        self.current_tag = self.prev_tag

    def handle_data(self, data):
        if self.current_tag == "a":
            key = data.replace("_", " ").split(" - ")[0]
            value = self.current_link
            add_or_update(self.links, key, value)

    def get_links(self):
        return self.links.copy()

def parse_html(html):
    parser = PostParser()
    parser.feed(html)
    return parser.get_links()
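
# Pop the first (key, value) pair of a dict, relying on insertion order being
# preserved (Python 3.7+); the walrus operator below requires Python 3.8+.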
def dict_pop(d):
    return (k := next(iter(d)), d.pop(k))

def dict_pop_first_n(d, n):
    return [dict_pop(d) for _ in range(n)]
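
# One feed entry corresponds to one day: the date is taken from the
# "(dd.mm.yyyy)" part of the title, the links from the entry's turbo_content
# HTML, and the first N_LINKS_TO_REMOVE entries are dropped.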
def parse_entry(entry):  # entry = day
    date = REGEX_DATE.findall(entry.title)[0]
    links = parse_html(entry.turbo_content)

    dict_pop_first_n(links, int(N_LINKS_TO_REMOVE))
    return (datetime.strptime(date, "%d.%m.%Y"), links)
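
# http(s) URLs go through MyResolver.get; anything else (such as the default
# local rss.xml path) is returned unchanged for feedparser to open.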
def handle_url(url):
    if url.startswith("http"):
        return get(url)
    else:
        return url

def get_links(rss_url):
    feed = feedparser.parse(handle_url(rss_url))
    return [parse_entry(entry) for entry in feed.entries]
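
# Return the {newspaper: [links]} dict for the day at `index`, keeping only
# the newspapers whose name starts with `prefix`.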
def get_newspaper(prefix="", index=0):
    all_links = get_links(RSS_URL)
    try:
        daily = all_links[index][1]
    except IndexError:
        print("Empty feed.")
        return {}
    return {k: v for k, v in daily.items() if k.startswith(prefix)}

if __name__ == "__main__":
    print(get_newspaper("Il Sole"))