from html.parser import HTMLParser
from datetime import datetime
from re import compile
import os
import feedparser
from dotenv import load_dotenv
# Merge the variables declared in a local .env file into the environment
# before any setting below is read.
load_dotenv()

# Location of the feed; an unset or empty RSS_URL falls back to ./rss.xml.
RSS_URL = os.environ.get("RSS_URL") or os.path.join(os.curdir, "rss.xml")

# Number of leading links dropped from every entry.  This is a string when it
# comes from the environment and the integer 2 otherwise; parse_entry passes
# it through int() before use.
N_LINKS_TO_REMOVE = os.environ.get("N_LINKS_TO_REMOVE") or 2
# Captures the run of digits and dots found between parentheses, e.g.
# "(01.02.2023)" -> "01.02.2023".  Written as a raw string so the backslashes
# reach the regex engine untouched instead of being parsed as (invalid)
# string escape sequences, which newer interpreters warn about.
REGEX_DATE = compile(r"\(([\d\.]*)\)")
def add_or_update(dictionary, key, value):
    """Append *value* to the list stored under *key*, creating it if needed.

    The dictionary is modified in place and nothing is returned.
    """
    # setdefault() hands back the existing list, or installs a fresh empty
    # one the first time the key is seen.
    dictionary.setdefault(key, []).append(value)
class PostParser(HTMLParser):
    """Collect the hyperlinks of an HTML fragment, grouped by their text.

    ``links`` maps a name derived from the text of each anchor to the list
    of ``href`` values recorded under that name, in document order.
    """

    def __init__(self):
        super().__init__()
        self.links = {}
        # Only the current tag and the one opened just before it are tracked.
        self.prev_tag = None
        self.current_tag = None
        # href of the most recent <a> that carried one.
        self.current_link = None

    def handle_starttag(self, tag, attrs):
        """Record the newly opened tag and, for anchors, its ``href``."""
        # <br> is skipped entirely so that it does not shift the tag tracking.
        if tag != "br":
            self.prev_tag, self.current_tag = self.current_tag, tag
            if tag == "a":
                for name, target in attrs:
                    if name == "href":
                        # No early exit: a later href overrides an earlier one.
                        self.current_link = target

    def handle_endtag(self, tag):
        """Step back to the previously opened tag, whatever tag is closing."""
        # prev_tag itself is left untouched, so the tracking is one level deep.
        self.current_tag = self.prev_tag

    def handle_data(self, data):
        """Store the current link under the name found in an anchor's text."""
        if self.current_tag != "a":
            return
        # Underscores become spaces, then only the part before the first
        # " - " separator is kept as the name.
        name, _, _ = data.replace("_", " ").partition(" - ")
        add_or_update(self.links, name, self.current_link)

    def get_links(self):
        """Return a shallow copy of the name -> list-of-links mapping."""
        return dict(self.links)
def parse_html(html):
    """Run *html* through a fresh PostParser and return the links it found.

    The result is the parser's name -> list-of-links dictionary (a copy).
    """
    collector = PostParser()
    collector.feed(html)
    found = collector.get_links()
    return found
def dict_pop(d):
    """Remove the first item of *d* (insertion order) and return it.

    Returns a ``(key, value)`` tuple.  An empty dictionary makes the
    ``next()`` call raise ``StopIteration``.
    """
    first_key = next(iter(d))
    first_value = d.pop(first_key)
    return (first_key, first_value)
def dict_pop_first_n(d, n):
    """Remove up to *n* leading items of *d* and return them.

    Items are taken in insertion order and returned as a list of
    ``(key, value)`` tuples; *d* is modified in place.  When *d* holds fewer
    than *n* items, everything it contains is removed and returned instead
    of a bare ``StopIteration`` escaping to the caller.  A non-positive *n*
    removes nothing.
    """
    # Snapshot the keys first: a dict must not change size while iterated.
    # max() keeps a negative n from turning into a "all but the last" slice.
    leading_keys = list(d)[:max(n, 0)]
    return [(key, d.pop(key)) for key in leading_keys]
def parse_entry(entry):  # entry = day
    """Turn one feed entry into a ``(date, links)`` pair.

    The date is the first parenthesised match of REGEX_DATE in the entry
    title, parsed as ``day.month.year``.  The links come from the entry's
    ``turbo_content`` HTML, with the first N_LINKS_TO_REMOVE of them dropped.

    Raises IndexError when the title contains no parenthesised date, and
    ValueError when the captured text is not a valid ``dd.mm.YYYY`` date.
    """
    day_text = REGEX_DATE.findall(entry.title)[0]
    day_links = parse_html(entry.turbo_content)
    # The setting may still be a string read from the environment.
    n_leading = int(N_LINKS_TO_REMOVE)
    dict_pop_first_n(day_links, n_leading)
    day = datetime.strptime(day_text, "%d.%m.%Y")
    return (day, day_links)
def get_links(rss_url):
    """Parse the feed at *rss_url* and return one parsed item per entry.

    Each item is whatever parse_entry returns for that entry, in the order
    the entries appear in the parsed feed.
    """
    parsed = feedparser.parse(rss_url)
    days = []
    for entry in parsed.entries:
        days.append(parse_entry(entry))
    return days
def get_newspaper(prefix="", index=0):
    """Return the links of one feed entry whose name starts with *prefix*.

    The feed at RSS_URL is parsed on every call.  *index* selects the entry
    in the list returned by get_links.  When no entry exists at that
    position, "Empty feed." is printed and an empty dict is returned.
    """
    per_day = get_links(RSS_URL)
    try:
        daily = per_day[index][1]
    except IndexError:
        print("Empty feed.")
        return {}

    selected = {}
    for name, urls in daily.items():
        if name.startswith(prefix):
            selected[name] = urls
    return selected
# Script entry point: print the links whose name starts with "Il Sole".
if __name__ == "__main__":
    il_sole_links = get_newspaper("Il Sole")
    print(il_sole_links)