initial commit
Marco Andronaco andronacomarco@gmail.com
Wed, 12 Jul 2023 09:14:36 +0200
4 files changed,
135 insertions(+),
0 deletions(-)
A
Sole.py
@@ -0,0 +1,76 @@
"""Scrape daily-newspaper download links from the Overpost RSS feed."""

import re
from datetime import datetime
from html.parser import HTMLParser

import feedparser

# The first links of every post are recurring boilerplate, not newspapers.
N_LINKS_TO_REMOVE = 2
# Matches a dotted date such as "(12.07.2023)" inside an entry title.
REGEX_DATE = re.compile(r"\(([\d.]*)\)")
OVERPOST_URL = "https://overpost.biz/e-books/quotidiani/rss.xml"


def add_or_update(dictionary, key, value):
    """Append *value* to the list stored at *key*, creating the list if absent."""
    dictionary.setdefault(key, []).append(value)


class PostParser(HTMLParser):
    """Collect anchor hrefs from a post body, grouped by newspaper name.

    The name is taken from the anchor text: underscores become spaces and
    everything after the first " - " separator is dropped.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.links = {}           # newspaper name -> list of URLs
        self.prev_tag = None      # tag that was current before the last start tag
        self.current_tag = None
        self.current_link = None  # href of the most recently opened <a>

    def handle_starttag(self, tag, attrs):
        # <br> is void filler between links; it must not disturb tag tracking.
        if tag == "br":
            return
        self.prev_tag = self.current_tag
        self.current_tag = tag
        if tag == "a":
            for attr_name, attr_value in attrs:
                if attr_name == "href":
                    self.current_link = attr_value

    def handle_endtag(self, tag):
        # Restore the tag that was open before the closed one started.
        self.current_tag = self.prev_tag

    def handle_data(self, data):
        if self.current_tag == "a":
            # e.g. "Il_Sole_24_Ore - 12.07.2023" -> key "Il Sole 24 Ore"
            key = data.replace("_", " ").split(" - ")[0]
            add_or_update(self.links, key, self.current_link)

    def get_links(self):
        """Return a shallow copy of the collected name -> URLs mapping."""
        return self.links.copy()


def parse_html(html):
    """Parse a post's HTML body and return its name -> URLs mapping."""
    parser = PostParser()
    parser.feed(html)
    return parser.get_links()


def remove_first(d):
    """Pop the first (insertion-order) item of *d*; return it as (key, value)."""
    key = next(iter(d))
    return key, d.pop(key)


def remove_first_n(d, n):
    """Pop the first *n* insertion-order items of *d* in place."""
    for _ in range(n):
        remove_first(d)


def parse_entry(entry):
    """Return (publication datetime, links dict) for one feed entry (one day).

    Raises IndexError if the entry title carries no "(dd.mm.yyyy)" date.
    """
    date = REGEX_DATE.findall(entry.title)[0]
    links = parse_html(entry.turbo_content)
    remove_first_n(links, N_LINKS_TO_REMOVE)
    return datetime.strptime(date, "%d.%m.%Y"), links


def get_links(rss_url):
    """Fetch and parse the RSS feed; one (date, links) tuple per entry."""
    feed = feedparser.parse(rss_url)
    return [parse_entry(entry) for entry in feed.entries]


def get_sole():
    """Return the 'Il Sole 24 Ore' links from the feed's second entry.

    NOTE(review): index 1 selects the *second* entry, not necessarily
    today's issue — confirm this offset against the feed's ordering.
    """
    # BUG FIX: the original module reassigned OVERPOST_URL to a local file
    # path (/home/marco/...) after the definitions, silently overriding the
    # real feed URL for every importer; that debug leftover is removed.
    entries = get_links(OVERPOST_URL)
    day = entries[1]
    return {k: v for k, v in day[1].items() if k.startswith("Il Sole 24 Ore")}


if __name__ == "__main__":
    print(get_sole())
A
main.py
@@ -0,0 +1,48 @@
"""Submit newspaper download links to a local pyLoad instance."""

import json

import requests  # https://github.com/pyload/pyload/wiki/module.Api.Api

from Sole import get_sole, remove_first

SESSION_FILENAME = "session.txt"
PYLOAD_PROTOCOL = "http"
PYLOAD_HOST = "localhost"
PYLOAD_PORT = 8000
PYLOAD_USER = "pyload"
PYLOAD_PW = "pyload"
PYLOAD_API_ENDPOINT = "/api"
PYLOAD_LOGIN_ENDPOINT = "/login"
PYLOAD_ADDPACKAGE_ENDPOINT = "/generateAndAddPackages"
PYLOAD_API_URL = f"{PYLOAD_PROTOCOL}://{PYLOAD_HOST}:{PYLOAD_PORT}{PYLOAD_API_ENDPOINT}"

LOGIN_DATA = {"username": PYLOAD_USER, "password": PYLOAD_PW}
LOGIN_URL = PYLOAD_API_URL + PYLOAD_LOGIN_ENDPOINT
ADDPACKAGE_URL = PYLOAD_API_URL + PYLOAD_ADDPACKAGE_ENDPOINT


def get_session_id():
    """Return a cached pyLoad session id, logging in and caching it if absent."""
    try:
        with open(SESSION_FILENAME, "r", encoding="utf-8") as in_file:
            return in_file.readline()
    except FileNotFoundError:
        res = requests.post(LOGIN_URL, data=LOGIN_DATA)
        session_id = res.cookies.get_dict()["pyload_session"]
        with open(SESSION_FILENAME, "w", encoding="utf-8") as out_file:
            out_file.write(session_id)
        return session_id


def add_package(links, session=None):
    """POST *links* to pyLoad's generateAndAddPackages endpoint.

    *session* defaults to the cached (or freshly obtained) session id, so the
    function no longer depends on a ``session_id`` global that only existed
    when the file ran as a script. Returns the raw response body as text.
    """
    if session is None:
        session = get_session_id()
    payload = {"links": json.dumps(links), "session": session}
    # BUG FIX: the original posted LOGIN_DATA here, so the package payload
    # (links + session) it had just built never reached the API.
    return requests.post(ADDPACKAGE_URL, data=payload).text


if __name__ == "__main__":
    # TODO: switch to the real feed once verified:
    #   sole = get_sole()
    #   links = [remove_first(sole)[1][0]]
    links = ["http://localhost:8080/file2", "http://localhost:8080/file1"]
    print(add_package(links))
A
requirements.txt
@@ -0,0 +1,7 @@
+certifi==2023.5.7 +charset-normalizer==3.2.0 +feedparser==6.0.10 +idna==3.4 +requests==2.31.0 +sgmllib3k==1.0.0 +urllib3==2.0.3