From 5b212ff8b7d1cf530545888846567453b10bb8a0 Mon Sep 17 00:00:00 2001
From: garret
Date: Sun, 2 Oct 2022 02:01:55 +0100
Subject: make relative URLs into absolute URLs, + minor refactor

---
Review note: the hunks below were amended after the original commit
(docstrings added, tags without href/src skipped, redundant second
format_article() call removed), so the commit id above and the blob
hash on the "index" line no longer match this content.

 kadomatsu-rss.py | 32 ++++++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 8 deletions(-)

(limited to 'kadomatsu-rss.py')

diff --git a/kadomatsu-rss.py b/kadomatsu-rss.py
index 8a31c14..a69ddc4 100755
--- a/kadomatsu-rss.py
+++ b/kadomatsu-rss.py
@@ -11,6 +11,7 @@ import requests
 from bs4 import BeautifulSoup
 
 from os import path
+from urllib.parse import urljoin
 
 root = "http://www.toshiki-kadomatsu.jp/information/"
 
@@ -37,14 +38,28 @@ def get_soup(url):
     soup = BeautifulSoup(site.text, "lxml")
     return soup
 
+def relative2absolute(rel):
+    """Return rel resolved against the site root as an absolute URL."""
+    return urljoin(root, rel)
 
-# with open("index.html") as fp:
-#     soup = BeautifulSoup(fp, "lxml")
-
-
-def get_article(soup):
-    return soup.find("li", id="Detail")
+def format_article(soup):
+    """Rewrite every link and image URL inside soup to an absolute URL.
+
+    The tree is modified in place and returned for convenience.
+    """
+    # href=True / src=True skip tags that lack the attribute (e.g. named
+    # anchors), which would otherwise raise KeyError on lookup.
+    for i in soup.find_all("a", href=True):
+        i["href"] = relative2absolute(i["href"])
+    for i in soup.find_all("img", src=True):
+        i["src"] = relative2absolute(i["src"])
+    return soup
 
+def get_article(url):
+    """Fetch url and return its "Detail" <li> with URLs made absolute."""
+    soup = get_soup(url)
+    article = soup.find("li", id="Detail")
+    return format_article(article)
 
 def get_rss_items(soup):
     items = soup.find("ul", id="List")
@@ -52,8 +67,9 @@ def get_rss_items(soup):
     for i in items.find_all("dl"):
         title = list(i.find("a").strings)[0]
         date = datetime.strptime(i.find("time")["datetime"], "%Y-%m-%d")
-        content_url = path.join(root, i.find("a")["href"])
-        content = get_article(get_soup(content_url))
+        content_url = relative2absolute(i.find("a")["href"])
+        # get_article() already returns the article with absolute URLs.
+        content = get_article(content_url)
         rss_item = Item(title, date, content, content_url)
         rss_items.append(rss_item)
     return rss_items
-- 
cgit v1.2.3-70-g09d2