diff options
author | garret <garret@airmail.cc> | 2022-10-02 02:01:55 +0100 |
---|---|---|
committer | garret <garret@airmail.cc> | 2022-10-02 02:01:55 +0100 |
commit | 5b212ff8b7d1cf530545888846567453b10bb8a0 (patch) | |
tree | b4c929ac0a59111becb875aea825ae887186e01c /kadomatsu-rss.py | |
parent | 60169f977860c6c253ebf67e0bc288864cceaf05 (diff) | |
download | kadomatsu-rss-5b212ff8b7d1cf530545888846567453b10bb8a0.tar.gz kadomatsu-rss-5b212ff8b7d1cf530545888846567453b10bb8a0.tar.bz2 kadomatsu-rss-5b212ff8b7d1cf530545888846567453b10bb8a0.zip |
make relative URLs into absolute URLs, + minor refactor
Diffstat (limited to 'kadomatsu-rss.py')
-rwxr-xr-x | kadomatsu-rss.py | 23 |
1 file changed, 15 insertions, 8 deletions
diff --git a/kadomatsu-rss.py b/kadomatsu-rss.py index 8a31c14..a69ddc4 100755 --- a/kadomatsu-rss.py +++ b/kadomatsu-rss.py @@ -11,6 +11,7 @@ import requests from bs4 import BeautifulSoup from os import path +from urllib.parse import urljoin root = "http://www.toshiki-kadomatsu.jp/information/" @@ -37,14 +38,20 @@ def get_soup(url): soup = BeautifulSoup(site.text, "lxml") return soup +def relative2absolute(rel): + return urljoin(root, rel) -# with open("index.html") as fp: -# soup = BeautifulSoup(fp, "lxml") - - -def get_article(soup): - return soup.find("li", id="Detail") +def format_article(soup): + for i in soup.find_all("a"): + i["href"] = relative2absolute(i["href"]) + for i in soup.find_all("img"): + i["src"] = relative2absolute(i["src"]) + return soup +def get_article(url): + soup = get_soup(url) + article = soup.find("li", id="Detail") + return format_article(article) def get_rss_items(soup): items = soup.find("ul", id="List") @@ -52,8 +59,8 @@ def get_rss_items(soup): for i in items.find_all("dl"): title = list(i.find("a").strings)[0] date = datetime.strptime(i.find("time")["datetime"], "%Y-%m-%d") - content_url = path.join(root, i.find("a")["href"]) - content = get_article(get_soup(content_url)) + content_url = relative2absolute(i.find("a")["href"]) + content = format_article(get_article(content_url)) rss_item = Item(title, date, content, content_url) rss_items.append(rss_item) return rss_items |