summary refs log tree commit diff stats
path: root/kadomatsu-rss.py
diff options
context:
space:
mode:
author: garret <garret@airmail.cc> 2022-10-02 02:01:55 +0100
committer: garret <garret@airmail.cc> 2022-10-02 02:01:55 +0100
commit: 5b212ff8b7d1cf530545888846567453b10bb8a0 (patch)
tree: b4c929ac0a59111becb875aea825ae887186e01c /kadomatsu-rss.py
parent: 60169f977860c6c253ebf67e0bc288864cceaf05 (diff)
download: kadomatsu-rss-5b212ff8b7d1cf530545888846567453b10bb8a0.tar.gz
kadomatsu-rss-5b212ff8b7d1cf530545888846567453b10bb8a0.tar.bz2
kadomatsu-rss-5b212ff8b7d1cf530545888846567453b10bb8a0.zip
make relative URLs into absolute URLs, + minor refactor
Diffstat (limited to 'kadomatsu-rss.py')
-rwxr-xr-x kadomatsu-rss.py 23
1 files changed, 15 insertions, 8 deletions
diff --git a/kadomatsu-rss.py b/kadomatsu-rss.py
index 8a31c14..a69ddc4 100755
--- a/kadomatsu-rss.py
+++ b/kadomatsu-rss.py
@@ -11,6 +11,7 @@ import requests
from bs4 import BeautifulSoup
from os import path
+from urllib.parse import urljoin
root = "http://www.toshiki-kadomatsu.jp/information/"
@@ -37,14 +38,20 @@ def get_soup(url):
soup = BeautifulSoup(site.text, "lxml")
return soup
+def relative2absolute(rel):
+ return urljoin(root, rel)
-# with open("index.html") as fp:
-# soup = BeautifulSoup(fp, "lxml")
-
-
-def get_article(soup):
- return soup.find("li", id="Detail")
+def format_article(soup):
+ for i in soup.find_all("a"):
+ i["href"] = relative2absolute(i["href"])
+ for i in soup.find_all("img"):
+ i["src"] = relative2absolute(i["src"])
+ return soup
+def get_article(url):
+ soup = get_soup(url)
+ article = soup.find("li", id="Detail")
+ return format_article(article)
def get_rss_items(soup):
items = soup.find("ul", id="List")
@@ -52,8 +59,8 @@ def get_rss_items(soup):
for i in items.find_all("dl"):
title = list(i.find("a").strings)[0]
date = datetime.strptime(i.find("time")["datetime"], "%Y-%m-%d")
- content_url = path.join(root, i.find("a")["href"])
- content = get_article(get_soup(content_url))
+ content_url = relative2absolute(i.find("a")["href"])
+ content = format_article(get_article(content_url))
rss_item = Item(title, date, content, content_url)
rss_items.append(rss_item)
return rss_items