summaryrefslogtreecommitdiffstats
path: root/kadomatsu-rss.py
blob: 5b29ecd5249b6534fd88f62a50b25248230b7fb7 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#!/usr/bin/env python3
import cgi
import cgitb
import sys

# Install the cgitb handler so that uncaught exceptions are reported as a
# formatted traceback in the CGI response.
# NOTE(review): presumably called before the remaining imports so that import
# failures are reported as well -- confirm.
cgitb.enable()

from datetime import datetime

import feedgenerator
import requests

from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Base URL of the site's information section. Used as the feed's link and as
# the base against which relative2absolute() resolves relative URLs.
root = "http://www.toshiki-kadomatsu.jp/information/"

# Feed object that the script fills with one item per scraped post and
# serializes at the end.
feed = feedgenerator.Rss201rev2Feed(
    title="角松敏生 OFFICIAL SITE",
    description="Recent 角松敏生 Information posts",
    link=root,
    language="ja",
)


class Item:
    """Plain record holding the fields of one feed entry.

    Attributes:
        title: Headline of the post.
        date: Publication date of the post.
        content: Body of the post.
        url: Link to the post.
    """

    def __init__(self, title, date, content, url):
        """Store the given values unchanged on the instance."""
        self.title, self.date = title, date
        self.content, self.url = content, url

# Shared requests session; get_soup() issues every page fetch through it.
s = requests.Session()

def get_soup(url, timeout=30):
    """Fetch *url* and parse the response body as HTML.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Requests
            applies no timeout by default, so without one a stalled server
            would keep this script waiting indefinitely.

    Returns:
        A BeautifulSoup tree built with the lxml parser.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    response = s.get(url, timeout=timeout)
    # Fail here with a clear HTTP error instead of later with an
    # AttributeError when an error page lacks the expected elements.
    response.raise_for_status()
    # Decode the body as UTF-8 regardless of what the response declares.
    response.encoding = "utf-8"
    return BeautifulSoup(response.text, "lxml")

def relative2absolute(rel):
    """Resolve *rel* against the module-level ``root`` URL and return it."""
    absolute = urljoin(root, rel)
    return absolute

def format_article(soup):
    """Rewrite link and image URLs inside *soup* to absolute form, in place.

    The element is later used as a feed item description, so its relative
    ``href``/``src`` values are resolved against the site root.

    Args:
        soup: Parsed element whose ``<a>`` and ``<img>`` descendants are
            updated.

    Returns:
        The same *soup* object, after modification.
    """
    # Match only tags that actually carry the attribute: a named anchor
    # (<a name=...>) or an <img> without src would otherwise raise KeyError.
    for link in soup.find_all("a", href=True):
        link["href"] = relative2absolute(link["href"])
    for image in soup.find_all("img", src=True):
        image["src"] = relative2absolute(image["src"])
    return soup

def get_article(url):
    """Download the page at *url* and return its formatted article element.

    The article is the ``<li id="Detail">`` element of the page, passed
    through format_article() before being returned.
    """
    return format_article(get_soup(url).find("li", id="Detail"))

def get_rss_items(soup):
    """Build one Item per entry of the listing page.

    Every ``<dl>`` inside ``<ul id="List">`` is treated as one post. The
    post's detail page is downloaded to obtain its body.

    Args:
        soup: Parsed listing page.

    Returns:
        A list of Item objects in the order the entries appear on the page.
    """
    listing = soup.find("ul", id="List")
    rss_items = []
    for entry in listing.find_all("dl"):
        # Look the link up once; it supplies both the title and the URL.
        link = entry.find("a")
        # Only the first text fragment of the link is the title. A link with
        # no text yields "" instead of aborting the whole feed.
        title = next(iter(link.strings), "")
        date = datetime.strptime(entry.find("time")["datetime"], "%Y-%m-%d")
        content_url = relative2absolute(link["href"])
        # get_article() already returns the body with absolute URLs, so it
        # is not passed through format_article() a second time.
        content = get_article(content_url)
        rss_items.append(Item(title, date, content, content_url))
    return rss_items


# Scrape the listing page and collect one Item per post.
rss_items = get_rss_items(get_soup("http://www.toshiki-kadomatsu.jp/information/?select=all"))

for i in rss_items:
    feed.add_item(
        title=i.title,
        link=i.url,
        pubdate=i.date,
        description=i.content,
        # Every item carries the date of the first listed post as its
        # updated date. NOTE(review): presumably the newest post -- confirm.
        updateddate=rss_items[0].date,
    )

# Emit the CGI response as bytes. print() would encode with the locale
# encoding of stdout, which under a web server is often not UTF-8; that can
# raise UnicodeEncodeError on the Japanese text or contradict the charset
# declared in the header.
out = sys.stdout.buffer
out.write(b"Content-Type: application/rss+xml; charset=UTF-8\n\n")
out.write(feed.writeString("utf-8").encode("utf-8"))
out.write(b"\n")
out.flush()