diff options
Diffstat (limited to 'yt_dlp_plugins')
-rw-r--r-- | yt_dlp_plugins/extractor/radiko.py | 399 | ||||
-rw-r--r-- | yt_dlp_plugins/extractor/radiko_dependencies.py | 29 | ||||
-rw-r--r-- | yt_dlp_plugins/extractor/radiko_hacks.py | 65 | ||||
-rw-r--r-- | yt_dlp_plugins/extractor/radiko_podcast.py | 175 | ||||
-rwxr-xr-x | yt_dlp_plugins/extractor/radiko_protobufs.py | 146 |
5 files changed, 662 insertions, 152 deletions
diff --git a/yt_dlp_plugins/extractor/radiko.py b/yt_dlp_plugins/extractor/radiko.py index c6cea37..3fd19d9 100644 --- a/yt_dlp_plugins/extractor/radiko.py +++ b/yt_dlp_plugins/extractor/radiko.py @@ -6,18 +6,23 @@ import urllib.parse import pkgutil from yt_dlp.extractor.common import InfoExtractor +from yt_dlp.networking.exceptions import HTTPError from yt_dlp.utils import ( + ExtractorError, OnDemandPagedList, clean_html, int_or_none, join_nonempty, parse_qs, traverse_obj, + urlencode_postdata, url_or_none, update_url_query, ) +from yt_dlp_plugins.extractor.radiko_podcast import RadikoPodcastSearchIE import yt_dlp_plugins.extractor.radiko_time as rtime +import yt_dlp_plugins.extractor.radiko_hacks as hacks class _RadikoBaseIE(InfoExtractor): @@ -82,12 +87,15 @@ class _RadikoBaseIE(InfoExtractor): _APP_VERSIONS = ["7.5.0", "7.4.17", "7.4.16", "7.4.15", "7.4.14", "7.4.13", "7.4.12", "7.4.11", "7.4.10", "7.4.9", "7.4.8", "7.4.7", "7.4.6", "7.4.5", "7.4.4", "7.4.3", "7.4.2", "7.4.1", "7.4.0", "7.3.8", "7.3.7", "7.3.6", "7.3.1", "7.3.0", "7.2.11", "7.2.10"] _DELIVERED_ONDEMAND = ('radiko.jp',) - _DOESNT_WORK_WITH_FFMPEG = ('tf-f-rpaa-radiko.smartstream.ne.jp', 'si-f-radiko.smartstream.ne.jp') + _DOESNT_WORK_WITH_FFMPEG = ('tf-f-rpaa-radiko.smartstream.ne.jp', 'si-f-radiko.smartstream.ne.jp', 'alliance-stream-radiko.smartstream.ne.jp') + _AD_INSERTION = ('si-f-radiko.smartstream.ne.jp', ) + + _has_tf30 = None def _index_regions(self): region_data = {} - tree = self._download_xml("https://radiko.jp/v3/station/region/full.xml", None, note="Indexing regions") + tree = self._download_xml("https://radiko.jp/v3/station/region/full.xml", None, note="Indexing station regions") for stations in tree: for station in stations: area = station.find("area_id").text @@ -178,16 +186,21 @@ class _RadikoBaseIE(InfoExtractor): "X-Radiko-AuthToken": auth_token, }, "user": auth2_headers["X-Radiko-User"], + "has_tf30": self._has_tf30, } if not region_mismatch: self.cache.store("rajiko", 
station_region, auth_data) return auth_data - def _auth(self, station_region): + def _auth(self, station_region, need_tf30=False): cachedata = self.cache.load("rajiko", station_region) self.write_debug(cachedata) if cachedata is not None: + if need_tf30 and not cachedata.get("has_tf30"): + self.write_debug("Cached token doesn't have timefree 30, getting a new one") + return self._negotiate_token(station_region) + auth_headers = cachedata.get("token") response = self._download_webpage("https://radiko.jp/v2/api/auth_check", station_region, "Checking cached token", headers=auth_headers, expected_status=401) @@ -205,6 +218,17 @@ class _RadikoBaseIE(InfoExtractor): station = region.find(f'.//station/id[.="{station_id}"]/..') # a <station> with an <id> of our station_id station_name = station.find("name").text station_url = url_or_none(station.find("href").text) + + thumbnails = [] + for logo in station.findall("logo"): + thumbnails.append({ + "url": logo.text, + **traverse_obj(logo.attrib, ({ + "width": ("width", {int_or_none}), + "height": ("height", {int_or_none}), + })) + }) + meta = { "id": station_id, "title": station_name, @@ -218,7 +242,7 @@ class _RadikoBaseIE(InfoExtractor): "uploader_id": station_id, "uploader_url": station_url, - "thumbnail": url_or_none(station.find("banner").text), + "thumbnails": thumbnails, } self.cache.store("rajiko", station_id, { "expiry": (now + datetime.timedelta(days=1)).timestamp(), @@ -229,8 +253,15 @@ class _RadikoBaseIE(InfoExtractor): self.to_screen(f"{station_id}: Using cached station metadata") return cachedata.get("meta") - def _get_station_formats(self, station, timefree, auth_data, start_at=None, end_at=None): - device = self._configuration_arg('device', ['aSmartPhone7a'], casesense=True, ie_key="rajiko")[0] # aSmartPhone7a formats = always happy path + def _get_station_formats(self, station, timefree, auth_data, start_at=None, end_at=None, use_pc_html5=False): + config_device = 
traverse_obj(self._configuration_arg('device', casesense=True, ie_key="rajiko"), 0) + + if not use_pc_html5: + device = config_device or "aSmartPhone7a" # still has the radiko.jp on-demand one for timefree + else: + device = config_device or "pc_html5" # the on-demand one doesnt work with timefree30 stuff sadly + # so just use pc_html5 which has everything + url_data = self._download_xml(f"https://radiko.jp/v3/station/stream/{device}/{station}.xml", station, note=f"Downloading {device} stream information") @@ -238,8 +269,11 @@ class _RadikoBaseIE(InfoExtractor): formats = [] timefree_int = 1 if timefree else 0 + do_blacklist_streams = not len(self._configuration_arg("no_stream_blacklist", ie_key="rajiko")) > 0 + do_as_live_chunks = not len(self._configuration_arg("no_as_live_chunks", ie_key="rajiko")) > 0 for element in url_data.findall(f".//url[@timefree='{timefree_int}'][@areafree='0']/playlist_create_url"): # find <url>s with matching timefree and no areafree, then get their <playlist_create_url> + # we don't want areafree here because we should always be in-region url = element.text if url in seen_urls: # there are always dupes, even with ^ specific filtering continue @@ -249,7 +283,7 @@ class _RadikoBaseIE(InfoExtractor): "station_id": station, "l": "15", # l = length, ie how many seconds in the live m3u8 (max 300) "lsid": auth_data["user"], - "type": "b", # it is a mystery + "type": "b", # a/b = in-region, c = areafree }) if timefree: @@ -267,20 +301,56 @@ class _RadikoBaseIE(InfoExtractor): delivered_live = True preference = -1 entry_protocol = 'm3u8' + format_note=[] - if domain in self._DOESNT_WORK_WITH_FFMPEG: + if timefree and domain in self._DOESNT_WORK_WITH_FFMPEG and do_blacklist_streams: + # TODO: remove this completely + # https://github.com/garret1317/yt-dlp-rajiko/issues/29 self.write_debug(f"skipping {domain} (known not working)") continue if domain in self._DELIVERED_ONDEMAND: # override the defaults for delivered as on-demand delivered_live = 
False - preference = 1 + preference += 2 entry_protocol = None + if domain in self._AD_INSERTION: + preference -= 3 + format_note.append("Ad insertion") + + + auth_headers = auth_data["token"] + + if delivered_live and timefree and do_as_live_chunks: + + chunks_playlist = hacks._generate_as_live_playlist( + self, playlist_url, start_at, end_at, domain, auth_headers + ) + + m3u8_formats = [{ + "format_id": join_nonempty(domain, "chunked"), + "hls_media_playlist_data": chunks_playlist, + "preference": preference, + "ext": "m4a", + "vcodec": "none", + + # fallback to live for ffmpeg etc + "url": playlist_url, + "http_headers": auth_headers, + }] + format_note.append("Chunked") + else: + + m3u8_formats = self._extract_m3u8_formats( + playlist_url, station, m3u8_id=domain, fatal=False, headers=auth_headers, + live=delivered_live, preference=preference, entry_protocol=entry_protocol, + note=f"Downloading m3u8 information from {domain}") + + for f in m3u8_formats: + # ffmpeg sends a Range header which some streams reject. 
here we disable that (and also some icecast header as well) + f['downloader_options'] = {'ffmpeg_args': ['-seekable', '0', '-http_seekable', '0', '-icy', '0']} + f['format_note'] = ", ".join(format_note) + formats.append(f) - formats += self._extract_m3u8_formats( - playlist_url, station, m3u8_id=domain, fatal=False, headers=auth_data["token"], - live=delivered_live, preference=preference, entry_protocol=entry_protocol, - note=f"Downloading m3u8 information from {domain}") return formats @@ -299,7 +369,7 @@ class RadikoLiveIE(_RadikoBaseIE): "id": "FMT", "title": "re:^TOKYO FM.+$", "alt_title": "TOKYO FM", - "thumbnail": "https://radiko.jp/res/banner/FMT/20220512162447.jpg", + "thumbnail": "https://radiko.jp/v2/static/station/logo/FMT/lrtrim/688x160.png", "channel": "TOKYO FM", "channel_id": "FMT", @@ -319,7 +389,7 @@ class RadikoLiveIE(_RadikoBaseIE): "id": "NORTHWAVE", "title": "re:^FM NORTH WAVE.+$", "alt_title": "FM NORTH WAVE", - "thumbnail": "https://radiko.jp/res/banner/NORTHWAVE/20150731161543.png", + "thumbnail": "https://radiko.jp/v2/static/station/logo/NORTHWAVE/lrtrim/688x160.png", "uploader": "FM NORTH WAVE", "uploader_url": "https://www.fmnorth.co.jp/", @@ -340,7 +410,7 @@ class RadikoLiveIE(_RadikoBaseIE): "id": "RN1", "title": "re:^ラジオNIKKEI第1.+$", "alt_title": "RADIONIKKEI", - "thumbnail": "https://radiko.jp/res/banner/RN1/20120802154152.png", + "thumbnail": "https://radiko.jp/v2/static/station/logo/RN1/lrtrim/688x160.png", "channel": "ラジオNIKKEI第1", "channel_url": "http://www.radionikkei.jp/", @@ -357,7 +427,7 @@ class RadikoLiveIE(_RadikoBaseIE): region = self._get_station_region(station) station_meta = self._get_station_meta(region, station) auth_data = self._auth(region) - formats = self._get_station_formats(station, False, auth_data) + formats = self._get_station_formats(station, False, auth_data, use_pc_html5=True) return { "is_live": True, @@ -368,71 +438,36 @@ class RadikoLiveIE(_RadikoBaseIE): class RadikoTimeFreeIE(_RadikoBaseIE): + 
_NETRC_MACHINE = "rajiko" _VALID_URL = r"https?://(?:www\.)?radiko\.jp/#!/ts/(?P<station>[A-Z0-9-_]+)/(?P<id>\d+)" - _TESTS = [{ - "url": "https://radiko.jp/#!/ts/INT/20240809230000", - "info_dict": { - "live_status": "was_live", - "ext": "m4a", - "id": "INT-20240809230000", - - "title": "TOKYO MOON", - "series": "Tokyo Moon", - "description": "md5:20e68d2f400a391fa34d4e7c8c702cb8", - "chapters": "count:14", - "thumbnail": "https://program-static.cf.radiko.jp/ehwtw6mcvy.jpg", - - "upload_date": "20240809", - "timestamp": 1723212000.0, - "release_date": "20240809", - "release_timestamp": 1723215600.0, - "duration": 3600, - - "channel": "interfm", - "channel_id": "INT", - "channel_url": "https://www.interfm.co.jp/", - "uploader": "interfm", - "uploader_id": "INT", - "uploader_url": "https://www.interfm.co.jp/", - - "cast": ["松浦\u3000俊夫"], - "tags": ["松浦俊夫"], - }, - }, { - # late-night/early-morning show to test broadcast day checking - "url": "https://radiko.jp/#!/ts/TBS/20240810033000", - "info_dict": { - "live_status": "was_live", - "ext": "m4a", - "id": "TBS-20240810033000", - - "title": "CITY CHILL CLUB", - "series": "CITY CHILL CLUB", - "description": "md5:3fba2c1125059bed27247c0be90e58fa", - "chapters": "count:22", - "thumbnail": "https://program-static.cf.radiko.jp/ku7t4ztnaq.jpg", - - "upload_date": "20240809", - "timestamp": 1723228200.0, - "release_date": "20240809", - "release_timestamp": 1723233600.0, - "duration": 5400, - - "channel": "TBSラジオ", - "channel_url": "https://www.tbsradio.jp/", - "channel_id": "TBS", - "uploader": "TBSラジオ", - "uploader_url": "https://www.tbsradio.jp/", - "uploader_id": "TBS", - - "tags": ["CCC905", "音楽との出会いが楽しめる", "人気アーティストトーク", "音楽プロデューサー出演", "ドライブ中におすすめ", "寝る前におすすめ", "学生におすすめ"], - "cast": ["PES"], - }, - }] + # TESTS use a custom-ish script that updates the airdates automatically, see contrib/test_extractors.py + + def _perform_login(self, username, password): + try: + login_info = 
self._download_json('https://radiko.jp/ap/member/webapi/member/login', None, note='Logging in', + data=urlencode_postdata({'mail': username, 'pass': password})) + self._has_tf30 = '2' in login_info.get('privileges') + # areafree = 1, timefree30 = 2, double plan = both + self.write_debug({**login_info, "radiko_session": "PRIVATE", "member_ukey": "PRIVATE"}) + except ExtractorError as error: + if isinstance(error.cause, HTTPError) and error.cause.status == 401: + raise ExtractorError('Invalid username and/or password', expected=True) + raise + + def _check_tf30(self): + if self._has_tf30 is not None: + return self._has_tf30 + if self._get_cookies('https://radiko.jp').get('radiko_session') is None: + return + account_info = self._download_json('https://radiko.jp/ap/member/webapi/v2/member/login/check', + None, note='Checking account status from cookies', expected_status=400) + self.write_debug({**account_info, "user_key": "PRIVATE"}) + self._has_tf30 = account_info.get('timefreeplus') == '1' + return self._has_tf30 def _get_programme_meta(self, station_id, url_time): day = url_time.broadcast_day_string() - meta = self._download_json(f"https://radiko.jp/v4/program/station/date/{day}/{station_id}.json", station_id, + meta = self._download_json(f"https://api.radiko.jp/program/v4/date/{day}/station/{station_id}.json", station_id, note="Downloading programme data") programmes = traverse_obj(meta, ("stations", lambda _, v: v["station_id"] == station_id, "programs", "program"), get_all=False) @@ -467,10 +502,12 @@ class RadikoTimeFreeIE(_RadikoBaseIE): "start_time_gte": start.isoformat(), "end_time_lt": end.isoformat(), }) - data = self._download_json(api_url, video_id, note="Downloading tracklist").get("data") + data_json = self._download_json( + api_url, video_id, note="Downloading tracklist", errnote="Downloading tracklist", fatal=False + ) chapters = [] - for track in data: + for track in traverse_obj(data_json, "data") or []: artist = traverse_obj(track, ("artist", 
"name")) or track.get("artist_name") chapters.append({ "title": join_nonempty(artist, track.get("title"), delim=" - "), @@ -493,11 +530,11 @@ class RadikoTimeFreeIE(_RadikoBaseIE): end = times[1] now = datetime.datetime.now(tz=rtime.JST) expiry_free, expiry_tf30 = end.expiry() - have_tf30 = False if expiry_tf30 < now: self.raise_no_formats("Programme is no longer available.", video_id=meta["id"], expected=True) - elif not have_tf30 and expiry_free < now: + need_tf30 = expiry_free < now + if need_tf30 and not self._check_tf30(): self.raise_login_required("Programme is only available with a Timefree 30 subscription") elif start > now: self.raise_no_formats("Programme has not aired yet.", video_id=meta["id"], expected=True) @@ -508,13 +545,19 @@ class RadikoTimeFreeIE(_RadikoBaseIE): region = self._get_station_region(station) station_meta = self._get_station_meta(region, station) - chapters = self._extract_chapters(station, start, end, video_id=meta["id"]) - auth_data = self._auth(region) - formats = self._get_station_formats(station, True, auth_data, start_at=start, end_at=end) + if live_status == "was_live": + chapters = self._extract_chapters(station, start, end, video_id=meta["id"]) + auth_data = self._auth(region, need_tf30=need_tf30) + formats = self._get_station_formats(station, True, auth_data, start_at=start, end_at=end, use_pc_html5=need_tf30) + else: + chapters = None + formats = None return { **station_meta, - "alt_title": None, + "alt_title": None, # override from station metadata + "thumbnails": None, + **meta, "chapters": chapters, "formats": formats, @@ -524,7 +567,7 @@ class RadikoTimeFreeIE(_RadikoBaseIE): class RadikoSearchIE(InfoExtractor): - _VALID_URL = r"https?://(?:www\.)?radiko\.jp/#!/search/(?:timeshift|live|history)\?" + _VALID_URL = r"https?://(?:www\.)?radiko\.jp/#!/search/(?:radio/)?(?:timeshift|live|history)\?" 
_TESTS = [{ # timefree, specific area "url": "https://radiko.jp/#!/search/live?key=city%20chill%20club&filter=past&start_day=&end_day=®ion_id=&area_id=JP13&cul_area_id=JP13&page_idx=0", @@ -549,26 +592,70 @@ class RadikoSearchIE(InfoExtractor): "id": "ニュース-all-all", "title": "ニュース" }, + 'expected_warnings': ['Skipping podcasts. If you really want EVERY EPISODE of EVERY RESULT, set your search filter to Podcasts only.'], }] def _strip_date(self, date): + # lazy way of making a timestring (from eg 2025-05-20 01:00:00) return date.replace(" ", "").replace("-", "").replace(":", "") def _pagefunc(self, url, idx): url = update_url_query(url, {"page_idx": idx}) data = self._download_json(url, None, note=f"Downloading page {idx+1}") - return [self.url_result("https://radiko.jp/#!/ts/{station}/{time}".format( - station = i.get("station_id"), time = self._strip_date(i.get("start_time")))) - for i in data.get("data")] + results = [] + for r in data.get("data"): + station = r.get("station_id") + timestring = self._strip_date(r.get("start_time")) + + results.append( + self.url_result( + f"https://radiko.jp/#!/ts/{station}/{timestring}", + id=join_nonempty(station, timestring), + ie=RadikoTimeFreeIE, + ) + ) + return results def _real_extract(self, url): - url = url.replace("/#!/", "/!/", 1) # urllib.parse interprets the path as just one giant fragment because of the #, so we hack it away + url = url.replace("/#!/", "/!/", 1) queries = parse_qs(url) + key = traverse_obj(queries, ("key", 0)) - search_url = update_url_query("https://radiko.jp/v3/api/program/search", { + # site used to use "cul_area_id" in the search url, now it uses "cur_area_id" (with an r) + # and outright rejects the old one with HTTP Error 415: Unsupported Media Type + if queries.get("cul_area_id"): + queries["cur_area_id"] = queries.pop("cul_area_id") + + if queries.get("filter"): + filter_set = set(queries["filter"][0].split("|")) + del queries["filter"] + else: + filter_set = {"future", "past", "channel"} + 
+ if filter_set == {"channel"}: + podcast_search_url = update_url_query( + "https://radiko.jp/!/search/podcast/live", {"key": key} + ).replace("!", "#!", 1) # same shit with urllib.parse + return self.url_result(podcast_search_url, ie=RadikoPodcastSearchIE) + + if "channel" in filter_set: + self.report_warning("Skipping podcasts. If you really want EVERY EPISODE of EVERY RESULT, set your search filter to Podcasts only.") + filter_set.discard("channel") + + if filter_set == {"future", "past"}: + filter_str = "" + else: + filter_str = "|".join(filter_set) # there should be only one filter now, so this should be the same as filter_set[0] + # but if there's more than one, then we should at least try to pass it through as-is, in the hope that it works + if len(filter_set) != 1: + # but also kick up a stink about it so it's clear it probably won't + self.report_warning("Your search has an unknown combination of filters, so this request will probably fail!") + + search_url = update_url_query("https://api.annex-cf.radiko.jp/v1/programs/legacy/perl/program/search", { **queries, + "filter": filter_str, "uid": "".join(random.choices("0123456789abcdef", k=32)), "app_id": "pc", "row_limit": 50, # higher row_limit = more results = less requests = more good @@ -576,60 +663,32 @@ class RadikoSearchIE(InfoExtractor): results = OnDemandPagedList(lambda idx: self._pagefunc(search_url, idx), 50) - key = traverse_obj(queries, ("key", 0)) day = traverse_obj(queries, ("start_day", 0)) or "all" region = traverse_obj(queries, ("region_id", 0)) or traverse_obj(queries, ("area_id", 0)) - status_filter = traverse_obj(queries, ("filter", 0)) or "all" + status_filter = filter_str or "all" playlist_id = join_nonempty(key, status_filter, day, region) return { "_type": "playlist", - "title": traverse_obj(queries, ("key", 0)), + "title": key, "id": playlist_id, "entries": results, } + class RadikoShareIE(InfoExtractor): _VALID_URL = r"https?://(?:www\.)?radiko\.jp/share/" - _TESTS = [{ - # 29-hour 
time -> 24-hour time - "url": "http://radiko.jp/share/?sid=FMT&t=20240802240000", - "info_dict": { - "live_status": "was_live", - "ext": "m4a", - "id": "FMT-20240803000000", # the time given (24:00) works out to 00:00 the next day - - "title": "JET STREAM", - "series": "JET STREAM", - "description": "md5:c1a2172036ebb7a54eeafb47e0a08a50", - "chapters": "count:9", - "thumbnail": "https://program-static.cf.radiko.jp/greinlrspi.jpg", - - "upload_date": "20240802", - "timestamp": 1722610800.0, - "release_date": "20240802", - "release_timestamp": 1722614100.0, - "duration": 3300, - - "channel": "TOKYO FM", - "channel_id": "FMT", - "channel_url": "https://www.tfm.co.jp/", - "uploader": "TOKYO FM", - "uploader_id": "FMT", - "uploader_url": "https://www.tfm.co.jp/", - - "cast": ["福山雅治"], - "tags": ["福山雅治", "夜間飛行", "音楽との出会いが楽しめる", "朗読を楽しめる", "寝る前に聴きたい"], - } - }] def _real_extract(self, url): queries = parse_qs(url) station = traverse_obj(queries, ("sid", 0)) time = traverse_obj(queries, ("t", 0)) time = rtime.RadikoShareTime(time).timestring() - return self.url_result(f"https://radiko.jp/#!/ts/{station}/{time}", RadikoTimeFreeIE) + return self.url_result( + f"https://radiko.jp/#!/ts/{station}/{time}", RadikoTimeFreeIE, + id=join_nonempty(station, time) + ) class RadikoStationButtonIE(InfoExtractor): @@ -642,19 +701,9 @@ class RadikoStationButtonIE(InfoExtractor): "info_dict": { "ext": "m4a", 'live_status': 'is_live', - "id": "QRR", - "title": "re:^文化放送.+$", - 'alt_title': 'JOQR BUNKA HOSO', - 'thumbnail': 'https://radiko.jp/res/banner/QRR/20240423144553.png', - 'channel': '文化放送', - 'channel_id': 'QRR', - 'channel_url': 'http://www.joqr.co.jp/', - 'uploader': '文化放送', - 'uploader_id': 'QRR', - 'uploader_url': 'http://www.joqr.co.jp/', - - } + }, + 'only_matching': True, }] _WEBPAGE_TESTS = [{ @@ -665,7 +714,7 @@ class RadikoStationButtonIE(InfoExtractor): 'id': 'CCL', "title": "re:^FM COCOLO.+$", 'alt_title': 'FM COCOLO', - 'thumbnail': 
'https://radiko.jp/res/banner/CCL/20161014144826.png', + 'thumbnail': 'https://radiko.jp/v2/static/station/logo/CCL/lrtrim/688x160.png', 'channel': 'FM COCOLO', 'channel_id': 'CCL', @@ -694,7 +743,7 @@ class RadikoPersonIE(InfoExtractor): },{ "url": "https://radiko.jp/persons/11421", "params": {'extractor_args': {'rajiko': {'key_station_only': ['']}}}, - "playlist_count": 1, + "playlist_mincount": 1, "info_dict": { "id": "person-11421", }, @@ -720,19 +769,65 @@ class RadikoPersonIE(InfoExtractor): def entries(): key_station_only = len(self._configuration_arg("key_station_only", ie_key="rajiko")) > 0 for episode in person_api.get("data"): - if key_station_only and episode.get("key_station_id") != episode.get("station_id"): - continue - share_url = traverse_obj(episode, ("radiko_url", ("pc", "sp", "android", "ios", "app"), - {url_or_none}), get_all=False) - # they're all identical share links at the moment (5th aug 2024) but they might not be in the future + station = episode.get("station_id") + if key_station_only and episode.get("key_station_id") != station: + continue - # predictions: - # pc will probably stay the same - # don't know what sp is, possibly "SmartPhone"?, anyway seems reasonably generic - # android is easier for me to reverse-engineer than ios (no ithing) - # i assume "app" would be some internal tell-it-to-do-something link, not a regular web link + start = episode.get("start_at") + timestring = rtime.RadikoTime.fromisoformat(start).timestring() - yield self.url_result(share_url, ie=RadikoShareIE, video_title=episode.get("title")) + timefree_id = join_nonempty(station, timestring) + timefree_url = f"https://radiko.jp/#!/ts/{station}/{timestring}" + yield self.url_result(timefree_url, ie=RadikoTimeFreeIE, video_id=timefree_id) return self.playlist_result(entries(), playlist_id=join_nonempty("person", person_id)) + + +class RadikoRSeasonsIE(InfoExtractor): + _VALID_URL = r"https?://(?:www\.)?radiko\.jp/(?:mobile/)?r_seasons/(?P<id>\d+$)" + _TESTS = [{ 
+ "url": "https://radiko.jp/r_seasons/10012302", + "playlist_mincount": 4, + "info_dict": { + "id": '10012302', + "title": '山下達郎の楽天カード サンデー・ソングブック', + 'thumbnail': 'https://program-static.cf.radiko.jp/935a87fc-4a52-48e5-9468-7b2ef9448d9f.jpeg', + } + }, { + "url": "https://radiko.jp/r_seasons/10002831", + "playlist_mincount": 4, + "info_dict": { + "id": "10002831", + "title": "Tokyo Moon", + 'description': 'md5:3eef525003bbe96ccf33ec647c43d904', + 'thumbnail': 'https://program-static.cf.radiko.jp/0368ee85-5d5f-41c9-8ee1-6c1035d87b3f.jpeg', + } + }] + + def _real_extract(self, url): + season_id = self._match_id(url) + html = self._download_webpage(url, season_id) + pageProps = self._search_nextjs_data(html, season_id)["props"]["pageProps"] + season_id = traverse_obj(pageProps, ("rSeason", "id")) or season_id + + def entries(): + for episode in pageProps.get("pastPrograms"): + station = traverse_obj(episode, ("stationId")) + start = traverse_obj(episode, ("startAt", "seconds")) + timestring = rtime.RadikoTime.fromtimestamp(start, tz=rtime.JST).timestring() + + timefree_id = join_nonempty(station, timestring) + timefree_url = f"https://radiko.jp/#!/ts/{station}/{timestring}" + + yield self.url_result(timefree_url, ie=RadikoTimeFreeIE, video_id=timefree_id) + + return self.playlist_result( + entries(), + playlist_id=season_id, + **traverse_obj(pageProps, ("rSeason", { + "playlist_title": "rSeasonName", + "thumbnail": "backgroundImageUrl", + "description": ("summary", filter), + })), + ) diff --git a/yt_dlp_plugins/extractor/radiko_dependencies.py b/yt_dlp_plugins/extractor/radiko_dependencies.py new file mode 100644 index 0000000..769a5e3 --- /dev/null +++ b/yt_dlp_plugins/extractor/radiko_dependencies.py @@ -0,0 +1,29 @@ +# Bundle importing code Copyright (c) 2021-2022 Grub4K, from yt-dont-lock-p. 
+# https://github.com/Grub4K/yt-dont-lock-p/blob/ff3b6e1d42ce8584153ae27544d2c05b50ab5954/yt_dlp_plugins/postprocessor/yt_dont_lock_p/__init__.py#L23-L46 +# Used under 0BSD with permission + +# https://discord.com/channels/807245652072857610/1112613156934668338/1416816007732920430 (yt-dlp discord server, https://discord.gg/H5MNcFW63r ) +# [17:00] garret1317: @Grub4K can i pinch your MIT-licensed dependency bundling code to use in my 0BSD-licensed plugin? +# I will credit of course but i can't require that anyone else does the same +# (Any response to this message will be considered a written consent or refusal of the request) +# [17:04] Grub4K: Feel free to use that part under 0BSD +# [17:05] garret1317: 👍 cheers + +try: + import protobug +except ImportError: + import sys + from pathlib import Path + + # Try importing from zip file bundle + search_path = str(Path(__file__).parent.parent) + sys.path.append(search_path) + try: + import protobug + except ImportError: + protobug = None + except Exception: + protobug = None + + finally: + sys.path.remove(search_path) diff --git a/yt_dlp_plugins/extractor/radiko_hacks.py b/yt_dlp_plugins/extractor/radiko_hacks.py new file mode 100644 index 0000000..6486034 --- /dev/null +++ b/yt_dlp_plugins/extractor/radiko_hacks.py @@ -0,0 +1,65 @@ +import datetime +import re + +from yt_dlp.extractor.common import InfoExtractor +from yt_dlp.utils import ( + join_nonempty, + update_url_query, + traverse_obj, +) + +# "hacks" as in great jank/schizo shit that works anyway + +def _generate_as_live_playlist(self, playlist_url, start_at, end_at, domain, headers={}): + playlist = "" + chunk_length = 300 # max the api allows + + duration = int(end_at.timestamp() - start_at.timestamp()) + cursor = 0 + chunk_num = 1 + while cursor < duration: + chunk_length = min(chunk_length, duration - cursor) + + chunk_start = start_at + datetime.timedelta(seconds=cursor) + chunk_url = update_url_query(playlist_url, { + "seek": chunk_start.timestring(), + "l": 
chunk_length, + }) + + chunk_playlist, real_chunk_length = _get_chunk_playlist(self, chunk_url, domain, chunk_num, headers) + + playlist += chunk_playlist + cursor += real_chunk_length + chunk_num += 1 + + return playlist + +def _get_chunk_playlist(self, chunk_url, src_id, chunk_num, headers={}): + EXTINF_duration = re.compile(r"^#EXTINF:([\d.]+),", flags=re.MULTILINE) + + playlist = "" + chunk_id = join_nonempty(src_id, chunk_num) + base_format = self._extract_m3u8_formats( + chunk_url, chunk_id, fatal=False, headers=headers, + note=f"Preparing {src_id} chunk {chunk_num}" + ) + m3u8_url = traverse_obj(base_format, (..., "url",), get_all=False) + playlist = self._download_webpage(m3u8_url, chunk_id, note=f"Getting {src_id} chunk {chunk_num} fragments") + + real_duration = 0 + for i in EXTINF_duration.findall(playlist): + real_duration += float(i) + real_duration = round(real_duration) + + # playlists can sometimes be longer than they should + # wowza stream does some strange things + # it goes along fine with every fragment 5s long as normal + # and then during the ad break it does one with a different length (2s here) + # i assume so they have a clean split to do ad insertion in? 
from yt_dlp.extractor.common import InfoExtractor
from yt_dlp.utils import (
	clean_html,
	OnDemandPagedList,
	parse_qs,
	traverse_obj,
	update_url_query,
	url_or_none,
	str_or_none,
)

import dataclasses
import random

from yt_dlp_plugins.extractor.radiko_dependencies import protobug
if protobug:
	import yt_dlp_plugins.extractor.radiko_protobufs as pb


class _RadikoPodcastBaseIE(InfoExtractor):
	"""Shared episode-mapping logic for the radiko podcast extractors."""

	def _extract_episode(self, episode_info):
		# episode_info comes either from the site's Next.js JSON payload or
		# from a protobuf message converted with dataclasses.asdict(); both
		# use the same camelCase key names.
		return {
			**traverse_obj(episode_info, {
				"id": ("id", {str_or_none}),
				"url": ("audio", "url"),
				"duration": ("audio", "durationSec"),

				"title": "title",
				"description": ("description", {clean_html}),
				"timestamp": ("startAt", "seconds"),

				"series": "channelTitle",
				"series_id": "channelId",
				"channel": "channelStationName",
				"uploader": "channelStationName",
			}),
			# prefer the episode's own artwork, fall back to the channel artwork
			"thumbnail": traverse_obj(episode_info, ("imageUrl", {url_or_none}))
				or traverse_obj(episode_info, ("channelImageUrl", {url_or_none})),

			# so that --download-archive still works if you download from the playlist page
			"webpage_url": "https://radiko.jp/podcast/episodes/{id}".format(id=traverse_obj(episode_info, "id")),
			'extractor_key': RadikoPodcastEpisodeIE.ie_key(),
			'extractor': 'RadikoPodcastEpisode',
		}


class RadikoPodcastEpisodeIE(_RadikoPodcastBaseIE):
	"""Extracts a single podcast episode from its episode page."""
	_VALID_URL = r"https?://radiko\.jp/podcast/episodes/(?P<id>[a-f0-9-]+)"

	_TESTS = [{
		"url": "https://radiko.jp/podcast/episodes/cc8cf709-a50b-4846-aa0e-91ab10cf8bff",
		"info_dict": {
			"id": "cc8cf709-a50b-4846-aa0e-91ab10cf8bff",
			"ext": "mp3",
			'title': '2025.6.26 おしゃべり技術くん',
			'description': 'md5:1c4048025f68d6da053dd879a5d62304',
			'duration': 717,
			'thumbnail': 'https://podcast-static.cf.radiko.jp/09f27a48-ae04-4ce7-a024-572460e46eb7-20240214160012.png',
			'series': 'おしゃべり技術くん',
			'series_id': '09f27a48-ae04-4ce7-a024-572460e46eb7',
			'timestamp': 1751554800,
			'upload_date': '20250703',
			'uploader': 'IBCラジオ',
			'channel': 'IBCラジオ',
		},
	}]

	def _real_extract(self, url):
		video_id = self._match_id(url)
		webpage = self._download_webpage(url, video_id)
		# the episode metadata is embedded in the page's Next.js data blob
		next_data = self._search_nextjs_data(webpage, video_id)["props"]["pageProps"]

		episode_info = next_data["podcastEpisode"]

		return self._extract_episode(episode_info)


class RadikoPodcastChannelIE(_RadikoPodcastBaseIE):
	"""Extracts a podcast channel as a playlist of its episodes."""
	_VALID_URL = r"https?://radiko\.jp/podcast/channels/(?P<id>[a-f0-9-]+)"

	# NOTE(review): the expected warning text below does not match the message
	# emitted by report_warning() in entries() — confirm which one is current
	_TESTS = [{
		"url": "https://radiko.jp/podcast/channels/09f27a48-ae04-4ce7-a024-572460e46eb7",
		"info_dict": {
			"id": "09f27a48-ae04-4ce7-a024-572460e46eb7"
		},
		'playlist_mincount': 20,
		'expected_warnings': ['Currently this extractor can only extract the latest 20 episodes'],
	}]

	def _real_extract(self, url):
		channel_id = self._match_id(url)
		webpage = self._download_webpage(url, channel_id)
		next_data = self._search_nextjs_data(webpage, channel_id)["props"]["pageProps"]

		channel_info = next_data["podcastChannel"]
		episode_list_response = next_data["listPodcastEpisodesResponse"]

		def entries():
			# the first page of episodes ships with the webpage; any further
			# pages must be fetched over the gRPC API, which needs the
			# optional protobug dependency
			has_next_page = episode_list_response.get("hasNextPage")
			for episode in episode_list_response["episodesList"]:
				cursor = episode.get("id")  # cursor = last episode id seen
				yield self._extract_episode(episode)

			if has_next_page:
				if protobug:
					userservice_token = pb.auth_userservice(self)
					while has_next_page:
						page = pb.get_podcast_episodes(self, channel_id, userservice_token, cursor)
						has_next_page = page.hasNextPage
						for episode in page.episodes:
							cursor = episode.id
							yield self._extract_episode(dataclasses.asdict(episode))
				else:
					self.report_warning(f'protobug is required to extract more than the latest {len(episode_list_response["episodesList"])} episodes.\nIf you installed yt-dlp-rajiko manually (with the .whl), use the .zip bundle instead. If you installed with pip, pip install protobug .')

		return {
			"_type": "playlist",
			"id": channel_id,
			**traverse_obj(channel_info, {
				"playlist_title": "title",
				"playlist_id": "id",
				"playlist_description": ("description", {clean_html}),
				"playlist_thumbnail": ("imageUrl", {url_or_none}),

			}),
			"entries": entries(),
		}


class RadikoPodcastSearchIE(InfoExtractor):
	"""Searches radiko's podcast catalogue; yields channel playlists."""
	_VALID_URL = r"https?://(?:www\.)?radiko\.jp/#!/search/podcast/(?:timeshift|live)\?"
	_TESTS = [{
		"url": "https://radiko.jp/#!/search/podcast/live?key=ドラマ",
		"playlist_mincount": 51,
		"info_dict": {
			"id": "ドラマ",
			"title": "ドラマ",
		},
	}]

	def _pagefunc(self, url, idx):
		url = update_url_query(url, {"pageIdx": idx})
		data = self._download_json(url, None, note=f"Downloading page {idx+1}")

		# guard against a missing/null "channels" key (e.g. an empty results
		# page) so we end the paged list instead of raising TypeError
		return [
			self.url_result(
				channel.get("channelUrl"),
				id=channel.get("id"),
				ie=RadikoPodcastChannelIE,
			)
			for channel in traverse_obj(data, "channels") or []
		]

	def _real_extract(self, url):
		# hack away the # so urllib.parse will work (same as normal RadikoSearchIE)
		url = url.replace("/#!/", "/!/", 1)
		queries = parse_qs(url)

		keywords = traverse_obj(queries, ("key", 0))
		search_url = update_url_query("https://api.annex-cf.radiko.jp/v1/podcasts/channels/search_with_keywords_by_offset", {
			"keywords": keywords,
			"uid": "".join(random.choices("0123456789abcdef", k=32)),
			"limit": 50,  # result limit. the actual limit before the api errors is 5000, but that seems a bit rude so i'll leave as 50 like the radio one
		})

		return self.playlist_result(
			OnDemandPagedList(lambda idx: self._pagefunc(search_url, idx), 50),
			title=keywords,
			id=keywords,  # i have to put some kind of id or the tests fail
		)
#!/usr/bin/env python3
import struct
import random

from yt_dlp_plugins.extractor.radiko_dependencies import protobug

if protobug:  # everything below requires the optional protobug dependency


	def add_grpc_header(protobuf_data):
		# grpc-web frame: 1-byte compression flag + big-endian uint32 length,
		# then the serialised protobuf message
		compression_flag = 0
		message_length = len(protobuf_data)
		header = struct.pack('>BI', compression_flag, message_length)
		return header + protobuf_data

	def strip_grpc_response(response):
		# drop the 5-byte frame header and the trailing "grpc-status:" trailer
		# NOTE(review): if the trailer were ever absent, rpartition would return
		# b"" and the whole message would be discarded — confirm the API always
		# sends the trailer
		return response[5:].rpartition(b"grpc-status:")[0]

	def _download_grpc(self, url_or_request, video_id, response_message, note="Downloading GRPC information", *args, **kwargs):
		"""POST a protobug message over grpc-web and decode the reply.

		response_message is the protobug message class to decode into; returns
		None when the stripped response body is empty.
		"""
		urlh = self._request_webpage(url_or_request, video_id,
			headers={
				'Content-Type': 'application/grpc-web+proto',
				'X-User-Agent': 'grpc-web-javascript/0.1',
				'X-Grpc-Web': '1',
				# default to {} so callers without extra headers don't KeyError
				**kwargs.pop('headers', {})
			},
			data=add_grpc_header(protobug.dumps(kwargs.pop('data'))), note=note,
			*args, **kwargs,
		)
		response = urlh.read()

		protobuf = strip_grpc_response(response)
		if len(protobuf) > 0:
			return protobug.loads(protobuf, response_message)


	@protobug.message
	class SignUpRequest:
		lsid: protobug.String = protobug.field(1)

	def sign_up(self):
		# lsid: a random 32-hex-digit device identity we mint ourselves
		lsid = ''.join(random.choices('0123456789abcdef', k=32))

		# NOTE(review): response_message is None here, so a non-empty SignUp
		# response body would crash in protobug.loads — presumably the server
		# replies with an empty message; confirm
		signup = _download_grpc(self, "https://api.annex.radiko.jp/radiko.UserService/SignUp",
			"UserService", None, note="Registering ID", headers={'Origin': 'https://radiko.jp'},
			data=SignUpRequest(lsid=lsid),
		)
		# youre meant to only do the sign up ^ once and then keep your lsid for later
		# so that you can sign in and get the token for the API to work
		return lsid


	@protobug.message
	class SignInRequest:
		lsid: protobug.String = protobug.field(2)
		area: protobug.String = protobug.field(3)

	@protobug.message
	class SignInResponse:
		jwt: protobug.String = protobug.field(1)


	def sign_in(self, lsid):
		# exchange the stored lsid for a short-lived JWT bearer token
		sign_in = _download_grpc(self, "https://api.annex.radiko.jp/radiko.UserService/SignIn",
			"UserService", SignInResponse, note="Getting auth token", headers={'Origin': 'https://radiko.jp'},
			data=SignInRequest(lsid=lsid, area="JP13"),
		)
		return sign_in.jwt


	def auth_userservice(self):
		# sign up once (cache the lsid), then sign in every run for a fresh JWT
		cachedata = self.cache.load("rajiko", "UserService")
		if cachedata is not None:
			lsid = cachedata.get("lsid")
		else:
			lsid = sign_up(self)
			self.cache.store("rajiko", "UserService", {"lsid": lsid})
		jwt = sign_in(self, lsid)
		return jwt


	@protobug.message
	class ListPodcastEpisodesRequest:
		channel_id: protobug.String = protobug.field(1)
		sort_by_latest: protobug.Bool = protobug.field(2)
		page_length: protobug.Int32 = protobug.field(4)
		cursor: protobug.String = protobug.field(5, default=None)


	@protobug.message
	class Audio:
		revision: protobug.Int32 = protobug.field(1)
		url: protobug.String = protobug.field(2)
		fileSize: protobug.Int64 = protobug.field(3)
		durationSec: protobug.Int64 = protobug.field(4)
		transcoded: protobug.Bool = protobug.field(5)

	@protobug.message
	class EpisodeStartAt:
		seconds: protobug.UInt64 = protobug.field(1)
		nanos: protobug.UInt64 = protobug.field(2, default=0)


	@protobug.message
	class PodcastEpisode:
		# field numbers mirror the radiko.PodcastService wire format;
		# do not renumber
		id: protobug.String = protobug.field(1)
		workspaceId: protobug.String = protobug.field(2)
		channelId: protobug.String = protobug.field(3)
		title: protobug.String = protobug.field(4)
		description: protobug.String = protobug.field(5)

		audio: Audio = protobug.field(8)
		channelImageUrl: protobug.String = protobug.field(16)
		channelTitle: protobug.String = protobug.field(17)
		channelStationName: protobug.String = protobug.field(18)
		channelAuthor: protobug.String = protobug.field(19)

		channelThumbnailImageUrl: protobug.String = protobug.field(21)
		channelStationType: protobug.UInt32 = protobug.field(22)
		startAt: EpisodeStartAt = protobug.field(27)
		isEnabled: protobug.Bool = protobug.field(29)
		hasTranscription: protobug.Bool = protobug.field(32)

		imageUrl: protobug.String = protobug.field(7, default=None)
		thumbnailImageUrl: protobug.String = protobug.field(20, default=None)

	@protobug.message
	class ListPodcastEpisodesResponse:
		episodes: list[PodcastEpisode] = protobug.field(1)
		hasNextPage: protobug.Bool = protobug.field(2, default=False)


	def get_podcast_episodes(self, channel_id, jwt, cursor, page_length=20):
		# site uses 20 items
		# cursor is the id of the last episode you've seen in the list

		return _download_grpc(self, 'https://api.annex.radiko.jp/radiko.PodcastService/ListPodcastEpisodes',
			channel_id, ListPodcastEpisodesResponse, note="Downloading episode listings",
			headers={'Authorization': f'Bearer {jwt}'},
			data=ListPodcastEpisodesRequest(
				channel_id=channel_id,
				sort_by_latest=True,
				page_length=page_length,
				cursor=cursor,
			)
		)