From 31f50c8194f12c27ac6fbfe336f1d515aa8677ae Mon Sep 17 00:00:00 2001
From: dirkf
Date: Sun, 27 Aug 2023 19:08:28 +0100
Subject: [PATCH 1/5] [S4C] Add thumbnail extraction, extract series as playlist

Based on https://github.com/yt-dlp/yt-dlp/pull/7776: thx ifan-t, bashonly

---
 youtube_dl/extractor/extractors.py |  5 ++-
 youtube_dl/extractor/s4c.py        | 62 ++++++++++++++++++++++++++----
 2 files changed, 59 insertions(+), 8 deletions(-)

diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index cb39876c2..d9289e5bf 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -1087,7 +1087,10 @@ from .rutube import (
 from .rutv import RUTVIE
 from .ruutu import RuutuIE
 from .ruv import RuvIE
-from .s4c import S4CIE
+from .s4c import (
+    S4CIE,
+    S4CSeriesIE,
+)
 from .safari import (
     SafariIE,
     SafariApiIE,
diff --git a/youtube_dl/extractor/s4c.py b/youtube_dl/extractor/s4c.py
index 21d40c2d3..b152e6680 100644
--- a/youtube_dl/extractor/s4c.py
+++ b/youtube_dl/extractor/s4c.py
@@ -2,6 +2,8 @@ from __future__ import unicode_literals
 
+from functools import partial as partial_f
+
 from .common import InfoExtractor
 from ..utils import (
     float_or_none,
@@ -9,6 +11,7 @@ from ..utils import (
     T,
     traverse_obj,
     txt_or_none,
+    url_or_none,
 )
@@ -21,7 +24,8 @@
             'ext': 'mp4',
             'title': 'Y Swn',
             'description': 'md5:f7681a30e4955b250b3224aa9fe70cf0',
-            'duration': 5340
+            'duration': 5340,
+            'thumbnail': 'https://www.s4c.cymru/amg/1920x1080/Y_Swn_2023S4C_099_ii.jpg',
         },
     }, {
         'url': 'https://www.s4c.cymru/clic/programme/856636948',
@@ -31,6 +35,7 @@
             'title': 'Am Dro',
             'duration': 2880,
             'description': 'md5:100d8686fc9a632a0cb2db52a3433ffe',
+            'thumbnail': 'https://www.s4c.cymru/amg/1920x1080/Am_Dro_2022-23S4C_P6_4005.jpg',
         },
     }]
@@ -43,7 +48,7 @@
                 'programme_id': video_id,
             }, fatal=False)
 
-        filename = self._download_json(
+        player_config = self._download_json(
             'https://player-api.s4c-cdn.co.uk/player-configuration/prod', video_id, query={
                 'programme_id': video_id,
                 'signed': '0',
@@ -51,7 +56,8 @@
                 'mode': 'od',
                 'appId': 'clic',
                 'streamName': '',
-            }, note='Downloading player config JSON')['filename']
+            }, note='Downloading player config JSON')
+
         m3u8_url = self._download_json(
             'https://player-api.s4c-cdn.co.uk/streaming-urls/prod', video_id, query={
                 'mode': 'od',
@@ -59,18 +65,60 @@
                 'region': 'WW',
                 'extra': 'false',
                 'thirdParty': 'false',
-                'filename': filename,
+                'filename': player_config['filename'],
             }, note='Downloading streaming urls JSON')['hls']
 
-        # ... self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4', m3u8_id='hls')
-        formats, subtitles = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', m3u8_id='hls', entry_protocol='m3u8_native'), {}
+        formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', m3u8_id='hls', entry_protocol='m3u8_native')
+        self._sort_formats(formats)
+
+        subtitles = {}
+        for sub in traverse_obj(player_config, ('subtitles', lambda _, v: url_or_none(v['0']))):
+            subtitles.setdefault(sub.get('3', 'en'), []).append({
+                'url': sub['0'],
+                'name': sub.get('1'),
+            })
 
         return merge_dicts({
             'id': video_id,
             'formats': formats,
             'subtitles': subtitles,
+            'thumbnail': url_or_none(player_config.get('poster')),
         }, traverse_obj(details, ('full_prog_details', 0, {
             'title': (('programme_title', 'series_title'), T(txt_or_none)),
             'description': ('full_billing', T(txt_or_none)),
-            'duration': ('duration', T(lambda x: float_or_none(x, invscale=60))),
+            'duration': ('duration', T(partial_f(float_or_none, invscale=60))),
         }), get_all=False), rev=True)
+
+
+class S4CSeriesIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?s4c\.cymru/clic/series/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://www.s4c.cymru/clic/series/864982911',
+        'playlist_mincount': 6,
+        'info_dict': {
+            'id': '864982911',
+            'title': 'Iaith ar Daith',
+        },
+    }, {
+        'url': 'https://www.s4c.cymru/clic/series/866852587',
+        'playlist_mincount': 8,
+        'info_dict': {
+            'id': '866852587',
+            'title': 'FFIT Cymru',
+        },
+    }]
+
+    def _real_extract(self, url):
+        series_id = self._match_id(url)
+        series_details = self._download_json(
+            'https://www.s4c.cymru/df/series_details', series_id, query={
+                'lang': 'e',
+                'series_id': series_id,
+                'show_prog_in_series': 'Y'
+            }, note='Downloading series details JSON')
+
+        return self.playlist_result(
+            (self.url_result('https://www.s4c.cymru/clic/programme/' + episode_id, S4CIE, episode_id)
+             for episode_id in traverse_obj(series_details, ('other_progs_in_series', Ellipsis, 'id'))),
+            playlist_id=series_id, playlist_title=traverse_obj(
+                series_details, ('full_prog_details', 0, 'series_title', T(txt_or_none))))

From 21caaf23800c95451cec27dfac56df2c0f8de85a Mon Sep 17 00:00:00 2001
From: dirkf
Date: Sun, 3 Sep 2023 01:13:40 +0100
Subject: [PATCH 2/5] [test] Remove redundancy from lambda expected value regex

---
 test/helper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/helper.py b/test/helper.py
index fc55c6b46..5b7e3dfe2 100644
--- a/test/helper.py
+++ b/test/helper.py
@@ -142,7 +142,7 @@
         self.assertTrue(
             contains_str in got,
             'field %s (value: %r) should contain %r' % (field, got, contains_str))
-    elif isinstance(expected, compat_str) and re.match(r'^lambda \w+:', expected):
+    elif isinstance(expected, compat_str) and re.match(r'lambda \w+:', expected):
         fn = eval(expected)
         suite = expected.split(':', 1)[1].strip()
         self.assertTrue(

From bbd3e7e9999877104e1e47a8ed49f3b90257f083 Mon Sep 17 00:00:00 2001
From: dirkf
Date: Sun, 3 Sep 2023 01:18:22 +0100
Subject: [PATCH 3/5] [utils] Properly handle list values in update_url()

A value that was actually a list could be treated as a list of separate
values, because query_update follows the key: value_list format of
parse_qs().
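
For illustration (a sketch, not part of the change): under the parse_qs-style
reading that the existing tests exercise, a list value in query_update expands
into one parameter per element, e.g.

    from youtube_dl.utils import update_url

    # illustration only: example URL taken from test_update_url_query
    update_url('http://example.com/path',
               query_update={'system': ['LINUX', 'WINDOWS']})
    # -> 'http://example.com/path?system=LINUX&system=WINDOWS'

With this change each query_update value is instead wrapped as a single value,
so a caller passes plain values such as {'system': 'LINUX'} rather than
{'system': ['LINUX']}.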
---
 youtube_dl/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 81ff78807..fdf41b025 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -4257,7 +4257,7 @@ def update_url(url, **kwargs):
     query = kwargs.pop('query_update', None)
     if query:
         qs = compat_parse_qs(url.query)
-        qs.update(query)
+        qs.update((k, [v]) for k, v in query.items())
         kwargs['query'] = compat_urllib_parse_urlencode(qs, True)
     kwargs = compat_kwargs(kwargs)
     return compat_urllib_parse.urlunparse(url._replace(**kwargs))

From 66ab0814c4baa2dc79c2dd5287bc0ad61a37c5b9 Mon Sep 17 00:00:00 2001
From: dirkf
Date: Sun, 3 Sep 2023 23:15:19 +0100
Subject: [PATCH 4/5] [utils] Revert bbd3e7e, updating docstring, test instead

---
 test/test_utils.py  | 46 ++++++++++++++++++++++-----------------------
 youtube_dl/utils.py |  3 ++-
 2 files changed, 24 insertions(+), 25 deletions(-)

diff --git a/test/test_utils.py b/test/test_utils.py
index fdae1f744..102420fcb 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -62,13 +62,14 @@ from youtube_dl.utils import (
     OnDemandPagedList,
     orderedSet,
     parse_age_limit,
+    parse_bitrate,
     parse_duration,
     parse_filesize,
     parse_codecs,
     parse_count,
     parse_iso8601,
     parse_resolution,
-    parse_bitrate,
+    parse_qs,
     pkcs1pad,
     prepend_extension,
     read_batch_urls,
@@ -125,7 +126,6 @@ from youtube_dl.compat import (
     compat_setenv,
     compat_str,
     compat_urlparse,
-    compat_parse_qs,
 )
 
 
@@ -683,38 +683,36 @@ class TestUtil(unittest.TestCase):
         self.assertTrue(isinstance(data, bytes))
 
     def test_update_url_query(self):
-        def query_dict(url):
-            return compat_parse_qs(compat_urlparse.urlparse(url).query)
-        self.assertEqual(query_dict(update_url_query(
+        self.assertEqual(parse_qs(update_url_query(
             'http://example.com/path', {'quality': ['HD'], 'format': ['mp4']})),
-            query_dict('http://example.com/path?quality=HD&format=mp4'))
-        self.assertEqual(query_dict(update_url_query(
+            parse_qs('http://example.com/path?quality=HD&format=mp4'))
+        self.assertEqual(parse_qs(update_url_query(
             'http://example.com/path', {'system': ['LINUX', 'WINDOWS']})),
-            query_dict('http://example.com/path?system=LINUX&system=WINDOWS'))
-        self.assertEqual(query_dict(update_url_query(
+            parse_qs('http://example.com/path?system=LINUX&system=WINDOWS'))
+        self.assertEqual(parse_qs(update_url_query(
             'http://example.com/path', {'fields': 'id,formats,subtitles'})),
-            query_dict('http://example.com/path?fields=id,formats,subtitles'))
-        self.assertEqual(query_dict(update_url_query(
+            parse_qs('http://example.com/path?fields=id,formats,subtitles'))
+        self.assertEqual(parse_qs(update_url_query(
             'http://example.com/path', {'fields': ('id,formats,subtitles', 'thumbnails')})),
-            query_dict('http://example.com/path?fields=id,formats,subtitles&fields=thumbnails'))
-        self.assertEqual(query_dict(update_url_query(
+            parse_qs('http://example.com/path?fields=id,formats,subtitles&fields=thumbnails'))
+        self.assertEqual(parse_qs(update_url_query(
             'http://example.com/path?manifest=f4m', {'manifest': []})),
-            query_dict('http://example.com/path'))
-        self.assertEqual(query_dict(update_url_query(
+            parse_qs('http://example.com/path'))
+        self.assertEqual(parse_qs(update_url_query(
             'http://example.com/path?system=LINUX&system=WINDOWS', {'system': 'LINUX'})),
-            query_dict('http://example.com/path?system=LINUX'))
-        self.assertEqual(query_dict(update_url_query(
+            parse_qs('http://example.com/path?system=LINUX'))
+        self.assertEqual(parse_qs(update_url_query(
             'http://example.com/path', {'fields': b'id,formats,subtitles'})),
-            query_dict('http://example.com/path?fields=id,formats,subtitles'))
-        self.assertEqual(query_dict(update_url_query(
+            parse_qs('http://example.com/path?fields=id,formats,subtitles'))
+        self.assertEqual(parse_qs(update_url_query(
             'http://example.com/path', {'width': 1080, 'height': 720})),
-            query_dict('http://example.com/path?width=1080&height=720'))
-        self.assertEqual(query_dict(update_url_query(
+            parse_qs('http://example.com/path?width=1080&height=720'))
+        self.assertEqual(parse_qs(update_url_query(
             'http://example.com/path', {'bitrate': 5020.43})),
-            query_dict('http://example.com/path?bitrate=5020.43'))
-        self.assertEqual(query_dict(update_url_query(
+            parse_qs('http://example.com/path?bitrate=5020.43'))
+        self.assertEqual(parse_qs(update_url_query(
             'http://example.com/path', {'test': '第二行тест'})),
-            query_dict('http://example.com/path?test=%E7%AC%AC%E4%BA%8C%E8%A1%8C%D1%82%D0%B5%D1%81%D1%82'))
+            parse_qs('http://example.com/path?test=%E7%AC%AC%E4%BA%8C%E8%A1%8C%D1%82%D0%B5%D1%81%D1%82'))
 
     def test_multipart_encode(self):
         self.assertEqual(
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index fdf41b025..443d2609c 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -4248,6 +4248,7 @@ def update_url(url, **kwargs):
        url: compat_str or parsed URL tuple
        if query_update is in kwargs, update query with
        its value instead of replacing (overrides any `query`)
+       NB: query_update expects parse_qs() format: [key: value_list, ...]
        returns: compat_str
     """
     if not kwargs:
@@ -4257,7 +4258,7 @@ def update_url(url, **kwargs):
     query = kwargs.pop('query_update', None)
     if query:
         qs = compat_parse_qs(url.query)
-        qs.update((k, [v]) for k, v in query.items())
+        qs.update(query)
         kwargs['query'] = compat_urllib_parse_urlencode(qs, True)
     kwargs = compat_kwargs(kwargs)
     return compat_urllib_parse.urlunparse(url._replace(**kwargs))

From 00ef748cc0e35ee60efd0f7a00e373ab8d1af86b Mon Sep 17 00:00:00 2001
From: dirkf
Date: Sun, 24 Sep 2023 22:00:13 +0100
Subject: [PATCH 5/5] [downloader] Fix baa6c5e: show ETA of http download as ETA instead of total d/l time

---
 youtube_dl/downloader/common.py | 2 +-
 youtube_dl/downloader/http.py   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py
index afb4ee33d..91e691776 100644
--- a/youtube_dl/downloader/common.py
+++ b/youtube_dl/downloader/common.py
@@ -96,7 +96,7 @@ class FileDownloader(object):
                 return None
             return int(float(remaining) / rate)
         start, now = (start_or_rate, now_or_remaining)
-        total, current = args
+        total, current = args[:2]
         if total is None:
             return None
         if now is None:
diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py
index 28a49b9e8..3cad87420 100644
--- a/youtube_dl/downloader/http.py
+++ b/youtube_dl/downloader/http.py
@@ -294,7 +294,7 @@ class HttpFD(FileDownloader):
 
             # Progress message
             speed = self.calc_speed(start, now, byte_counter - ctx.resume_len)
-            eta = self.calc_eta(speed, ctx.data_len and (ctx.data_len - ctx.resume_len))
+            eta = self.calc_eta(speed, ctx.data_len and (ctx.data_len - byte_counter))
 
             self._hook_progress({
                 'status': 'downloading',
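
A worked example of the corrected remaining-bytes argument (illustrative
figures only, not from the patch): with ctx.data_len = 100 MiB,
ctx.resume_len = 0, byte_counter = 75 MiB and speed = 5 MiB/s,

    MiB = 2 ** 20
    old_remaining = 100 * MiB - 0          # data_len - resume_len: the whole download
    new_remaining = 100 * MiB - 75 * MiB   # data_len - byte_counter: bytes still to fetch
    old_eta = old_remaining / (5 * MiB)    # 20 s, i.e. roughly the total download time
    new_eta = new_remaining / (5 * MiB)    # 5 s, counting down as the download proceeds

so the progress line now reports the time remaining rather than the total
download time.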