From 782bfd26dbebea60e35f58ab18e218bedbecb782 Mon Sep 17 00:00:00 2001 From: "Lesmiscore (Naoya Ozaki)" Date: Thu, 24 Feb 2022 22:34:32 +0900 Subject: [PATCH 01/26] [bigo] add support for bigo.tv (#30635) * [bigo] add support for bigo.tv * [bigo] prepend "Bigo says" * title fallback * add error for invalid json data --- youtube_dl/extractor/bigo.py | 59 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 60 insertions(+) create mode 100644 youtube_dl/extractor/bigo.py diff --git a/youtube_dl/extractor/bigo.py b/youtube_dl/extractor/bigo.py new file mode 100644 index 000000000..ddf76ac55 --- /dev/null +++ b/youtube_dl/extractor/bigo.py @@ -0,0 +1,59 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ExtractorError, urlencode_postdata + + +class BigoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bigo\.tv/(?:[a-z]{2,}/)?(?P[^/]+)' + + _TESTS = [{ + 'url': 'https://www.bigo.tv/ja/221338632', + 'info_dict': { + 'id': '6576287577575737440', + 'title': '土よ〜💁‍♂️ 休憩室/REST room', + 'thumbnail': r're:https?://.+', + 'uploader': '✨Shin💫', + 'uploader_id': '221338632', + 'is_live': True, + }, + 'skip': 'livestream', + }, { + 'url': 'https://www.bigo.tv/th/Tarlerm1304', + 'only_matching': True, + }, { + 'url': 'https://bigo.tv/115976881', + 'only_matching': True, + }] + + def _real_extract(self, url): + user_id = self._match_id(url) + + info_raw = self._download_json( + 'https://bigo.tv/studio/getInternalStudioInfo', + user_id, data=urlencode_postdata({'siteId': user_id})) + + if not isinstance(info_raw, dict): + raise ExtractorError('Received invalid JSON data') + if info_raw.get('code'): + raise ExtractorError( + 'Bigo says: %s (code %s)' % (info_raw.get('msg'), info_raw.get('code')), expected=True) + info = info_raw.get('data') or {} + + if not info.get('alive'): + raise ExtractorError('This user is offline.', expected=True) + + return { + 'id': info.get('roomId') 
or user_id, + 'title': info.get('roomTopic') or info.get('nick_name') or user_id, + 'formats': [{ + 'url': info.get('hls_src'), + 'ext': 'mp4', + 'protocol': 'm3u8', + }], + 'thumbnail': info.get('snapshot'), + 'uploader': info.get('nick_name'), + 'uploader_id': user_id, + 'is_live': True, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 50b7cb4a0..c73c4cd6c 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -115,6 +115,7 @@ from .bfmtv import ( ) from .bibeltv import BibelTVIE from .bigflix import BigflixIE +from .bigo import BigoIE from .bild import BildIE from .bilibili import ( BiliBiliIE, From 923292ba643bf2a5c1fade797bd87a0de4f58d25 Mon Sep 17 00:00:00 2001 From: marieell Date: Thu, 10 Feb 2022 10:36:24 +0100 Subject: [PATCH 02/26] [aliexpress] Fix test case --- youtube_dl/extractor/aliexpress.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/aliexpress.py b/youtube_dl/extractor/aliexpress.py index 6f241e683..9722fe9ac 100644 --- a/youtube_dl/extractor/aliexpress.py +++ b/youtube_dl/extractor/aliexpress.py @@ -18,7 +18,7 @@ class AliExpressLiveIE(InfoExtractor): 'id': '2800002704436634', 'ext': 'mp4', 'title': 'CASIMA7.22', - 'thumbnail': r're:http://.*\.jpg', + 'thumbnail': r're:https?://.*\.jpg', 'uploader': 'CASIMA Official Store', 'timestamp': 1500717600, 'upload_date': '20170722', From 1f13ccfd7fcafbfd79ddd652967e02f9eda7ce79 Mon Sep 17 00:00:00 2001 From: dirkf Date: Thu, 24 Feb 2022 18:26:58 +0000 Subject: [PATCH 03/26] Fixed groups() call on potentially empty regex search object (#30676) * Fixed groups() call on potentially empty regex search object. 
- https://github.com/ytdl-org/youtube-dl/issues/30521 * minimising lines changed Co-authored-by: yayorbitgum <50963144+yayorbitgum@users.noreply.github.com> --- youtube_dl/extractor/myspass.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/myspass.py b/youtube_dl/extractor/myspass.py index db7ebc94c..f540c52ee 100644 --- a/youtube_dl/extractor/myspass.py +++ b/youtube_dl/extractor/myspass.py @@ -35,7 +35,9 @@ class MySpassIE(InfoExtractor): title = xpath_text(metadata, 'title', fatal=True) video_url = xpath_text(metadata, 'url_flv', 'download url', True) video_id_int = int(video_id) - for group in re.search(r'/myspass2009/\d+/(\d+)/(\d+)/(\d+)/', video_url).groups(): + + grps = re.search(r'/myspass2009/\d+/(\d+)/(\d+)/(\d+)/', video_url) + for group in grps.groups() if grps else []: group_int = int(group) if group_int > video_id_int: video_url = video_url.replace( From c4d1738316db45e03e0625650b3550334b66ab7f Mon Sep 17 00:00:00 2001 From: dirkf Date: Thu, 24 Feb 2022 09:16:16 +0000 Subject: [PATCH 04/26] [CPAC] Add extractor for Canadian Parliament CPACIE: single episode CPACPlaylistIE: playlists and searches --- youtube_dl/extractor/cpac.py | 148 +++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 4 + 2 files changed, 152 insertions(+) create mode 100644 youtube_dl/extractor/cpac.py diff --git a/youtube_dl/extractor/cpac.py b/youtube_dl/extractor/cpac.py new file mode 100644 index 000000000..22741152c --- /dev/null +++ b/youtube_dl/extractor/cpac.py @@ -0,0 +1,148 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + str_or_none, + try_get, + unified_timestamp, + update_url_query, + urljoin, +) + +# compat_range +try: + if callable(xrange): + range = xrange +except (NameError, TypeError): + pass + + +class CPACIE(InfoExtractor): + IE_NAME = 'cpac' + _VALID_URL = 
r'https?://(?:www\.)?cpac\.ca/(?Pl-)?episode\?id=(?P[\da-f]{8}(?:-[\da-f]{4}){3}-[\da-f]{12})' + _TEST = { + # 'url': 'http://www.cpac.ca/en/programs/primetime-politics/episodes/65490909', + 'url': 'https://www.cpac.ca/episode?id=fc7edcae-4660-47e1-ba61-5b7f29a9db0f', + 'md5': 'e46ad699caafd7aa6024279f2614e8fa', + 'info_dict': { + 'id': 'fc7edcae-4660-47e1-ba61-5b7f29a9db0f', + 'ext': 'mp4', + 'upload_date': '20220215', + 'title': 'News Conference to Celebrate National Kindness Week – February 15, 2022', + 'description': 'md5:466a206abd21f3a6f776cdef290c23fb', + 'timestamp': 1644901200, + }, + 'params': { + 'format': 'bestvideo', + 'hls_prefer_native': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + url_lang = 'fr' if '/l-episode?' in url else 'en' + + content = self._download_json( + 'https://www.cpac.ca/api/1/services/contentModel.json?url=/site/website/episode/index.xml&crafterSite=cpacca&id=' + video_id, + video_id) + video_url = try_get(content, lambda x: x['page']['details']['videoUrl'], compat_str) + formats = [] + if video_url: + content = content['page'] + title = str_or_none(content['details']['title_%s_t' % (url_lang, )]) + formats = self._extract_m3u8_formats(video_url, video_id, m3u8_id='hls', ext='mp4') + for fmt in formats: + # prefer language to match URL + fmt_lang = fmt.get('language') + if fmt_lang == url_lang: + fmt['language_preference'] = 10 + elif not fmt_lang: + fmt['language_preference'] = -1 + else: + fmt['language_preference'] = -10 + + self._sort_formats(formats) + + category = str_or_none(content['details']['category_%s_t' % (url_lang, )]) + + def is_live(v_type): + return (v_type == 'live') if v_type is not None else None + + return { + 'id': video_id, + 'formats': formats, + 'title': title, + 'description': str_or_none(content['details'].get('description_%s_t' % (url_lang, ))), + 'timestamp': unified_timestamp(content['details'].get('liveDateTime')), + 'category': [category] if category else None, + 
'thumbnail': urljoin(url, str_or_none(content['details'].get('image_%s_s' % (url_lang, )))), + 'is_live': is_live(content['details'].get('type')), + } + + +class CPACPlaylistIE(InfoExtractor): + IE_NAME = 'cpac:playlist' + _VALID_URL = r'(?i)https?://(?:www\.)?cpac\.ca/(?:program|search|(?Pemission|rechercher))\?(?:[^&]+&)*?(?P(?:id=\d+|programId=\d+|key=[^&]+))' + + _TESTS = [{ + 'url': 'https://www.cpac.ca/program?id=6', + 'info_dict': { + 'id': 'id=6', + 'title': 'Headline Politics', + 'description': 'Watch CPAC’s signature long-form coverage of the day’s pressing political events as they unfold.', + }, + 'playlist_count': 10, + }, { + 'url': 'https://www.cpac.ca/search?key=hudson&type=all&order=desc', + 'info_dict': { + 'id': 'key=hudson', + 'title': 'hudson', + }, + 'playlist_count': 22, + }, { + 'url': 'https://www.cpac.ca/search?programId=50', + 'info_dict': { + 'id': 'programId=50', + 'title': '50', + }, + 'playlist_count': 9, + }, { + 'url': 'https://www.cpac.ca/emission?id=6', + 'only_matching': True, + }, { + 'url': 'https://www.cpac.ca/rechercher?key=hudson&type=all&order=desc', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + url_lang = 'fr' if any(x in url for x in ('/emission?', '/rechercher?')) else 'en' + pl_type, list_type = ('program', 'itemList') if any(x in url for x in ('/program?', '/emission?')) else ('search', 'searchResult') + api_url = ( + 'https://www.cpac.ca/api/1/services/contentModel.json?url=/site/website/%s/index.xml&crafterSite=cpacca&%s' + % (pl_type, video_id, )) + content = self._download_json(api_url, video_id) + entries = [] + total_pages = int_or_none(try_get(content, lambda x: x['page'][list_type]['totalPages']), default=1) + for page in range(1, total_pages + 1): + if page > 1: + api_url = update_url_query(api_url, {'page': '%d' % (page, ), }) + content = self._download_json( + api_url, video_id, + note='Downloading continuation - %d' % (page, ), + fatal=False) + + for item 
in try_get(content, lambda x: x['page'][list_type]['item'], list) or []: + episode_url = urljoin(url, try_get(item, lambda x: x['url_%s_s' % (url_lang, )])) + if episode_url: + entries.append(episode_url) + + return self.playlist_result( + (self.url_result(entry) for entry in entries), + playlist_id=video_id, + playlist_title=try_get(content, lambda x: x['page']['program']['title_%s_t' % (url_lang, )]) or video_id.split('=')[-1], + playlist_description=try_get(content, lambda x: x['page']['program']['description_%s_t' % (url_lang, )]), + ) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index c73c4cd6c..7c99cb7e0 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -255,6 +255,10 @@ from .commonprotocols import ( from .condenast import CondeNastIE from .contv import CONtvIE from .corus import CorusIE +from .cpac import ( + CPACIE, + CPACPlaylistIE, +) from .cracked import CrackedIE from .crackle import CrackleIE from .crooksandliars import CrooksAndLiarsIE From f8e543c9063c1c7ad157936cb6a15b428ddb3896 Mon Sep 17 00:00:00 2001 From: dirkf Date: Mon, 7 Feb 2022 20:06:27 +0000 Subject: [PATCH 05/26] [Alsace20TV] Add new extractors Alsace20TVIE, Alsace20TVEmbedIE --- youtube_dl/extractor/alsace20tv.py | 89 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 4 ++ 2 files changed, 93 insertions(+) create mode 100644 youtube_dl/extractor/alsace20tv.py diff --git a/youtube_dl/extractor/alsace20tv.py b/youtube_dl/extractor/alsace20tv.py new file mode 100644 index 000000000..228cec3ec --- /dev/null +++ b/youtube_dl/extractor/alsace20tv.py @@ -0,0 +1,89 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_html, + dict_get, + get_element_by_class, + int_or_none, + unified_strdate, + url_or_none, +) + + +class Alsace20TVIE(InfoExtractor): + _VALID_URL = 
r'https?://(?:www\.)?alsace20\.tv/(?:[\w-]+/)+[\w-]+-(?P[\w]+)' + _TESTS = [{ + 'url': 'https://www.alsace20.tv/VOD/Actu/JT/Votre-JT-jeudi-3-fevrier-lyNHCXpYJh.html', + # 'md5': 'd91851bf9af73c0ad9b2cdf76c127fbb', + 'info_dict': { + 'id': 'lyNHCXpYJh', + 'ext': 'mp4', + 'description': 'md5:fc0bc4a0692d3d2dba4524053de4c7b7', + 'title': 'Votre JT du jeudi 3 février', + 'upload_date': '20220203', + 'thumbnail': r're:https?://.+\.jpg', + 'duration': 1073, + 'view_count': int, + }, + 'params': { + 'format': 'bestvideo', + }, + }] + + def _extract_video(self, video_id, url=None): + info = self._download_json( + 'https://www.alsace20.tv/visionneuse/visio_v9_js.php?key=%s&habillage=0&mode=html' % (video_id, ), + video_id) or {} + title = info['titre'] + + formats = [] + for res, fmt_url in (info.get('files') or {}).items(): + formats.extend( + self._extract_smil_formats(fmt_url, video_id, fatal=False) + if '/smil:_' in fmt_url + else self._extract_mpd_formats(fmt_url, video_id, mpd_id=res, fatal=False)) + self._sort_formats(formats) + + webpage = (url and self._download_webpage(url, video_id, fatal=False)) or '' + thumbnail = url_or_none(dict_get(info, ('image', 'preview', )) or self._og_search_thumbnail(webpage)) + upload_date = self._search_regex(r'/(\d{6})_', thumbnail, 'upload_date', default=None) + upload_date = unified_strdate('20%s-%s-%s' % (upload_date[:2], upload_date[2:4], upload_date[4:])) if upload_date else None + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'description': clean_html(get_element_by_class('wysiwyg', webpage)), + 'upload_date': upload_date, + 'thumbnail': thumbnail, + 'duration': int_or_none(self._og_search_property('video:duration', webpage) if webpage else None), + 'view_count': int_or_none(info.get('nb_vues')), + } + + def _real_extract(self, url): + video_id = self._match_id(url) + return self._extract_video(video_id, url) + + +class Alsace20TVEmbedIE(Alsace20TVIE): + _VALID_URL = 
r'https?://(?:www\.)?alsace20\.tv/emb/(?P[\w]+)' + _TESTS = [{ + 'url': 'https://www.alsace20.tv/emb/lyNHCXpYJh', + # 'md5': 'd91851bf9af73c0ad9b2cdf76c127fbb', + 'info_dict': { + 'id': 'lyNHCXpYJh', + 'ext': 'mp4', + 'title': 'Votre JT du jeudi 3 février', + 'upload_date': '20220203', + 'thumbnail': r're:https?://.+\.jpg', + 'view_count': int, + }, + 'params': { + 'format': 'bestvideo', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self._extract_video(video_id) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 7c99cb7e0..535080d0a 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -51,6 +51,10 @@ from .anvato import AnvatoIE from .aol import AolIE from .allocine import AllocineIE from .aliexpress import AliExpressLiveIE +from .alsace20tv import ( + Alsace20TVIE, + Alsace20TVEmbedIE, +) from .apa import APAIE from .aparat import AparatIE from .appleconnect import AppleConnectIE From 4194d253c0b922addf0439228066cb4fb487bac3 Mon Sep 17 00:00:00 2001 From: dirkf Date: Fri, 30 Jul 2021 12:58:19 +0100 Subject: [PATCH 06/26] Avoid skipping ID when unlisted_hash is numeric Pattern needed a non-greedy match; also replaced a redundant test with one for this, issue 29690 --- youtube_dl/extractor/vimeo.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 0b386f450..a66912502 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -271,7 +271,7 @@ class VimeoIE(VimeoBaseInfoExtractor): )? vimeo(?:pro)?\.com/ (?!(?:channels|album|showcase)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/) - (?:.*?/)? + (?:.*?/)?? 
(?: (?: play_redirect_hls| @@ -517,14 +517,28 @@ class VimeoIE(VimeoBaseInfoExtractor): 'url': 'https://vimeo.com/7809605', 'only_matching': True, }, - { - 'url': 'https://vimeo.com/160743502/abd0e13fb4', - 'only_matching': True, - }, { # requires passing unlisted_hash(a52724358e) to load_download_config request 'url': 'https://vimeo.com/392479337/a52724358e', 'only_matching': True, + }, + { + # similar, but all numeric: ID must be 581039021, not 9603038895 + # issue #29690 + 'url': 'https://vimeo.com/581039021/9603038895', + 'info_dict': { + 'id': '581039021', + # these have to be provided but we don't care + 'ext': 'mp4', + 'timestamp': 1627621014, + 'title': 're:.+', + 'uploader_id': 're:.+', + 'uploader': 're:.+', + 'upload_date': r're:\d+', + }, + 'params': { + 'skip_download': True, + }, } # https://gettingthingsdone.com/workflowmap/ # vimeo embed with check-password page protected by Referer header From 6508688e88c83bb811653083db9351702cd39a6a Mon Sep 17 00:00:00 2001 From: df Date: Sun, 1 Aug 2021 09:42:57 +0100 Subject: [PATCH 07/26] Make default upload_/release_date a compat_str Ensures download tests pass in Python 2 as well as 3; also add YoutubeDL tests for timestamp -> upload_date etc. 
--- test/test_YoutubeDL.py | 19 +++++++++++++++++++ youtube_dl/YoutubeDL.py | 2 +- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index a35effe0e..f8c8e619c 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -997,6 +997,25 @@ class TestYoutubeDL(unittest.TestCase): self.assertEqual(downloaded['extractor'], 'Video') self.assertEqual(downloaded['extractor_key'], 'Video') + def test_default_times(self): + """Test addition of missing upload/release/_date from /release_/timestamp""" + info = { + 'id': '1234', + 'url': TEST_URL, + 'title': 'Title', + 'ext': 'mp4', + 'timestamp': 1631352900, + 'release_timestamp': 1632995931, + } + + params = {'simulate': True, } + ydl = FakeYDL(params) + out_info = ydl.process_ie_result(info) + self.assertTrue(isinstance(out_info['upload_date'], compat_str)) + self.assertEqual(out_info['upload_date'], '20210911') + self.assertTrue(isinstance(out_info['release_date'], compat_str)) + self.assertEqual(out_info['release_date'], '20210930') + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index fe30758ef..69736acff 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1529,7 +1529,7 @@ class YoutubeDL(object): # see http://bugs.python.org/issue1646728) try: upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key]) - info_dict[date_key] = upload_date.strftime('%Y%m%d') + info_dict[date_key] = compat_str(upload_date.strftime('%Y%m%d')) except (ValueError, OverflowError, OSError): pass From 49c5293014bc11ec8c009856cd63cffa6296c1e1 Mon Sep 17 00:00:00 2001 From: dirkf Date: Tue, 22 Feb 2022 11:24:06 +0000 Subject: [PATCH 08/26] Ignore --external-downloader-args if --external-downloader was rejected ... 
and generate warning --- youtube_dl/YoutubeDL.py | 11 ++++++++++- youtube_dl/downloader/__init__.py | 3 +++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 69736acff..019e309cb 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1906,8 +1906,17 @@ class YoutubeDL(object): if not self.params.get('skip_download', False): try: + def checked_get_suitable_downloader(info_dict, params): + ed_args = params.get('external_downloader_args') + dler = get_suitable_downloader(info_dict, params) + if ed_args and not params.get('external_downloader_args'): + # external_downloader_args was cleared because external_downloader was rejected + self.report_warning('Requested external downloader cannot be used: ' + 'ignoring --external-downloader-args.') + return dler + def dl(name, info): - fd = get_suitable_downloader(info, self.params)(self, self.params) + fd = checked_get_suitable_downloader(info, self.params)(self, self.params) for ph in self._progress_hooks: fd.add_progress_hook(ph) if self.params.get('verbose'): diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py index d8f2fa342..d701d6292 100644 --- a/youtube_dl/downloader/__init__.py +++ b/youtube_dl/downloader/__init__.py @@ -50,6 +50,9 @@ def _get_suitable_downloader(info_dict, params={}): ed = get_external_downloader(external_downloader) if ed.can_download(info_dict): return ed + # Avoid using unwanted args since external_downloader was rejected + if params.get('external_downloader_args'): + params['external_downloader_args'] = None protocol = info_dict['protocol'] if protocol.startswith('m3u8') and info_dict.get('is_live'): From 17d295a1ec6d04362740dd8a0c583690f5ba082a Mon Sep 17 00:00:00 2001 From: lihan7 Date: Fri, 25 Mar 2022 15:46:28 +0800 Subject: [PATCH 09/26] [extractor/bilibili] Fix path "/audio/auxxxxx" download return 403 --- youtube_dl/extractor/bilibili.py | 5 +++++ 1 file changed, 5 
insertions(+) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index bff6ea194..d42f0e98a 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -369,6 +369,11 @@ class BilibiliAudioIE(BilibiliAudioBaseIE): 'filesize': int_or_none(play_data.get('size')), }] + for a_format in formats: + a_format.setdefault('http_headers', {}).update({ + 'Referer': url, + }) + song = self._call_api('song/info', au_id) title = song['title'] statistic = song.get('statistic') or {} From 9e5ca66f16998eb2a680e23a6e769e34001898c5 Mon Sep 17 00:00:00 2001 From: nixxo Date: Mon, 4 Jan 2021 15:11:47 +0100 Subject: [PATCH 10/26] [RAI] Added checks for DRM protected content (PR #27657) reviewed by pukkandan (https://github.com/yt-dlp/yt-dlp/pull/150) --- youtube_dl/extractor/rai.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 67b86fc72..2abe164e0 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -158,6 +158,10 @@ class RaiPlayIE(RaiBaseIE): # subtitles at 'subtitlesArray' key (see #27698) 'url': 'https://www.raiplay.it/video/2020/12/Report---04-01-2021-2e90f1de-8eee-4de4-ac0e-78d21db5b600.html', 'only_matching': True, + }, { + # DRM protected + 'url': 'https://www.raiplay.it/video/2020/09/Lo-straordinario-mondo-di-Zoey-S1E1-Lo-straordinario-potere-di-Zoey-ed493918-1d32-44b7-8454-862e473d00ff.html', + 'only_matching': True, }] def _real_extract(self, url): @@ -166,6 +170,13 @@ class RaiPlayIE(RaiBaseIE): media = self._download_json( base + '.json', video_id, 'Downloading video JSON') + if try_get( + media, + (lambda x: x['rights_management']['rights']['drm'], + lambda x: x['program_info']['rights_management']['rights']['drm']), + dict): + raise ExtractorError('This video is DRM protected.', expected=True) + title = media['name'] video = media['video'] From 1f50a07771fddb5f64617617d156bfdd593f951e Mon Sep 17 00:00:00 2001 
From: nixxo Date: Wed, 27 Jan 2021 12:24:50 +0100 Subject: [PATCH 11/26] [RAI] Extend formats with direct http mp4 link (PR #27990) * initial support for creating direct mp4 link * improved regexes and info extraction * added "connection: close" to request headers * updated to https://github.com/yt-dlp/yt-dlp/pull/208 --- youtube_dl/extractor/rai.py | 111 +++++++++++++++++++++++++++++++++++- 1 file changed, 108 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 2abe164e0..7b0315a62 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -5,15 +5,16 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urlparse, compat_str, + compat_urlparse, ) from ..utils import ( - ExtractorError, determine_ext, + ExtractorError, find_xpath_attr, fix_xml_ampersands, GeoRestrictedError, + HEADRequest, int_or_none, parse_duration, remove_start, @@ -96,12 +97,100 @@ class RaiBaseIE(InfoExtractor): if not formats and geoprotection is True: self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + formats.extend(self._create_http_urls(relinker_url, formats)) + return dict((k, v) for k, v in { 'is_live': is_live, 'duration': duration, 'formats': formats, }.items() if v is not None) + def _create_http_urls(self, relinker_url, fmts): + _RELINKER_REG = r'https?://(?P[^/]+?)/(?:i/)?(?P[^/]+?)/(?P.+?)/(?P\d+)(?:_(?P[\d\,]+))?(?:\.mp4|/playlist\.m3u8).+?' 
+ _MP4_TMPL = '%s&overrideUserAgentRule=mp4-%s' + _QUALITY = { + # tbr: w, h + '250': [352, 198], + '400': [512, 288], + '700': [512, 288], + '800': [700, 394], + '1200': [736, 414], + '1800': [1024, 576], + '2400': [1280, 720], + '3200': [1440, 810], + '3600': [1440, 810], + '5000': [1920, 1080], + '10000': [1920, 1080], + } + + def test_url(url): + resp = self._request_webpage( + HEADRequest(url), None, headers={'User-Agent': 'Rai'}, + fatal=False, errnote=False, note=False) + + if resp is False: + return False + + if resp.code == 200: + return False if resp.url == url else resp.url + return None + + def get_format_info(tbr): + import math + br = int_or_none(tbr) + if len(fmts) == 1 and not br: + br = fmts[0].get('tbr') + if br > 300: + tbr = compat_str(math.floor(br / 100) * 100) + else: + tbr = '250' + + # try extracting info from available m3u8 formats + format_copy = None + for f in fmts: + if f.get('tbr'): + br_limit = math.floor(br / 100) + if br_limit - 1 <= math.floor(f['tbr'] / 100) <= br_limit + 1: + format_copy = f.copy() + return { + 'width': format_copy.get('width'), + 'height': format_copy.get('height'), + 'tbr': format_copy.get('tbr'), + 'vcodec': format_copy.get('vcodec'), + 'acodec': format_copy.get('acodec'), + 'fps': format_copy.get('fps'), + 'format_id': 'https-%s' % tbr, + } if format_copy else { + 'width': _QUALITY[tbr][0], + 'height': _QUALITY[tbr][1], + 'format_id': 'https-%s' % tbr, + 'tbr': int(tbr), + } + + loc = test_url(_MP4_TMPL % (relinker_url, '*')) + if not isinstance(loc, compat_str): + return [] + + mobj = re.match( + _RELINKER_REG, + test_url(relinker_url) or '') + if not mobj: + return [] + + available_qualities = mobj.group('quality').split(',') if mobj.group('quality') else ['*'] + available_qualities = [i for i in available_qualities if i] + + formats = [] + for q in available_qualities: + fmt = { + 'url': _MP4_TMPL % (relinker_url, q), + 'protocol': 'https', + 'ext': 'mp4', + } + fmt.update(get_format_info(q)) + 
formats.append(fmt) + return formats + @staticmethod def _extract_subtitles(url, video_data): STL_EXT = 'stl' @@ -151,6 +240,22 @@ class RaiPlayIE(RaiBaseIE): 'params': { 'skip_download': True, }, + }, { + # 1080p direct mp4 url + 'url': 'https://www.raiplay.it/video/2021/03/Leonardo-S1E1-b5703b02-82ee-475a-85b6-c9e4a8adf642.html', + 'md5': '2e501e8651d72f05ffe8f5d286ad560b', + 'info_dict': { + 'id': 'b5703b02-82ee-475a-85b6-c9e4a8adf642', + 'ext': 'mp4', + 'title': 'Leonardo - S1E1', + 'alt_title': 'St 1 Ep 1 - Episodio 1', + 'description': 'md5:f5360cd267d2de146e4e3879a5a47d31', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Rai 1', + 'duration': 3229, + 'series': 'Leonardo', + 'season': 'Season 1', + }, }, { 'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?', 'only_matching': True, @@ -318,7 +423,7 @@ class RaiIE(RaiBaseIE): }, { # with ContentItem in og:url 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html', - 'md5': '6865dd00cf0bbf5772fdd89d59bd768a', + 'md5': '06345bd97c932f19ffb129973d07a020', 'info_dict': { 'id': 'efb17665-691c-45d5-a60c-5301333cbb0c', 'ext': 'mp4', From 871645a4a4a0e12ec8f7bf78a3ad7bf75838ee5c Mon Sep 17 00:00:00 2001 From: nixxo Date: Sat, 2 Apr 2022 07:57:56 +0200 Subject: [PATCH 12/26] [RAI] Fix extraction of http formats From https://github.com/yt-dlp/yt-dlp/pull/3272 Closes https://github.com/yt-dlp/yt-dlp/issues/3270 Authored by: nixxo --- youtube_dl/extractor/rai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 7b0315a62..563d3400f 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -106,7 +106,7 @@ class RaiBaseIE(InfoExtractor): }.items() if v is not None) def _create_http_urls(self, relinker_url, fmts): - _RELINKER_REG = 
r'https?://(?P[^/]+?)/(?:i/)?(?P[^/]+?)/(?P.+?)/(?P\d+)(?:_(?P[\d\,]+))?(?:\.mp4|/playlist\.m3u8).+?' + _RELINKER_REG = r'https?://(?P[^/]+?)/(?:i/)?(?P[^/]+?)/(?P.+?)/(?P\w+)(?:_(?P[\d\,]+))?(?:\.mp4|/playlist\.m3u8).+?' _MP4_TMPL = '%s&overrideUserAgentRule=mp4-%s' _QUALITY = { # tbr: w, h From b764dbe7730bc5b0a4f30f4f89fd85e096d0c4a0 Mon Sep 17 00:00:00 2001 From: dirkf Date: Sun, 10 Apr 2022 05:49:09 +0100 Subject: [PATCH 13/26] Disable blank issues --- .github/ISSUE_TEMPLATE/config.yml | 1 + 1 file changed, 1 insertion(+) create mode 100644 .github/ISSUE_TEMPLATE/config.yml diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 000000000..3ba13e0ce --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1 @@ +blank_issues_enabled: false From a0068bd6bec16008bda7a39caecccbf84881c603 Mon Sep 17 00:00:00 2001 From: dirkf Date: Fri, 15 Apr 2022 16:07:09 +0100 Subject: [PATCH 14/26] [Youtube] Fix "n" descrambling for player fae06c11 Resolves #30856. --- youtube_dl/extractor/youtube.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 41695a561..ff6c7b0f8 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1464,15 +1464,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # 2. https://code.videolan.org/videolan/vlc/-/blob/4fb284e5af69aa9ac2100ccbdd3b88debec9987f/share/lua/playlist/youtube.lua#L116 # 3. https://github.com/ytdl-org/youtube-dl/issues/30097#issuecomment-950157377 def _extract_n_function_name(self, jscode): - target = r'(?P[a-zA-Z0-9$]{3})(?:\[(?P\d+)\])?' + target = r'(?P[a-zA-Z_$][\w$]*)(?:\[(?P\d+)\])?' 
nfunc_and_idx = self._search_regex( - r'\.get\("n"\)\)&&\(b=(%s)\([a-zA-Z0-9]\)' % (target, ), + r'\.get\("n"\)\)&&\(b=(%s)\([\w$]+\)' % (target, ), jscode, 'Initial JS player n function name') nfunc, idx = re.match(target, nfunc_and_idx).group('nfunc', 'idx') if not idx: return nfunc return self._parse_json(self._search_regex( - r'var %s\s*=\s*(\[.+?\]);' % (nfunc, ), jscode, + r'var %s\s*=\s*(\[.+?\]);' % (re.escape(nfunc), ), jscode, 'Initial JS player n function list ({nfunc}[{idx}])'.format(**locals())), nfunc, transform_source=js_to_json)[int(idx)] def _extract_n_function(self, video_id, player_url): From ebc627847cd1f5faddf4bd90376c1635777283cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81rni=20Dagur?= Date: Thu, 28 Apr 2022 11:18:10 +0200 Subject: [PATCH 15/26] [KTH] Add new extractor for KTH play (#30885) * Implement extractor for KTH play * Make KTH Play url regex more relaxed --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/kaltura.py | 2 +- youtube_dl/extractor/kth.py | 31 ++++++++++++++++++++++++++++++ 3 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 youtube_dl/extractor/kth.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 535080d0a..452caeade 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -557,6 +557,7 @@ from .kinja import KinjaEmbedIE from .kinopoisk import KinoPoiskIE from .konserthusetplay import KonserthusetPlayIE from .krasview import KrasViewIE +from .kth import KTHIE from .ku6 import Ku6IE from .kusi import KUSIIE from .kuwo import ( diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index c731612c4..6d4d93394 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -373,5 +373,5 @@ class KalturaIE(InfoExtractor): 'duration': info.get('duration'), 'timestamp': info.get('createdAt'), 'uploader_id': info.get('userId') if info.get('userId') != 'None' else None, - 
'view_count': info.get('plays'), + 'view_count': int_or_none(info.get('plays')), } diff --git a/youtube_dl/extractor/kth.py b/youtube_dl/extractor/kth.py new file mode 100644 index 000000000..b8db461f5 --- /dev/null +++ b/youtube_dl/extractor/kth.py @@ -0,0 +1,31 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import smuggle_url + + +class KTHIE(InfoExtractor): + _VALID_URL = r'https?://play\.kth\.se/(?:[^/]+/)+(?P[a-z0-9_]+)' + _TEST = { + 'url': 'https://play.kth.se/media/Lunch+breakA+De+nya+aff%C3%A4rerna+inom+Fordonsdalen/0_uoop6oz9', + 'md5': 'd83ada6d00ca98b73243a88efe19e8a6', + 'info_dict': { + 'id': '0_uoop6oz9', + 'ext': 'mp4', + 'title': 'md5:bd1d6931facb6828762a33e6ce865f37', + 'thumbnail': 're:https?://.+/thumbnail/.+', + 'duration': 3516, + 'timestamp': 1647345358, + 'upload_date': '20220315', + 'uploader_id': 'md5:0ec23e33a89e795a4512930c8102509f', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + result = self.url_result( + smuggle_url('kaltura:308:%s' % video_id, { + 'service_url': 'https://api.kaltura.nordu.net'}), + 'Kaltura') + return result From e27d8d819fa69d5714ea1682a1d5d56f617461fc Mon Sep 17 00:00:00 2001 From: dirkf Date: Fri, 29 Apr 2022 13:36:02 +0100 Subject: [PATCH 16/26] [streamcz] Remove empty `'{}'.format()` for Py2.6 Use `'-'.join()` here, or `{0}`, ..., in general. 
--- youtube_dl/extractor/streamcz.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/streamcz.py b/youtube_dl/extractor/streamcz.py index 060ba32e0..97b2eb7f8 100644 --- a/youtube_dl/extractor/streamcz.py +++ b/youtube_dl/extractor/streamcz.py @@ -62,7 +62,7 @@ class StreamCZIE(InfoExtractor): if not stream.get('url'): continue yield merge_dicts({ - 'format_id': '{}-{}'.format(format_id, ext), + 'format_id': '-'.join((format_id, ext)), 'ext': ext, 'source_preference': pref, 'url': urljoin(spl_url, stream['url']), From e988fa4523e489596a2a27c4d45275e44db49406 Mon Sep 17 00:00:00 2001 From: dirkf Date: Thu, 28 Apr 2022 15:25:49 +0100 Subject: [PATCH 17/26] [doc] Clarify test naming --- README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 2841ed68f..cd888c731 100644 --- a/README.md +++ b/README.md @@ -1069,9 +1069,11 @@ After you have ensured this site is distributing its content legally, you can fo } ``` 5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/extractors.py). -6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with `only_matching` key in test's dict are not counted in. -7. 
Have a look at [`youtube_dl/extractor/common.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/ytdl-org/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303). Add tests and code for as many as you want. -8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://flake8.pycqa.org/en/latest/index.html#quickstart): +6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test (actually, test case) then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note: + * the test names use the extractor class name **without the trailing `IE`** + * tests with `only_matching` key in test's dict are not counted. +7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/ytdl-org/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303). Add tests and code for as many as you want. +8. 
Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://flake8.pycqa.org/en/latest/index.html#quickstart): $ flake8 youtube_dl/extractor/yourextractor.py From c7965b9fc2cae54f244f31f5373cb81a40e822ab Mon Sep 17 00:00:00 2001 From: dirkf Date: Mon, 9 May 2022 18:54:41 +0100 Subject: [PATCH 18/26] [NHK] Support alphabetic characters in 7-char NhkVod IDs (#29682) --- youtube_dl/extractor/nhk.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/nhk.py b/youtube_dl/extractor/nhk.py index 8a9331a79..46a800e7e 100644 --- a/youtube_dl/extractor/nhk.py +++ b/youtube_dl/extractor/nhk.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import re @@ -23,7 +24,7 @@ class NhkBaseIE(InfoExtractor): def _extract_episode_info(self, url, episode=None): fetch_episode = episode is None lang, m_type, episode_id = re.match(NhkVodIE._VALID_URL, url).groups() - if episode_id.isdigit(): + if len(episode_id) == 7: episode_id = episode_id[:4] + '-' + episode_id[4:] is_video = m_type == 'video' @@ -84,7 +85,8 @@ class NhkBaseIE(InfoExtractor): class NhkVodIE(NhkBaseIE): - _VALID_URL = r'%s%s(?P<id>\d{7}|[^/]+?-\d{8}-[0-9a-z]+)' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX) + # the 7-character IDs can have alphabetic chars too: assume [a-z] rather than just [a-f], eg + _VALID_URL = r'%s%s(?P<id>[0-9a-z]{7}|[^/]+?-\d{8}-[0-9a-z]+)' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX) # Content available only for a limited period of time. Visit # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples. 
_TESTS = [{ @@ -124,6 +126,19 @@ class NhkVodIE(NhkBaseIE): }, { 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/j_art-20150903-1/', 'only_matching': True, + }, { + # video, alphabetic character in ID #29670 + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999a34/', + 'only_matching': True, + 'info_dict': { + 'id': 'qfjay6cg', + 'ext': 'mp4', + 'title': 'DESIGN TALKS plus - Fishermen’s Finery', + 'description': 'md5:8a8f958aaafb0d7cb59d38de53f1e448', + 'thumbnail': r're:^https?:/(/[a-z0-9.-]+)+\.jpg\?w=1920&h=1080$', + 'upload_date': '20210615', + 'timestamp': 1623722008, + } }] def _real_extract(self, url): From c3deca86aedd2d8ab7cd0c596fd68b7aeb7c042d Mon Sep 17 00:00:00 2001 From: dirkf Date: Thu, 19 May 2022 17:41:48 +0000 Subject: [PATCH 19/26] [wat.tv] Add version `pver` to metadata API call Resolves #30959. --- youtube_dl/extractor/wat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index f1bccc2d6..b15e03768 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -57,7 +57,7 @@ class WatIE(InfoExtractor): # 'http://www.wat.tv/interface/contentv4s/' + video_id, video_id) video_data = self._download_json( 'https://mediainfo.tf1.fr/mediainfocombo/' + video_id, - video_id, query={'context': 'MYTF1'}) + video_id, query={'context': 'MYTF1', 'pver': '4001000'}) video_info = video_data['media'] error_desc = video_info.get('error_desc') From be35e5343a6c31f5f32ee216ab4486a1992260c5 Mon Sep 17 00:00:00 2001 From: Jacob Chapman <7908073+chapmanjacobd@users.noreply.github.com> Date: Wed, 13 Apr 2022 07:21:23 -0500 Subject: [PATCH 20/26] Update options.py --- youtube_dl/options.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 0a0641bd4..6521ad881 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -270,11 +270,11 @@ def parseOpts(overrideArguments=None): 
selection.add_option( '--match-title', dest='matchtitle', metavar='REGEX', - help='Download only matching titles (regex or caseless sub-string)') + help='Download only matching titles (case-insensitive regex or sub-string)') selection.add_option( '--reject-title', dest='rejecttitle', metavar='REGEX', - help='Skip download for matching titles (regex or caseless sub-string)') + help='Skip download for matching titles (case-insensitive regex or sub-string)') selection.add_option( '--max-downloads', dest='max_downloads', metavar='NUMBER', type=int, default=None, From 187a48aee29847664e0c4cd80fe90c32e1fb334b Mon Sep 17 00:00:00 2001 From: dirkf Date: Tue, 24 May 2022 15:33:00 +0100 Subject: [PATCH 21/26] [YouTube] Handle player c5a4daa1 with indirect n-function definition * resolves #30976 --- youtube_dl/extractor/youtube.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index ff6c7b0f8..9c62b8890 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1471,9 +1471,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor): nfunc, idx = re.match(target, nfunc_and_idx).group('nfunc', 'idx') if not idx: return nfunc + if int_or_none(idx) == 0: + real_nfunc = self._search_regex( + r'var %s\s*=\s*\[([a-zA-Z_$][\w$]*)\];' % (re.escape(nfunc), ), jscode, + 'Initial JS player n function alias ({nfunc}[{idx}])'.format(**locals())) + if real_nfunc: + return real_nfunc return self._parse_json(self._search_regex( r'var %s\s*=\s*(\[.+?\]);' % (re.escape(nfunc), ), jscode, - 'Initial JS player n function list ({nfunc}[{idx}])'.format(**locals())), nfunc, transform_source=js_to_json)[int(idx)] + 'Initial JS player n function name ({nfunc}[{idx}])'.format(**locals())), nfunc, transform_source=js_to_json)[int(idx)] def _extract_n_function(self, video_id, player_url): player_id = self._extract_player_info(player_url) @@ -1482,7 +1488,6 @@ class 
YoutubeIE(YoutubeBaseInfoExtractor): if func_code: jsi = JSInterpreter(func_code) else: - player_id = self._extract_player_info(player_url) jscode = self._get_player_code(video_id, player_url, player_id) funcname = self._extract_n_function_name(jscode) jsi = JSInterpreter(jscode) From 52c3751df722ab6f31f0229a415c7389a95c2307 Mon Sep 17 00:00:00 2001 From: dirkf Date: Sat, 28 May 2022 13:52:51 +0100 Subject: [PATCH 22/26] [utils] Enable ALPN in HTTPS to satisfy broken servers See https://github.com/yt-dlp/yt-dlp/issues/3878 --- youtube_dl/utils.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index e722eed58..4ff27db3d 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2292,12 +2292,30 @@ def formatSeconds(secs): def make_HTTPS_handler(params, **kwargs): + + # https://www.rfc-editor.org/info/rfc7301 + ALPN_PROTOCOLS = ['http/1.1'] + + def set_alpn_protocols(ctx): + # From https://github.com/yt-dlp/yt-dlp/commit/2c6dcb65fb612fc5bc5c61937bf438d3c473d8d0 + # Thanks @coletdjnz + # Some servers may (wrongly) reject requests if ALPN extension is not sent. 
See: + # https://github.com/python/cpython/issues/85140 + # https://github.com/yt-dlp/yt-dlp/issues/3878 + try: + ctx.set_alpn_protocols(ALPN_PROTOCOLS) + except (AttributeError, NotImplementedError): + # Python < 2.7.10, not ssl.HAS_ALPN + pass + opts_no_check_certificate = params.get('nocheckcertificate', False) if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH) + set_alpn_protocols(context) if opts_no_check_certificate: context.check_hostname = False context.verify_mode = ssl.CERT_NONE + try: return YoutubeDLHTTPSHandler(params, context=context, **kwargs) except TypeError: @@ -2313,6 +2331,7 @@ def make_HTTPS_handler(params, **kwargs): if opts_no_check_certificate else ssl.CERT_REQUIRED) context.set_default_verify_paths() + set_alpn_protocols(context) return YoutubeDLHTTPSHandler(params, context=context, **kwargs) From 04fd3289d30de3c99c7d2de34d555b050bc96d4d Mon Sep 17 00:00:00 2001 From: dirkf Date: Sat, 28 May 2022 13:54:32 +0100 Subject: [PATCH 23/26] [YouPorn] Improve `upload_date` extraction See https://github.com/yt-dlp/yt-dlp/issues/2701#issuecomment-1034341883 --- youtube_dl/extractor/youporn.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index 7084d3d12..31e8abb72 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import re @@ -137,9 +138,10 @@ class YouPornIE(InfoExtractor): r'(?s)]+class=["\']submitByLink["\'][^>]*>(.+?)', webpage, 'uploader', fatal=False) upload_date = unified_strdate(self._html_search_regex( - [r'UPLOADED:\s*([^<]+)', + (r'UPLOADED:\s*([^<]+)', r'Date\s+[Aa]dded:\s*([^<]+)', - r'(?s)]+class=["\']videoInfo(?:Date|Time)["\'][^>]*>(.+?)'], + r'''(?s)]+class=["']videoInfo(?:Date|Time)\b[^>]*>(.+?)''', + r'(?s)]*>Uploaded[^<]*\s*]*>(.+?)'), webpage, 'upload date', 
fatal=False)) age_limit = self._rta_search(webpage) From 9aa8e5340f3d5ece372b983f8e399277ca1f1fe4 Mon Sep 17 00:00:00 2001 From: LewdyCoder <88900506+LewdyCoder@users.noreply.github.com> Date: Mon, 30 May 2022 03:50:50 +0200 Subject: [PATCH 24/26] [Readme] Clarified extractor naming (#29799) * Exported usable extractors must be named `xxxxIE` Co-authored-by: dirkf --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 58ab3a4b8..ff40cef78 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -150,7 +150,7 @@ After you have ensured this site is distributing its content legally, you can fo # TODO more properties (see youtube_dl/extractor/common.py) } ``` -5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/extractors.py). +5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/extractors.py). This makes the extractor available for use, as long as the class ends with `IE`. 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with `only_matching` key in test's dict are not counted in. 7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/ytdl-org/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303). 
Add tests and code for as many as you want. 8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://flake8.pycqa.org/en/latest/index.html#quickstart): From 1baa0f5f6678c047624785dc9a3ab3cb44a72809 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Thu, 29 Apr 2021 04:56:09 +0530 Subject: [PATCH 25/26] [utils] Escape URL while sanitizing Closes #31008, #yt-dlp/263 While this fixes the issue in question, it does not try to address the root-cause of the problem Refer: 915f911e365736227e134ad654601443dbfd7ccb, f5fa042c82300218a2d07b95dd6b9c0756745db3 --- youtube_dl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 4ff27db3d..8aa2a43a2 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2151,7 +2151,7 @@ def sanitize_url(url): for mistake, fixup in COMMON_TYPOS: if re.match(mistake, url): return re.sub(mistake, fixup, url) - return url + return escape_url(url) def sanitized_Request(url, *args, **kwargs): From 530f4582d011cd94986cf4d233f9fb9263f72150 Mon Sep 17 00:00:00 2001 From: dirkf Date: Mon, 6 Jun 2022 19:29:48 +0100 Subject: [PATCH 26/26] [HRFernsehen] Back-port new extractor from yt-dlp Closes #26445, where this was originally proposed. 
--- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/hrfernsehen.py | 101 ++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+) create mode 100644 youtube_dl/extractor/hrfernsehen.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 452caeade..751fc38b6 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -479,6 +479,7 @@ from .hotstar import ( ) from .howcast import HowcastIE from .howstuffworks import HowStuffWorksIE +from .hrfernsehen import HRFernsehenIE from .hrti import ( HRTiIE, HRTiPlaylistIE, diff --git a/youtube_dl/extractor/hrfernsehen.py b/youtube_dl/extractor/hrfernsehen.py new file mode 100644 index 000000000..11b879dbd --- /dev/null +++ b/youtube_dl/extractor/hrfernsehen.py @@ -0,0 +1,101 @@ +# coding: utf-8 + +from __future__ import unicode_literals + +import json +import re + +from ..utils import ( + int_or_none, + unified_timestamp, + unescapeHTML +) +from .common import InfoExtractor + + +class HRFernsehenIE(InfoExtractor): + IE_NAME = 'hrfernsehen' + _VALID_URL = r'^https?://www\.(?:hr-fernsehen|hessenschau)\.de/.*,video-(?P<id>[0-9]{6})\.html' + + _TESTS = [{ + 'url': 'https://www.hessenschau.de/tv-sendung/hessenschau-vom-26082020,video-130546.html', + 'md5': '5c4e0ba94677c516a2f65a84110fc536', + 'info_dict': { + 'id': '130546', + 'ext': 'mp4', + 'description': 'Sturmtief Kirsten fegt über Hessen / Die Corona-Pandemie – eine Chronologie / ' + 'Sterbehilfe: Die Lage in Hessen / Miss Hessen leitet zwei eigene Unternehmen / ' + 'Pop-Up Museum zeigt Schwarze Unterhaltung und Black Music', + 'subtitles': {'de': [{ + 'url': 'https://hr-a.akamaihd.net/video/as/hessenschau/2020_08/hrLogo_200826200407_L385592_512x288-25p-500kbit.vtt' + }]}, + 'timestamp': 1598470200, + 'upload_date': '20200826', + 'thumbnail': 'https://www.hessenschau.de/tv-sendung/hs_ganz-1554~_t-1598465545029_v-16to9__medium.jpg', + 'title': 'hessenschau vom 26.08.2020' + } + }, { + 
'url': 'https://www.hr-fernsehen.de/sendungen-a-z/mex/sendungen/fair-und-gut---was-hinter-aldis-eigenem-guetesiegel-steckt,video-130544.html', + 'only_matching': True + }] + + _GEO_COUNTRIES = ['DE'] + + def extract_airdate(self, loader_data): + airdate_str = loader_data.get('mediaMetadata', {}).get('agf', {}).get('airdate') + + if airdate_str is None: + return None + + return unified_timestamp(airdate_str) + + def extract_formats(self, loader_data): + stream_formats = [] + for stream_obj in loader_data["videoResolutionLevels"]: + stream_format = { + 'format_id': str(stream_obj['verticalResolution']) + "p", + 'height': stream_obj['verticalResolution'], + 'url': stream_obj['url'], + } + + quality_information = re.search(r'([0-9]{3,4})x([0-9]{3,4})-([0-9]{2})p-([0-9]{3,4})kbit', + stream_obj['url']) + if quality_information: + stream_format['width'] = int_or_none(quality_information.group(1)) + stream_format['height'] = int_or_none(quality_information.group(2)) + stream_format['fps'] = int_or_none(quality_information.group(3)) + stream_format['tbr'] = int_or_none(quality_information.group(4)) + + stream_formats.append(stream_format) + + self._sort_formats(stream_formats) + return stream_formats + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._html_search_meta( + ['og:title', 'twitter:title', 'name'], webpage) + description = self._html_search_meta( + ['description'], webpage) + + loader_str = unescapeHTML(self._search_regex(r"data-new-hr-mediaplayer-loader='([^']*)'", webpage, "ardloader")) + loader_data = json.loads(loader_str) + + info = { + 'id': video_id, + 'title': title, + 'description': description, + 'formats': self.extract_formats(loader_data), + 'timestamp': self.extract_airdate(loader_data) + } + + if "subtitle" in loader_data: + info["subtitles"] = {"de": [{"url": loader_data["subtitle"]}]} + + thumbnails = list(set([t for t in loader_data.get("previewImageUrl", 
{}).values()])) + if len(thumbnails) > 0: + info["thumbnails"] = [{"url": t} for t in thumbnails] + + return info