From f640916de124206f9a8a397d44455ab917728f4a Mon Sep 17 00:00:00 2001 From: dirkf Date: Mon, 6 Feb 2023 16:19:21 +0000 Subject: [PATCH] [YouTube] Refresh compat/utils usage * import parse_qs() * import parse_qs in lazy_extractors (clears old TODO) * clean up old compiled lazy_extractors for Py2 * use update_url() --- devscripts/make_lazy_extractors.py | 10 ++++- test/test_execution.py | 12 +++--- youtube_dl/extractor/youtube.py | 61 +++++++++++------------------- 3 files changed, 39 insertions(+), 44 deletions(-) diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py index 878ae72b1..edc19183d 100644 --- a/devscripts/make_lazy_extractors.py +++ b/devscripts/make_lazy_extractors.py @@ -13,6 +13,11 @@ sys.path.insert(0, dirn(dirn((os.path.abspath(__file__))))) lazy_extractors_filename = sys.argv[1] if os.path.exists(lazy_extractors_filename): os.remove(lazy_extractors_filename) +# Py2: may be confused by leftover lazy_extractors.pyc +try: + os.remove(lazy_extractors_filename + 'c') +except OSError: + pass from youtube_dl.extractor import _ALL_CLASSES from youtube_dl.extractor.common import InfoExtractor, SearchInfoExtractor @@ -22,7 +27,10 @@ with open('devscripts/lazy_load_template.py', 'rt') as f: module_contents = [ module_template + '\n' + getsource(InfoExtractor.suitable) + '\n', - 'class LazyLoadSearchExtractor(LazyLoadExtractor):\n pass\n'] + 'class LazyLoadSearchExtractor(LazyLoadExtractor):\n pass\n', + # needed for suitable() methods of Youtube extractor (see #28780) + 'from youtube_dl.utils import parse_qs\n', +] ie_template = ''' class {name}({bases}): diff --git a/test/test_execution.py b/test/test_execution.py index 32948d93e..704e14612 100644 --- a/test/test_execution.py +++ b/test/test_execution.py @@ -40,14 +40,16 @@ class TestExecution(unittest.TestCase): self.assertFalse(stderr) def test_lazy_extractors(self): + lazy_extractors = 'youtube_dl/extractor/lazy_extractors.py' try: - subprocess.check_call([sys.executable, 'devscripts/make_lazy_extractors.py', 'youtube_dl/extractor/lazy_extractors.py'], cwd=rootDir, stdout=_DEV_NULL) + subprocess.check_call([sys.executable, 'devscripts/make_lazy_extractors.py', lazy_extractors], cwd=rootDir, stdout=_DEV_NULL) subprocess.check_call([sys.executable, 'test/test_all_urls.py'], cwd=rootDir, stdout=_DEV_NULL) finally: - try: - os.remove('youtube_dl/extractor/lazy_extractors.py') - except (IOError, OSError): - pass + for x in ['', 'c'] if sys.version_info[0] < 3 else ['']: + try: + os.remove(lazy_extractors + x) + except (IOError, OSError): + pass if __name__ == '__main__': diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 6c1cfe7f2..6c70a98d1 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -14,12 +14,11 @@ from ..compat import ( compat_chr, compat_HTTPError, compat_map as map, - compat_parse_qs, compat_str, + compat_urllib_parse, + compat_urllib_parse_parse_qs as compat_parse_qs, compat_urllib_parse_unquote_plus, - compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, - compat_urlparse, ) from ..jsinterp import JSInterpreter from ..utils import ( @@ -33,6 +32,7 @@ from ..utils import ( mimetype2ext, parse_codecs, parse_duration, + parse_qs, qualities, remove_start, smuggle_url, @@ -50,10 +50,6 @@ from ..utils import ( ) -def parse_qs(url): - return compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - - class YoutubeBaseInfoExtractor(InfoExtractor): """Provide base functions for Youtube extractors""" _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' @@ -636,6 +632,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}', 'duration': 142, 'uploader': 'The Witcher', + 'uploader_id': 'WitcherGame', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame', 'upload_date': '20140605', 'thumbnail': 'https://i.ytimg.com/vi/HtVdAasjOgU/maxresdefault.jpg', 'age_limit': 18, @@ -671,7 +669,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }, }, { - 'note': 'Age-gated video embedable only with clientScreen=EMBED', + 'note': 'Age-gated video embeddable only with clientScreen=EMBED', 'url': 'https://youtube.com/watch?v=Tq92D6wQ1mg', 'info_dict': { 'id': 'Tq92D6wQ1mg', @@ -1392,11 +1390,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): @classmethod def suitable(cls, url): - # Hack for lazy extractors until more generic solution is implemented - # (see #28780) - from .youtube import parse_qs - qs = parse_qs(url) - if qs.get('list', [None])[0]: + if parse_qs(url).get('list', [None])[0]: return False return super(YoutubeIE, cls).suitable(url) @@ -1546,7 +1540,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if player_url.startswith('//'): player_url = 'https:' + player_url elif not re.match(r'https?://', player_url): - player_url = compat_urlparse.urljoin( + player_url = compat_urllib_parse.urljoin( 'https://www.youtube.com', player_url) return player_url @@ -1628,9 +1622,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _unthrottle_format_urls(self, video_id, player_url, formats): for fmt in formats: - parsed_fmt_url = compat_urlparse.urlparse(fmt['url']) - qs = compat_urlparse.parse_qs(parsed_fmt_url.query) - n_param = qs.get('n') + parsed_fmt_url = compat_urllib_parse.urlparse(fmt['url']) + n_param = compat_parse_qs(parsed_fmt_url.query).get('n') if not n_param: continue n_param = n_param[-1] @@ -1638,9 +1631,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if n_response is None: # give up if descrambling failed break - qs['n'] = [n_response] - fmt['url'] = compat_urlparse.urlunparse( - parsed_fmt_url._replace(query=compat_urllib_parse_urlencode(qs, True))) + fmt['url'] = update_url( + parsed_fmt_url, query_update={'n': [n_response]}) # from yt-dlp, with tweaks def _extract_signature_timestamp(self, video_id, player_url, ytcfg=None, fatal=False): @@ -1669,20 +1661,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor): lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl'])) if not playback_url: return - parsed_playback_url = compat_urlparse.urlparse(playback_url) - qs = compat_urlparse.parse_qs(parsed_playback_url.query) # cpn generation algorithm is reverse engineered from base.js. # In fact it works even with dummy cpn. CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_' cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16))) - qs.update({ - 'ver': ['2'], - 'cpn': [cpn], - }) - playback_url = compat_urlparse.urlunparse( - parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True))) + playback_url = update_url( + playback_url, query_update={ + 'ver': ['2'], + 'cpn': [cpn], + }) self._download_webpage( playback_url, video_id, 'Marking watched', @@ -2075,9 +2064,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): thumbnails = [] for container in (video_details, microformat): - for thumbnail in (try_get( + for thumbnail in try_get( container, - lambda x: x['thumbnail']['thumbnails'], list) or []): + lambda x: x['thumbnail']['thumbnails'], list) or []: thumbnail_url = url_or_none(thumbnail.get('url')) if not thumbnail_url: continue @@ -3287,11 +3276,7 @@ class YoutubePlaylistIE(InfoExtractor): def suitable(cls, url): if YoutubeTabIE.suitable(url): return False - # Hack for lazy extractors until more generic solution is implemented - # (see #28780) - from .youtube import parse_qs - qs = parse_qs(url) - if qs.get('v', [None])[0]: + if parse_qs(url).get('v', [None])[0]: return False return super(YoutubePlaylistIE, cls).suitable(url) @@ -3430,9 +3415,9 @@ class YoutubeSearchURLIE(YoutubeBaseInfoExtractor): }] def _real_extract(self, url): - qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) - query = (qs.get('search_query') or qs.get('q'))[0] - params = qs.get('sp', ('',))[0] + qs = parse_qs(url) + query = (qs.get('search_query') or qs.get('q'))[-1] + params = qs.get('sp', ('',))[-1] return self.playlist_result(self._search_results(query, params), query, query)