From b14b33a2e9c35d41a1fd16cf53afae612392bf44 Mon Sep 17 00:00:00 2001 From: dirkf Date: Mon, 31 Jan 2022 04:28:54 +0000 Subject: [PATCH] [YouTube] Bypass age-gating for certain restricted videos * Use TVHTML5_SIMPLY_EMBEDDED_PLAYER client * Also add and fix tests * Introduce and use new utility function `update_url()` --- youtube_dl/extractor/youtube.py | 202 +++++++++++++++++++++++++------- youtube_dl/utils.py | 11 ++ 2 files changed, 168 insertions(+), 45 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 28fdb086a..65428528d 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -42,6 +42,7 @@ from ..utils import ( unescapeHTML, unified_strdate, unsmuggle_url, + update_url, update_url_query, url_or_none, urlencode_postdata, @@ -286,15 +287,18 @@ class YoutubeBaseInfoExtractor(InfoExtractor): _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;' _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|= 18): + + self.report_age_confirmation() + + # Thanks: https://github.com/yt-dlp/yt-dlp/pull/3233 + pb_context = {'html5Preference': 'HTML5_PREF_WANTS'} + query = { + 'playbackContext': {'contentPlaybackContext': {'html5Preference': 'HTML5_PREF_WANTS'}}, + 'contentCheckOk': True, + 'racyCheckOk': True, + 'context': { + 'client': {'clientName': 'TVHTML5_SIMPLY_EMBEDDED_PLAYER', 'clientVersion': '2.0', 'hl': 'en', 'clientScreen': 'EMBED'}, + 'thirdParty': {'embedUrl': 'https://google.com'}, + }, + 'videoId': video_id, + } + headers = { + 'X-YouTube-Client-Name': '85', + 'X-YouTube-Client-Version': '2.0', + 'Origin': 'https://www.youtube.com' + } + + video_info = self._call_api('player', query, video_id, fatal=False, headers=headers) + age_gate_status = get_playability_status(video_info) + if age_gate_status.get('status') == 'OK': + player_response = video_info + playability_status = age_gate_status trailer_video_id = try_get( playability_status, @@ -1932,12 +2048,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): for thumbnail in (try_get( container, lambda x: x['thumbnail']['thumbnails'], list) or []): - thumbnail_url = thumbnail.get('url') + thumbnail_url = url_or_none(thumbnail.get('url')) if not thumbnail_url: continue thumbnails.append({ 'height': int_or_none(thumbnail.get('height')), - 'url': thumbnail_url, + 'url': update_url(thumbnail_url, query=None, fragment=None), 'width': int_or_none(thumbnail.get('width')), }) if thumbnails: @@ -2142,6 +2258,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): sbr_tooltip = try_get( vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip']) if sbr_tooltip: + # however dislike_count was hidden by YT, as if there could ever be dislikable content on YT like_count, dislike_count = sbr_tooltip.split(' / ') info.update({ 'like_count': str_to_int(like_count), @@ -2411,7 +2528,6 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'tags': list, 'view_count': int, 'like_count': int, - 'dislike_count': int, }, 'params': { 'skip_download': True, @@ -2438,7 +2554,6 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'categories': ['News & Politics'], 'tags': list, 'like_count': int, - 'dislike_count': int, }, 'params': { 'skip_download': True, @@ -2458,7 +2573,6 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'categories': ['News & Politics'], 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'], 'like_count': int, - 'dislike_count': int, }, 'params': { 'skip_download': True, @@ -3043,8 +3157,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): def _real_extract(self, url): item_id = self._match_id(url) - url = compat_urlparse.urlunparse( - compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com')) + url = update_url(url, netloc='www.youtube.com') # Handle both video/playlist URLs qs = parse_qs(url) video_id = qs.get('v', [None])[0] @@ -3178,7 +3291,6 @@ class YoutubeYtBeIE(InfoExtractor): 'categories': ['Nonprofits & Activism'], 'tags': list, 'like_count': int, - 'dislike_count': int, }, 'params': { 'noplaylist': True, diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index e3c3ccff9..d5cc6386d 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -4121,6 +4121,17 @@ def update_url_query(url, query): query=compat_urllib_parse_urlencode(qs, True))) +def update_url(url, **kwargs): + """Replace URL components specified by kwargs + url: compat_str or parsed URL tuple + returns: compat_str""" + if not kwargs: + return compat_urlparse.urlunparse(url) if isinstance(url, tuple) else url + if not isinstance(url, tuple): + url = compat_urlparse.urlparse(url) + return compat_urlparse.urlunparse(url._replace(**kwargs)) + + def update_Request(req, url=None, data=None, headers={}, query={}): req_headers = req.headers.copy() req_headers.update(headers)