mirror of https://github.com/ytdl-org/youtube-dl.git synced 2025-07-20 17:26:19 +00:00

Merge d7b502a7278097f68592dc5f6423141be7c69efb into 4d05f84325070c3f6fe2ed6096138757675469a4

dirkf 2024-06-27 06:36:55 +08:00 committed by GitHub
commit 0e08823ff3
4 changed files with 124 additions and 64 deletions

youtube_dl/compat.py

@@ -2448,12 +2448,12 @@ try:
 except ImportError:
     import BaseHTTPServer as compat_http_server

+# urllib.parse
 try:
     from urllib.parse import unquote_to_bytes as compat_urllib_parse_unquote_to_bytes
     from urllib.parse import unquote as compat_urllib_parse_unquote
     from urllib.parse import unquote_plus as compat_urllib_parse_unquote_plus
     from urllib.parse import urlencode as compat_urllib_parse_urlencode
-    from urllib.parse import parse_qs as compat_parse_qs
 except ImportError:  # Python 2
     _asciire = (compat_urllib_parse._asciire if hasattr(compat_urllib_parse, '_asciire')
                 else re.compile(r'([\x00-\x7f]+)'))
@@ -2543,60 +2543,80 @@ except ImportError:  # Python 2
         return compat_urllib_parse._urlencode(encode_elem(query), doseq=doseq)

-    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
-    # Python 2's version is apparently totally broken
-    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
-                   encoding='utf-8', errors='replace'):
-        qs, _coerce_result = qs, compat_str
-        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
-        r = []
-        for name_value in pairs:
-            if not name_value and not strict_parsing:
-                continue
-            nv = name_value.split('=', 1)
-            if len(nv) != 2:
-                if strict_parsing:
-                    raise ValueError('bad query field: %r' % (name_value,))
-                # Handle case of a control-name with no equal sign
-                if keep_blank_values:
-                    nv.append('')
-                else:
-                    continue
-            if len(nv[1]) or keep_blank_values:
-                name = nv[0].replace('+', ' ')
-                name = compat_urllib_parse_unquote(
-                    name, encoding=encoding, errors=errors)
-                name = _coerce_result(name)
-                value = nv[1].replace('+', ' ')
-                value = compat_urllib_parse_unquote(
-                    value, encoding=encoding, errors=errors)
-                value = _coerce_result(value)
-                r.append((name, value))
-        return r
-
-    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
-                        encoding='utf-8', errors='replace'):
-        parsed_result = {}
-        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
-                           encoding=encoding, errors=errors)
-        for name, value in pairs:
-            if name in parsed_result:
-                parsed_result[name].append(value)
-            else:
-                parsed_result[name] = [value]
-        return parsed_result
-
     setattr(compat_urllib_parse, '_urlencode',
             getattr(compat_urllib_parse, 'urlencode'))
     for name, fix in (
             ('unquote_to_bytes', compat_urllib_parse_unquote_to_bytes),
             ('parse_unquote', compat_urllib_parse_unquote),
             ('unquote_plus', compat_urllib_parse_unquote_plus),
-            ('urlencode', compat_urllib_parse_urlencode),
-            ('parse_qs', compat_parse_qs)):
+            ('urlencode', compat_urllib_parse_urlencode)):
         setattr(compat_urllib_parse, name, fix)
-
-compat_urllib_parse_parse_qs = compat_parse_qs
+finally:
+    try:
+        # arguments changed in 3.8 and 3.10
+        from urllib.parse import parse_qs as _parse_qs
+        _parse_qs('a=b', separator='&')
+        compat_parse_qs = _parse_qs
+    except (ImportError, TypeError):  # Python 2, < 3.10
+
+        # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
+        # Python 2's version is apparently totally broken
+        # Also use this implementation for Py < 3.10
+        # * support only default separator '&', not r'[&;]', like 3.10+
+        # * support max_num_fields, like 3.8+
+        def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
+                       encoding='utf-8', errors='replace',
+                       max_num_fields=None, separator='&'):
+            if not isinstance(separator, (compat_str, str)):
+                raise ValueError('Separator must be of type string or bytes')
+            # DoS protection, if anyone cares
+            if qs and max_num_fields is not None and qs.count(separator) >= max_num_fields:
+                raise ValueError('Too many fields')
+            _coerce_result = compat_str
+            r = []
+            for name_value in qs.split(separator):
+                if not name_value and not strict_parsing:
+                    continue
+                nv = name_value.split('=', 1)
+                if len(nv) != 2:
+                    if strict_parsing:
+                        raise ValueError('bad query field: %r' % (name_value,))
+                    # Handle case of a control-name with no equal sign
+                    if keep_blank_values:
+                        nv.append('')
+                    else:
+                        continue
+                if len(nv[1]) or keep_blank_values:
+                    name = nv[0].replace('+', ' ')
+                    name = compat_urllib_parse_unquote(
+                        name, encoding=encoding, errors=errors)
+                    name = _coerce_result(name)
+                    value = nv[1].replace('+', ' ')
+                    value = compat_urllib_parse_unquote(
+                        value, encoding=encoding, errors=errors)
+                    value = _coerce_result(value)
+                    r.append((name, value))
+            return r
+
+        def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
+                            encoding='utf-8', errors='replace',
+                            max_num_fields=None, separator='&'):
+            parsed_result = {}
+            pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
+                               encoding, errors, max_num_fields, separator)
+            for name, value in pairs:
+                if name in parsed_result:
+                    parsed_result[name].append(value)
+                else:
+                    parsed_result[name] = [value]
+            return parsed_result
+
+        for name, fix in (
+                ('parse_qs', compat_parse_qs),
+                ('parse_qsl', _parse_qsl)):
+            setattr(compat_urllib_parse, name, fix)
+
+compat_urllib_parse_parse_qs = compat_parse_qs

 try:
     from urllib.request import DataHandler as compat_urllib_request_DataHandler
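
Note on the detection logic above: urllib.parse.parse_qs gained max_num_fields in Python 3.8 and separator in Python 3.10, so the new finally: block probes the installed stdlib once at import time and only keeps the bundled backport when the probe fails. A minimal standalone sketch of the same probe pattern (the name parse_qs_compat is invented for illustration, and the fallback is deliberately simplified):

# Probe whether the stdlib parse_qs already accepts the 3.10+ signature;
# otherwise fall back to an implementation taking the same argument.
try:
    from urllib.parse import parse_qs as _parse_qs
    _parse_qs('a=b', separator='&')  # raises TypeError before Python 3.10
    parse_qs_compat = _parse_qs
except (ImportError, TypeError):  # Python 2, or Python 3 < 3.10
    def parse_qs_compat(qs, separator='&'):
        # Simplified stand-in: split only on `separator`, like 3.10+ does;
        # no keep_blank_values/strict_parsing/max_num_fields/unquoting here.
        result = {}
        for name_value in qs.split(separator):
            if not name_value:
                continue
            name, _, value = name_value.partition('=')
            result.setdefault(name, []).append(value)
        return result

print(parse_qs_compat('a=1&a=2&b=3'))  # {'a': ['1', '2'], 'b': ['3']}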

youtube_dl/extractor/bfi.py

@@ -4,7 +4,12 @@ from __future__ import unicode_literals
 import re

 from .common import InfoExtractor
-from ..utils import extract_attributes
+from ..utils import (
+    extract_attributes,
+    parse_qs,
+    remove_start,
+    smuggle_url,
+)


 class BFIPlayerIE(InfoExtractor):
@@ -12,26 +17,39 @@ class BFIPlayerIE(InfoExtractor):
     _VALID_URL = r'https?://player\.bfi\.org\.uk/[^/]+/film/watch-(?P<id>[\w-]+)-online'
     _TEST = {
         'url': 'https://player.bfi.org.uk/free/film/watch-computer-doctor-1974-online',
-        'md5': 'e8783ebd8e061ec4bc6e9501ed547de8',
+        'md5': '15598bdd6a413ce9363970754f054d76',
         'info_dict': {
             'id': 'htNnhlZjE60C9VySkQEIBtU-cNV1Xx63',
             'ext': 'mp4',
             'title': 'Computer Doctor',
             'description': 'md5:fb6c240d40c4dbe40428bdd62f78203b',
+            'timestamp': 1564424975,
+            'upload_date': '20190729',
+            'uploader_id': '6057949427001',
         },
-        'skip': 'BFI Player films cannot be played outside of the UK',
+        # 'skip': 'BFI Player films cannot be played outside of the UK',
     }
+    _BRIGHTCOVE_ACCOUNT_ID = '6057949427001'

     def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
-        entries = []
-        for player_el in re.findall(r'(?s)<[^>]+class="player"[^>]*>', webpage):
-            player_attr = extract_attributes(player_el)
-            ooyala_id = player_attr.get('data-video-id')
-            if not ooyala_id:
-                continue
-            entries.append(self.url_result(
-                'ooyala:' + ooyala_id, 'Ooyala',
-                ooyala_id, player_attr.get('data-label')))
-        return self.playlist_result(entries)
+
+        film_only = 'play-film' in parse_qs(url, keep_blank_values=True)
+
+        def entries():
+            for player_el in re.finditer(r'(?s)<video-js\b[^>]+>', webpage):
+                player_attr = extract_attributes(player_el.group(0))
+                bcv_id, account_id, player_id, embed = (
+                    player_attr.get(x) for x in ('data-ref-id', 'data-acid', 'data-pid', 'data-embed'))
+                if not bcv_id:
+                    continue
+                if film_only and player_attr.get('data-video-type') != 'film':
+                    continue
+                bc_url = 'brightcove:new:%s:%s:%s:video:ref:%s' % (
+                    account_id or self._BRIGHTCOVE_ACCOUNT_ID, player_id or 'default', embed or 'default', bcv_id)
+                yield self.url_result(smuggle_url(
+                    bc_url, {'referrer': url, 'force_videoid': remove_start(bcv_id, 'ref:')}), ie='BrightcoveNew', video_id=video_id)
+
+        return self.playlist_result(entries())
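
The rewrite above drops the defunct Ooyala embeds: it scrapes the <video-js> tag attributes and defers to brightcove:new via the compact internal URL form added in the next file. A rough standalone sketch of that hand-off, with an invented page fragment and a plain regex-built dict standing in for extract_attributes:

import re

# Invented fragment; the attribute names match those the extractor reads.
webpage = ('<video-js data-ref-id="winstanley" data-acid="6057949427001" '
           'data-pid="default" data-embed="default" data-video-type="film">')

for m in re.finditer(r'(?s)<video-js\b[^>]+>', webpage):
    attrs = dict(re.findall(r'([\w-]+)="([^"]*)"', m.group(0)))
    bcv_id = attrs.get('data-ref-id')
    if not bcv_id:
        continue
    bc_url = 'brightcove:new:%s:%s:%s:video:ref:%s' % (
        attrs.get('data-acid') or '6057949427001',
        attrs.get('data-pid') or 'default',
        attrs.get('data-embed') or 'default',
        bcv_id)
    print(bc_url)
# brightcove:new:6057949427001:default:default:video:ref:winstanley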

youtube_dl/extractor/brightcove.py

@@ -340,7 +340,7 @@ class BrightcoveLegacyIE(InfoExtractor):

 class BrightcoveNewIE(AdobePassIE):
     IE_NAME = 'brightcove:new'
-    _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*(?P<content_type>video|playlist)Id=(?P<video_id>\d+|ref:[^&]+)'
+    _VALID_URL = r'(?:brightcove:new|(?P<u>https?)):(?(u)//players\.brightcove\.net/)(?P<account_id>\d+)(?(u)/|:)(?P<player_id>[^/]+)(?(u)_|:)(?P<embed>[^/]+)(?(u)/index\.html\?.*|:)(?P<content_type>video|playlist)(?(u)Id=|:)(?P<video_id>\d+|ref:[^&]+)'
     _TESTS = [{
         'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001',
         'md5': 'c8100925723840d4b0d243f7025703be',
@@ -593,7 +593,7 @@ class BrightcoveNewIE(AdobePassIE):
             'ip_blocks': smuggled_data.get('geo_ip_blocks'),
         })

-        account_id, player_id, embed, content_type, video_id = re.match(self._VALID_URL, url).groups()
+        account_id, player_id, embed, content_type, video_id = re.match(self._VALID_URL, url).groups()[1:]

         policy_key_id = '%s_%s' % (account_id, player_id)
         policy_key = self._downloader.cache.load('brightcove', policy_key_id)
@@ -678,4 +678,4 @@ class BrightcoveNewIE(AdobePassIE):
             json_data.get('description'))

         return self._parse_brightcove_metadata(
-            json_data, video_id, headers=headers)
+            json_data, smuggled_data.get('force_videoid') or video_id, headers=headers)
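
The reworked _VALID_URL relies on a regex conditional: group u only participates when a real URL is matched, and each (?(u)X|Y) then selects the web-URL or internal separator accordingly, which is why _real_extract now takes groups()[1:] to skip that marker group. A quick standalone check that both accepted shapes parse to the same fields (pattern copied verbatim from the diff):

import re

_VALID_URL = r'(?:brightcove:new|(?P<u>https?)):(?(u)//players\.brightcove\.net/)(?P<account_id>\d+)(?(u)/|:)(?P<player_id>[^/]+)(?(u)_|:)(?P<embed>[^/]+)(?(u)/index\.html\?.*|:)(?P<content_type>video|playlist)(?(u)Id=|:)(?P<video_id>\d+|ref:[^&]+)'

for url in (
        'http://players.brightcove.net/929656772001/default_default/index.html?videoId=4463358922001',
        'brightcove:new:929656772001:default:default:video:4463358922001'):
    # groups()[0] is the 'u' marker ('http'/'https' or None); payload starts at 1
    print(re.match(_VALID_URL, url).groups()[1:])
# both lines print:
# ('929656772001', 'default', 'default', 'video', '4463358922001')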

youtube_dl/utils.py

@@ -2402,7 +2402,7 @@ class YoutubeDLError(Exception):
 class ExtractorError(YoutubeDLError):
     """Error during info extraction."""

-    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
+    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
         """ tb, if given, is the original traceback (so that it can be printed out).
         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
         """
@@ -2421,6 +2421,7 @@ class ExtractorError(YoutubeDLError):
         self.exc_info = sys.exc_info()  # preserve original exception
         self.cause = cause
         self.video_id = video_id
+        self.ie = ie

     def format_traceback(self):
         if self.traceback is None:
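
The new ie slot simply records which extractor raised the error, alongside video_id; nothing else in this diff consumes it yet. A hypothetical use (the video id and IE name below are invented, and the patched utils module is assumed importable):

from youtube_dl.utils import ExtractorError

try:
    raise ExtractorError('This video is DRM protected', expected=True,
                         video_id='abc123', ie='Example')  # invented values
except ExtractorError as e:
    print(e.video_id, e.ie)  # abc123 Example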
@@ -6561,3 +6562,24 @@ def join_nonempty(*values, **kwargs):
     if from_dict is not None:
         values = (traverse_obj(from_dict, variadic(v)) for v in values)
     return delim.join(map(compat_str, filter(None, values)))
+
+
+class classproperty(object):
+    """property access for class methods with optional caching"""
+    def __new__(cls, *args, **kwargs):
+        func = args[0] if len(args) > 0 else kwargs.get('func')
+        if not func:
+            return functools.partial(cls, *args, **kwargs)
+        return super(classproperty, cls).__new__(cls)
+
+    def __init__(self, func, cache=False):
+        functools.update_wrapper(self, func)
+        self.func = func
+        self._cache = {} if cache else None
+
+    def __get__(self, _, cls):
+        if self._cache is None:
+            return self.func(cls)
+        elif cls not in self._cache:
+            self._cache[cls] = self.func(cls)
+        return self._cache[cls]
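
For reference, the new classproperty behaves like a read-only property looked up on the class itself, and cache=True memoises the result per class; it presumes functools is already imported in utils.py (for update_wrapper and partial). A small usage sketch with invented classes, assuming classproperty is in scope:

class Uncached(object):
    @classproperty
    def ua(cls):
        # recomputed on every attribute access
        return 'Mozilla/5.0 (%s)' % cls.__name__

class Cached(object):
    @classproperty(cache=True)
    def ua(cls):
        print('computing once...')
        return 'Mozilla/5.0 (%s)' % cls.__name__

print(Uncached.ua)  # Mozilla/5.0 (Uncached)
print(Cached.ua)    # computing once... then Mozilla/5.0 (Cached)
print(Cached.ua)    # Mozilla/5.0 (Cached) -- cached, function not re-run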