Merge remote-tracking branch 'origin/master'

Conflicts: youtube_dl/YoutubeDL.py
2026-05-24 14:25:14 +00:00 · 2015-03-09 03:01:28 +01:00
parent d475b3384c dd7831fe94
commit dcca581967
13 changed files with 246 additions and 34 deletions
@@ -38,6 +38,7 @@ from youtube_dl.utils import (
    parse_iso8601,
    read_batch_urls,
    sanitize_filename,
    sanitize_path,
    shell_quote,
    smuggle_url,
    str_to_int,
@@ -131,6 +132,37 @@ class TestUtil(unittest.TestCase):
        self.assertEqual(sanitize_filename('_BD_eEpuzXw', is_id=True), '_BD_eEpuzXw')
        self.assertEqual(sanitize_filename('N0Y__7-UOdI', is_id=True), 'N0Y__7-UOdI')
    def test_sanitize_path(self):
        if sys.platform != 'win32':
            return
        self.assertEqual(sanitize_path('abc'), 'abc')
        self.assertEqual(sanitize_path('abc/def'), 'abc\\def')
        self.assertEqual(sanitize_path('abc\\def'), 'abc\\def')
        self.assertEqual(sanitize_path('abc|def'), 'abc#def')
        self.assertEqual(sanitize_path('<>:"|?*'), '#######')
        self.assertEqual(sanitize_path('C:/abc/def'), 'C:\\abc\\def')
        self.assertEqual(sanitize_path('C?:/abc/def'), 'C##\\abc\\def')
        self.assertEqual(sanitize_path('\\\\?\\UNC\\ComputerName\\abc'), '\\\\?\\UNC\\ComputerName\\abc')
        self.assertEqual(sanitize_path('\\\\?\\UNC/ComputerName/abc'), '\\\\?\\UNC\\ComputerName\\abc')
        self.assertEqual(sanitize_path('\\\\?\\C:\\abc'), '\\\\?\\C:\\abc')
        self.assertEqual(sanitize_path('\\\\?\\C:/abc'), '\\\\?\\C:\\abc')
        self.assertEqual(sanitize_path('\\\\?\\C:\\ab?c\\de:f'), '\\\\?\\C:\\ab#c\\de#f')
        self.assertEqual(sanitize_path('\\\\?\\C:\\abc'), '\\\\?\\C:\\abc')
        self.assertEqual(
            sanitize_path('youtube/%(uploader)s/%(autonumber)s-%(title)s-%(upload_date)s.%(ext)s'),
            'youtube\\%(uploader)s\\%(autonumber)s-%(title)s-%(upload_date)s.%(ext)s')
        self.assertEqual(
            sanitize_path('youtube/TheWreckingYard ./00001-Not bad, Especially for Free! (1987 Yamaha 700)-20141116.mp4.part'),
            'youtube\\TheWreckingYard #\\00001-Not bad, Especially for Free! (1987 Yamaha 700)-20141116.mp4.part')
        self.assertEqual(sanitize_path('abc/def...'), 'abc\\def..#')
        self.assertEqual(sanitize_path('abc.../def'), 'abc..#\\def')
        self.assertEqual(sanitize_path('abc.../def...'), 'abc..#\\def..#')
    def test_ordered_set(self):
        self.assertEqual(orderedSet([1, 1, 2, 3, 4, 4, 5, 6, 7, 3, 5]), [1, 2, 3, 4, 5, 6, 7])
        self.assertEqual(orderedSet([]), [])
@@ -61,6 +61,7 @@ from .utils import (
    render_table,
    SameFileError,
    sanitize_filename,
    sanitize_path,
    std_headers,
    subtitles_filename,
    takewhile_inclusive,
@@ -562,7 +563,7 @@ class YoutubeDL(object):
                                 if v is not None)
            template_dict = collections.defaultdict(lambda: 'NA', template_dict)
-            outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
+            outtmpl = sanitize_path(self.params.get('outtmpl', DEFAULT_OUTTMPL))
            tmpl = compat_expanduser(outtmpl)
            filename = tmpl % template_dict
            # Temporary fix for #4787
@@ -1261,7 +1262,7 @@ class YoutubeDL(object):
            return
        try:
-            dn = os.path.dirname(encodeFilename(filename))
+            dn = os.path.dirname(sanitize_path(encodeFilename(filename)))
            if dn and not os.path.exists(dn):
                os.makedirs(dn)
        except (OSError, IOError) as err:
@@ -281,7 +281,7 @@ class F4mFD(FileDownloader):
            boot_info = self._get_bootstrap_from_url(bootstrap_url)
        else:
            bootstrap_url = None
-            bootstrap = base64.b64decode(node.text)
+            bootstrap = base64.b64decode(node.text.encode('ascii'))
            boot_info = read_bootstrap_info(bootstrap)
        return (boot_info, bootstrap_url)
@@ -308,7 +308,7 @@ class F4mFD(FileDownloader):
        live = boot_info['live']
        metadata_node = media.find(_add_ns('metadata'))
        if metadata_node is not None:
-            metadata = base64.b64decode(metadata_node.text)
+            metadata = base64.b64decode(metadata_node.text.encode('ascii'))
        else:
            metadata = None
@@ -175,6 +175,7 @@ from .gameone import (
 from .gamespot import GameSpotIE
 from .gamestar import GameStarIE
 from .gametrailers import GametrailersIE
 from .gazeta import GazetaIE
 from .gdcvault import GDCVaultIE
 from .generic import GenericIE
 from .giantbomb import GiantBombIE
@@ -363,6 +364,7 @@ from .pbs import PBSIE
 from .phoenix import PhoenixIE
 from .photobucket import PhotobucketIE
 from .planetaplay import PlanetaPlayIE
 from .pladform import PladformIE
 from .played import PlayedIE
 from .playfm import PlayFMIE
 from .playvid import PlayvidIE
@@ -2,13 +2,12 @@
 from __future__ import unicode_literals
 import re
 import json
 from .common import InfoExtractor
 from ..utils import (
    ExtractorError,
    xpath_text,
    float_or_none,
    xpath_text,
 )
@@ -60,6 +59,24 @@ class AdultSwimIE(InfoExtractor):
            'title': 'American Dad - Putting Francine Out of Business',
            'description': 'Stan hatches a plan to get Francine out of the real estate business.Watch more American Dad on [adult swim].'
        },
    }, {
        'url': 'http://www.adultswim.com/videos/tim-and-eric-awesome-show-great-job/dr-steve-brule-for-your-wine/',
        'playlist': [
            {
                'md5': '3e346a2ab0087d687a05e1e7f3b3e529',
                'info_dict': {
                    'id': 'sY3cMUR_TbuE4YmdjzbIcQ-0',
                    'ext': 'flv',
                    'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine',
                    'description': 'Dr. Brule reports live from Wine Country with a special report on wines.  \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n',
                },
            }
        ],
        'info_dict': {
            'id': 'sY3cMUR_TbuE4YmdjzbIcQ',
            'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine',
            'description': 'Dr. Brule reports live from Wine Country with a special report on wines.  \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n',
        },
    }]
    @staticmethod
@@ -80,6 +97,7 @@ class AdultSwimIE(InfoExtractor):
            for video in collection.get('videos'):
                if video.get('slug') == slug:
                    return collection, video
        return None, None
    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
@@ -90,28 +108,30 @@ class AdultSwimIE(InfoExtractor):
        webpage = self._download_webpage(url, episode_path)
        # Extract the value of `bootstrappedData` from the Javascript in the page.
-        bootstrappedDataJS = self._search_regex(r'var bootstrappedData = ({.*});', webpage, episode_path)
+        bootstrapped_data = self._parse_json(self._search_regex(
-
+            r'var bootstrappedData = ({.*});', webpage, 'bootstraped data'), episode_path)
        try:
            bootstrappedData = json.loads(bootstrappedDataJS)
        except ValueError as ve:
            errmsg = '%s: Failed to parse JSON ' % episode_path
            raise ExtractorError(errmsg, cause=ve)
        # Downloading videos from a /videos/playlist/ URL needs to be handled differently.
        # NOTE: We are only downloading one video (the current one) not the playlist
        if is_playlist:
-            collections = bootstrappedData['playlists']['collections']
+            collections = bootstrapped_data['playlists']['collections']
            collection = self.find_collection_by_linkURL(collections, show_path)
            video_info = self.find_video_info(collection, episode_path)
            show_title = video_info['showTitle']
            segment_ids = [video_info['videoPlaybackID']]
        else:
-            collections = bootstrappedData['show']['collections']
+            collections = bootstrapped_data['show']['collections']
            collection, video_info = self.find_collection_containing_video(collections, episode_path)
-            show = bootstrappedData['show']
+            # Video wasn't found in the collections, let's try `slugged_video`.
            if video_info is None:
                if bootstrapped_data.get('slugged_video', {}).get('slug') == episode_path:
                    video_info = bootstrapped_data['slugged_video']
                else:
                    raise ExtractorError('Unable to find video info')
            show = bootstrapped_data['show']
            show_title = show['title']
            segment_ids = [clip['videoPlaybackID'] for clip in video_info['clips']]
@@ -41,7 +41,7 @@ class BreakIE(InfoExtractor):
            'tbr': media['bitRate'],
            'width': media['width'],
            'height': media['height'],
-        } for media in info['media']]
+        } for media in info['media'] if media.get('mediaPurpose') == 'play']
        if not formats:
            formats.append({
@@ -0,0 +1,38 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import re
 from .common import InfoExtractor
 class GazetaIE(InfoExtractor):
    """Extractor for gazeta.ru video pages.

    The page itself is not parsed for media: its ``?p=embed`` variant is
    downloaded, the player's ``data-id`` is read from it and extraction is
    handed over to the EaglePlatform extractor.
    """
    # The id character class lists '-' last so it is unambiguously a literal;
    # it matches the same characters as before (letters, digits, '-', '_', '.').
    _VALID_URL = r'(?P<url>https?://(?:www\.)?gazeta\.ru/(?:[^/]+/)?video/(?:(?:main|\d{4}/\d{2}/\d{2})/)?(?P<id>[A-Za-z0-9_.-]+)\.s?html)'
    _TESTS = [{
        'url': 'http://www.gazeta.ru/video/main/zadaite_vopros_vladislavu_yurevichu.shtml',
        'md5': 'd49c9bdc6e5a7888f27475dc215ee789',
        'info_dict': {
            'id': '205566',
            'ext': 'mp4',
            'title': '«70–80 процентов гражданских в Донецке на грани голода»',
            'description': 'md5:38617526050bd17b234728e7f9620a71',
            # Raw literal: '\.' is a regex escape, not a string escape.
            'thumbnail': r're:^https?://.*\.jpg',
        },
    }, {
        'url': 'http://www.gazeta.ru/lifestyle/video/2015/03/08/master-klass_krasivoi_byt._delaem_vesennii_makiyazh.shtml',
        'only_matching': True,
    }]
    def _real_extract(self, url):
        """Resolve a gazeta.ru video URL to an EaglePlatform URL result.

        The display id and the canonical page URL both come from the
        ``_VALID_URL`` match groups.
        """
        mobj = re.match(self._VALID_URL, url)
        display_id = mobj.group('id')
        # The embed variant of the page carries the player markup.
        embed_url = '%s?p=embed' % mobj.group('url')
        embed_page = self._download_webpage(
            embed_url, display_id, 'Downloading embed page')
        # The EaglePlatform video id is stored in the player div's data-id.
        video_id = self._search_regex(
            r'<div[^>]*?class="eagleplayer"[^>]*?data-id="([^"]+)"', embed_page, 'video id')
        return self.url_result(
            'eagleplatform:gazeta.media.eagleplatform.com:%s' % video_id, 'EaglePlatform')
@@ -596,6 +596,19 @@ class GenericIE(InfoExtractor):
                'view_count': int,
            },
        },
        # Pladform embed
        {
            'url': 'http://muz-tv.ru/kinozal/view/7400/',
            'info_dict': {
                'id': '100183293',
                'ext': 'mp4',
                'title': 'Тайны перевала Дятлова • Тайна перевала Дятлова 1 серия 2 часть',
                'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века',
                'thumbnail': 're:^https?://.*\.jpg$',
                'duration': 694,
                'age_limit': 0,
            },
        },
        # RSS feed with enclosure
        {
            'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
@@ -1193,6 +1206,12 @@ class GenericIE(InfoExtractor):
        if mobj is not None:
            return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform')
        # Look for Pladform embeds
        mobj = re.search(
            r'<iframe[^>]+src="(?P<url>https?://out\.pladform\.ru/player\?.+?)"', webpage)
        if mobj is not None:
            return self.url_result(mobj.group('url'), 'Pladform')
        def check_video(vurl):
            if YoutubeIE.suitable(vurl):
                return True
@@ -0,0 +1,90 @@
 # coding: utf-8
 from __future__ import unicode_literals
 from .common import InfoExtractor
 from ..utils import (
    ExtractorError,
    int_or_none,
    xpath_text,
    qualities,
 )
 class PladformIE(InfoExtractor):
    """Extractor for Pladform videos.

    Accepts player URLs on out.pladform.ru and static.pladform.ru that carry
    a ``videoid`` query parameter, and catalog URLs on video.pladform.ru.
    """
    _VALID_URL = r'''(?x)
                    https?://
                        (?:
                            (?:
                                out\.pladform\.ru/player|
                                static\.pladform\.ru/player\.swf
                            )
                            \?.*\bvideoid=|
                            video\.pladform\.ru/catalog/video/videoid/
                        )
                        (?P<id>\d+)
                    '''
    _TESTS = [{
        # http://muz-tv.ru/kinozal/view/7400/
        'url': 'http://out.pladform.ru/player?pl=24822&videoid=100183293',
        'md5': '61f37b575dd27f1bb2e1854777fe31f4',
        'info_dict': {
            'id': '100183293',
            'ext': 'mp4',
            'title': 'Тайны перевала Дятлова • Тайна перевала Дятлова 1 серия 2 часть',
            'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века',
            # Raw literal: '\.' is a regex escape, not a string escape.
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 694,
            'age_limit': 0,
        },
    }, {
        'url': 'http://static.pladform.ru/player.swf?pl=21469&videoid=100183293&vkcid=0',
        'only_matching': True,
    }, {
        'url': 'http://video.pladform.ru/catalog/video/videoid/100183293/vkcid/0',
        'only_matching': True,
    }]
    def _real_extract(self, url):
        """Build the info dict for a Pladform video.

        Formats, duration and age limit are read from the ``getVideo`` XML
        document. Title, description and thumbnail are taken from the catalog
        web page, with the XML used as a fallback for title and thumbnail.

        Raises ExtractorError when the XML root element is ``error``.
        """
        video_id = self._match_id(url)
        video = self._download_xml(
            'http://out.pladform.ru/getVideo?pl=1&videoid=%s' % video_id,
            video_id)
        # The service reports failures as an <error> root element.
        if video.tag == 'error':
            raise ExtractorError(
                '%s returned error: %s' % (self.IE_NAME, video.text),
                expected=True)
        # NOTE(review): presumably ranks 'ld' < 'sd' < 'hd' by position;
        # confirm against utils.qualities.
        quality = qualities(('ld', 'sd', 'hd'))
        formats = [{
            'url': src.text,
            'format_id': src.get('quality'),
            'quality': quality(src.get('quality')),
        } for src in video.findall('./src')]
        self._sort_formats(formats)
        webpage = self._download_webpage(
            'http://video.pladform.ru/catalog/video/videoid/%s' % video_id,
            video_id)
        # Prefer the page's Open Graph title; the XML <title> lookup is fatal.
        title = self._og_search_title(webpage, fatal=False) or xpath_text(
            video, './/title', 'title', fatal=True)
        description = self._search_regex(
            r'</h3>\s*<p>([^<]+)</p>', webpage, 'description', fatal=False)
        thumbnail = self._og_search_thumbnail(webpage) or xpath_text(
            video, './/cover', 'cover')
        duration = int_or_none(xpath_text(video, './/time', 'duration'))
        age_limit = int_or_none(xpath_text(video, './/age18', 'age limit'))
        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'age_limit': age_limit,
            'formats': formats,
        }
@@ -53,10 +53,10 @@ class TeamcocoIE(InfoExtractor):
        embed = self._download_webpage(
            embed_url, video_id, 'Downloading embed page')
-        encoded_data = self._search_regex(
+        player_data = self._parse_json(self._search_regex(
-            r'"preload"\s*:\s*"([^"]+)"', embed, 'encoded data')
+            r'Y\.Ginger\.Module\.Player\((\{.*?\})\);', embed, 'player data'), video_id)
        data = self._parse_json(
-            base64.b64decode(encoded_data.encode('ascii')).decode('utf-8'), video_id)
+            base64.b64decode(player_data['preload'].encode('ascii')).decode('utf-8'), video_id)
        formats = []
        get_quality = qualities(['500k', '480p', '1000k', '720p', '1080p'])
@@ -358,13 +358,12 @@ class TwitchStreamIE(TwitchBaseIE):
            'p': random.randint(1000000, 10000000),
            'player': 'twitchweb',
            'segment_preference': '4',
-            'sig': access_token['sig'],
+            'sig': access_token['sig'].encode('utf-8'),
-            'token': access_token['token'],
+            'token': access_token['token'].encode('utf-8'),
        }
        formats = self._extract_m3u8_formats(
            '%s/api/channel/hls/%s.m3u8?%s'
-            % (self._USHER_BASE, channel_id, compat_urllib_parse.urlencode(query).encode('utf-8')),
+            % (self._USHER_BASE, channel_id, compat_urllib_parse.urlencode(query)),
            channel_id, 'mp4')
        self._prefer_source(formats)
@@ -41,13 +41,10 @@ class VidmeIE(InfoExtractor):
        duration = float_or_none(self._html_search_regex(
            r'data-duration="([^"]+)"', webpage, 'duration', fatal=False))
        view_count = str_to_int(self._html_search_regex(
-            r'<span class="video_views">\s*([\d,\.]+)\s*plays?', webpage, 'view count', fatal=False))
+            r'<(?:li|span) class="video_views">\s*([\d,\.]+)\s*plays?', webpage, 'view count', fatal=False))
        like_count = str_to_int(self._html_search_regex(
            r'class="score js-video-vote-score"[^>]+data-score="([\d,\.\s]+)">',
            webpage, 'like count', fatal=False))
        comment_count = str_to_int(self._html_search_regex(
            r'class="js-comment-count"[^>]+data-count="([\d,\.\s]+)">',
            webpage, 'comment count', fatal=False))
        return {
            'id': video_id,
@@ -61,5 +58,4 @@ class VidmeIE(InfoExtractor):
            'duration': duration,
            'view_count': view_count,
            'like_count': like_count,
            'comment_count': comment_count,
        }
@@ -252,15 +252,12 @@ def sanitize_open(filename, open_mode):
            raise
        # In case of error, try to remove win32 forbidden chars
-        alt_filename = os.path.join(
+        alt_filename = sanitize_path(filename)
            re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
            for path_part in os.path.split(filename)
        )
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
-            stream = open(encodeFilename(filename), open_mode)
+            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
@@ -311,6 +308,24 @@ def sanitize_filename(s, restricted=False, is_id=False):
    return result
 def sanitize_path(s):
    """Sanitize and normalize a path on Windows.

    On every platform other than win32 the path is returned unchanged.

    On win32 the path is normalized with os.path.normpath() and a leading
    drive or UNC prefix is kept verbatim. In every remaining path component
    the characters / < > : " | ? * and the backslash, as well as a trailing
    dot, are replaced with '#'. The special components '.' and '..' are
    kept as they are so that relative paths keep their meaning.
    """
    if sys.platform != 'win32':
        return s
    drive, _ = os.path.splitdrive(s)
    # os.path.splitunc() was removed in Python 3.7, so only consult it where
    # it still exists; otherwise rely on what splitdrive() reports.
    splitunc = getattr(os.path, 'splitunc', None)
    unc = splitunc(s)[0] if splitunc is not None else ''
    unc_or_drive = unc or drive
    norm_path = os.path.normpath(remove_start(s, unc_or_drive)).split(os.path.sep)
    if unc_or_drive:
        # Drop the first component left after the prefix was stripped.
        norm_path.pop(0)
    sanitized_path = [
        # '.' and '..' are navigation components, not names to sanitize.
        path_part if path_part in ('.', '..')
        else re.sub(r'(?:[/<>:"\|\\?\*]|\.$)', '#', path_part)
        for path_part in norm_path]
    if unc_or_drive:
        sanitized_path.insert(0, unc_or_drive + os.path.sep)
    return os.path.join(*sanitized_path)
 def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []