release 2014.09.29.2

[pbs] Add support for series/jwplayer type video (Fixes #3849 )
release 2014.09.29.1
2026-06-12 07:30:09 +00:00 · 2014-09-29 04:49:11 +02:00 · 2014-09-29 04:48:50 +02:00 · 2014-09-29 02:04:28 +02:00 · 2014-09-29 02:04:16 +02:00 · 2014-09-29 01:51:53 +02:00
21 changed files with 526 additions and 102 deletions
@@ -442,8 +442,6 @@ If you want to add support for a new site, you can follow this quick list (assum
    # coding: utf-8
    from __future__ import unicode_literals

-    import re
-
    from .common import InfoExtractor


@@ -451,7 +449,7 @@ If you want to add support for a new site, you can follow this quick list (assum
        _VALID_URL = r'https?://(?:www\.)?yourextractor\.com/watch/(?P<id>[0-9]+)'
        _TEST = {
            'url': 'http://yourextractor.com/watch/42',
-            'md5': 'TODO: md5 sum of the first 10KiB of the video file',
+            'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)',
            'info_dict': {
                'id': '42',
                'ext': 'mp4',
@@ -466,8 +464,7 @@ If you want to add support for a new site, you can follow this quick list (assum
        }

        def _real_extract(self, url):
-            mobj = re.match(self._VALID_URL, url)
-            video_id = mobj.group('id')
+            video_id = self._match_id(url)

            # TODO more code goes here, for example ...
            webpage = self._download_webpage(url, video_id)
@@ -139,7 +139,9 @@ def generator(test_case):

            if is_playlist:
                self.assertEqual(res_dict['_type'], 'playlist')
+                self.assertTrue('entries' in res_dict)
                expect_info_dict(self, test_case.get('info_dict', {}), res_dict)
+
            if 'playlist_mincount' in test_case:
                assertGreaterEqual(
                    self,
@@ -188,7 +190,7 @@ def generator(test_case):
                expect_info_dict(self, tc.get('info_dict', {}), info_dict)
        finally:
            try_rm_tcs_files()
-            if is_playlist and res_dict is not None:
+            if is_playlist and res_dict is not None and res_dict.get('entries'):
                # Remove all other files that may have been extracted if the
                # extractor returns full results even with extract_flat
                res_tcs = [{'info_dict': e} for e in res_dict['entries']]
@@ -22,7 +22,8 @@ from youtube_dl.utils import (
    fix_xml_ampersands,
    get_meta_content,
    orderedSet,
-    PagedList,
+    OnDemandPagedList,
+    InAdvancePagedList,
    parse_duration,
    read_batch_urls,
    sanitize_filename,
@@ -246,10 +247,14 @@ class TestUtil(unittest.TestCase):
                for i in range(firstid, upto):
                    yield i

-            pl = PagedList(get_page, pagesize)
+            pl = OnDemandPagedList(get_page, pagesize)
            got = pl.getslice(*sliceargs)
            self.assertEqual(got, expected)

+            iapl = InAdvancePagedList(get_page, size // pagesize + 1, pagesize)
+            got = iapl.getslice(*sliceargs)
+            self.assertEqual(got, expected)
+
        testPL(5, 2, (), [0, 1, 2, 3, 4])
        testPL(5, 2, (1,), [1, 2, 3, 4])
        testPL(5, 2, (2,), [2, 3, 4])
@@ -135,12 +135,14 @@ from .gametrailers import GametrailersIE
 from .gdcvault import GDCVaultIE
 from .generic import GenericIE
 from .godtube import GodTubeIE
+from .golem import GolemIE
 from .googleplus import GooglePlusIE
 from .googlesearch import GoogleSearchIE
 from .gorillavid import GorillaVidIE
 from .goshgay import GoshgayIE
 from .grooveshark import GroovesharkIE
 from .hark import HarkIE
+from .heise import HeiseIE
 from .helsinki import HelsinkiIE
 from .hentaistigma import HentaiStigmaIE
 from .hornbunny import HornBunnyIE
@@ -272,6 +274,7 @@ from .parliamentliveuk import ParliamentLiveUKIE
 from .patreon import PatreonIE
 from .pbs import PBSIE
 from .photobucket import PhotobucketIE
+from .played import PlayedIE
 from .playfm import PlayFMIE
 from .playvid import PlayvidIE
 from .podomatic import PodomaticIE
@@ -368,7 +371,10 @@ from .thisav import ThisAVIE
 from .tinypic import TinyPicIE
 from .tlc import TlcIE, TlcDeIE
 from .tnaflix import TNAFlixIE
-from .thvideo import THVideoIE
+from .thvideo import (
+    THVideoIE,
+    THVideoPlaylistIE
+)
 from .toutv import TouTvIE
 from .toypics import ToypicsUserIE, ToypicsIE
 from .traileraddict import TrailerAddictIE
@@ -409,11 +415,12 @@ from .videoweed import VideoWeedIE
 from .vidme import VidmeIE
 from .vimeo import (
    VimeoIE,
-    VimeoChannelIE,
-    VimeoUserIE,
    VimeoAlbumIE,
+    VimeoChannelIE,
    VimeoGroupsIE,
+    VimeoLikesIE,
    VimeoReviewIE,
+    VimeoUserIE,
    VimeoWatchLaterIE,
 )
 from .vimple import VimpleIE
@@ -8,8 +8,6 @@ from ..utils import (
    determine_ext,
    ExtractorError,
    qualities,
-    compat_urllib_parse_urlparse,
-    compat_urllib_parse,
    int_or_none,
    parse_duration,
    unified_strdate,
@@ -22,6 +22,7 @@ from ..utils import (
    clean_html,
    compiled_regex_type,
    ExtractorError,
+    float_or_none,
    int_or_none,
    RegexNotFoundError,
    sanitize_filename,
@@ -720,6 +721,28 @@ class InfoExtractor(object):
        now_str = now.strftime("%Y-%m-%d %H:%M")
        return name + ' ' + now_str

+    def _int(self, v, name, fatal=False, **kwargs):
+        res = int_or_none(v, **kwargs)
+        if 'get_attr' in kwargs:
+            print(getattr(v, kwargs['get_attr']))
+        if res is None:
+            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
+            if fatal:
+                raise ExtractorError(msg)
+            else:
+                self._downloader.report_warning(msg)
+        return res
+
+    def _float(self, v, name, fatal=False, **kwargs):
+        res = float_or_none(v, **kwargs)
+        if res is None:
+            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
+            if fatal:
+                raise ExtractorError(msg)
+            else:
+                self._downloader.report_warning(msg)
+        return res
+

 class SearchInfoExtractor(InfoExtractor):
    """
@@ -397,12 +397,6 @@ class GenericIE(InfoExtractor):
        },
    ]

-    def report_download_webpage(self, video_id):
-        """Report webpage download."""
-        if not self._downloader.params.get('test', False):
-            self._downloader.report_warning('Falling back on generic information extractor.')
-        super(GenericIE, self).report_download_webpage(video_id)
-
    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen('[redirect] Following redirect to %s' % new_url)
@@ -502,6 +496,7 @@ class GenericIE(InfoExtractor):

        url, smuggled_data = unsmuggle_url(url)
        force_videoid = None
+        is_intentional = smuggled_data and smuggled_data.get('to_generic')
        if smuggled_data and 'force_videoid' in smuggled_data:
            force_videoid = smuggled_data['force_videoid']
            video_id = force_videoid
@@ -544,6 +539,9 @@ class GenericIE(InfoExtractor):
                    'upload_date': upload_date,
                }

+        if not self._downloader.params.get('test', False) and not is_intentional:
+            self._downloader.report_warning('Falling back on generic information extractor.')
+
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError:
@@ -0,0 +1,71 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_urlparse,
+    determine_ext,
+)
+
+
+class GolemIE(InfoExtractor):
+    _VALID_URL = r'^https?://video\.golem\.de/.+?/(?P<id>.+?)/'
+    _TEST = {
+        'url': 'http://video.golem.de/handy/14095/iphone-6-und-6-plus-test.html',
+        'md5': 'c1a2c0a3c863319651c7c992c5ee29bf',
+        'info_dict': {
+            'id': '14095',
+            'format_id': 'high',
+            'ext': 'mp4',
+            'title': 'iPhone 6 und 6 Plus - Test',
+            'duration': 300.44,
+            'filesize': 65309548,
+        }
+    }
+
+    _PREFIX = 'http://video.golem.de'
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        config = self._download_xml(
+            'https://video.golem.de/xml/{0}.xml'.format(video_id), video_id)
+
+        info = {
+            'id': video_id,
+            'title': config.findtext('./title', 'golem'),
+            'duration': self._float(config.findtext('./playtime'), 'duration'),
+        }
+
+        formats = []
+        for e in config.findall('./*[url]'):
+            url = e.findtext('./url')
+            if not url:
+                self._downloader.report_warning(
+                    "{0}: url: empty, skipping".format(e.tag))
+                continue
+
+            formats.append({
+                'format_id': e.tag,
+                'url': compat_urlparse.urljoin(self._PREFIX, url),
+                'height': self._int(e.get('height'), 'height'),
+                'width': self._int(e.get('width'), 'width'),
+                'filesize': self._int(e.findtext('filesize'), 'filesize'),
+                'ext': determine_ext(e.findtext('./filename')),
+            })
+        self._sort_formats(formats)
+        info['formats'] = formats
+
+        thumbnails = []
+        for e in config.findall('.//teaser[url]'):
+            url = e.findtext('./url')
+            if not url:
+                continue
+            thumbnails.append({
+                'url': compat_urlparse.urljoin(self._PREFIX, url),
+                'width': self._int(e.get('width'), 'thumbnail width'),
+                'height': self._int(e.get('height'), 'thumbnail height'),
+            })
+        info['thumbnails'] = thumbnails
+
+        return info
@@ -0,0 +1,81 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    get_meta_content,
+    parse_iso8601,
+)
+
+
+class HeiseIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+        https?://(?:www\.)?heise\.de/video/artikel/
+        .+?(?P<id>[0-9]+)\.html(?:$|[?#])
+    '''
+    _TEST = {
+        'url': (
+            'http://www.heise.de/video/artikel/Podcast-c-t-uplink-3-3-Owncloud-Tastaturen-Peilsender-Smartphone-2404147.html'
+        ),
+        'md5': 'ffed432483e922e88545ad9f2f15d30e',
+        'info_dict': {
+            'id': '2404147',
+            'ext': 'mp4',
+            'title': (
+                "Podcast: c't uplink 3.3 – Owncloud / Tastaturen / Peilsender Smartphone"
+            ),
+            'format_id': 'mp4_720',
+            'timestamp': 1411812600,
+            'upload_date': '20140927',
+            'description': 'In uplink-Episode 3.3 geht es darum, wie man sich von Cloud-Anbietern emanzipieren kann, worauf man beim Kauf einer Tastatur achten sollte und was Smartphones über uns verraten.',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+        json_url = self._search_regex(
+            r'json_url:\s*"([^"]+)"', webpage, 'json URL')
+        config = self._download_json(json_url, video_id)
+
+        info = {
+            'id': video_id,
+            'thumbnail': config.get('poster'),
+            'timestamp': parse_iso8601(get_meta_content('date', webpage)),
+            'description': self._og_search_description(webpage),
+        }
+
+        title = get_meta_content('fulltitle', webpage)
+        if title:
+            info['title'] = title
+        elif config.get('title'):
+            info['title'] = config['title']
+        else:
+            info['title'] = self._og_search_title(webpage)
+
+        formats = []
+        for t, rs in config['formats'].items():
+            if not rs or not hasattr(rs, 'items'):
+                self._downloader.report_warning(
+                    'formats: {0}: no resolutions'.format(t))
+                continue
+
+            for height_str, obj in rs.items():
+                format_id = '{0}_{1}'.format(t, height_str)
+
+                if not obj or not obj.get('url'):
+                    self._downloader.report_warning(
+                        'formats: {0}: no url'.format(format_id))
+                    continue
+
+                formats.append({
+                    'url': obj['url'],
+                    'format_id': format_id,
+                    'height': self._int(height_str, 'height'),
+                })
+
+        self._sort_formats(formats)
+        info['formats'] = formats
+
+        return info
@@ -22,6 +22,7 @@ class MuenchenTVIE(InfoExtractor):
            'ext': 'mp4',
            'title': 're:^münchen.tv-Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
            'is_live': True,
+            'thumbnail': 're:^https?://.*\.jpg$'
        },
        'params': {
            'skip_download': True,
@@ -70,5 +71,6 @@ class MuenchenTVIE(InfoExtractor):
            'title': title,
            'formats': formats,
            'is_live': True,
+            'thumbnail': thumbnail,
        }

@@ -6,6 +6,7 @@ import re
 from .common import InfoExtractor
 from ..utils import (
    ExtractorError,
+    compat_urllib_parse,
    int_or_none,
    remove_end,
 )
@@ -13,76 +14,116 @@ from ..utils import (

 class NFLIE(InfoExtractor):
    IE_NAME = 'nfl.com'
-    _VALID_URL = r'(?x)https?://(?:www\.)?nfl\.com/(?:videos/(?:.+)/|.*?\#video=)(?P<id>\d..[0-9]+)'
-    _PLAYER_CONFIG_URL = 'http://www.nfl.com/static/content/static/config/video/config.json'
-    _TEST = {
-        'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights',
-        # 'md5': '5eb8c40a727dda106d510e5d6ffa79e5',  # md5 checksum fluctuates
-        'info_dict': {
-            'id': '0ap3000000398478',
-            'ext': 'mp4',
-            'title': 'Week 3: Washington Redskins vs. Philadelphia Eagles highlights',
-            'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478',
-            'upload_date': '20140921',
-            'timestamp': 1411337580,
-            'thumbnail': 're:^https?://.*\.jpg$',
+    _VALID_URL = r'''(?x)https?://
+        (?P<host>(?:www\.)?(?:nfl\.com|.*?\.clubs\.nfl\.com))/
+        (?:.+?/)*
+        (?P<id>(?:\d[a-z]{2}\d{13}|\w{8}\-(?:\w{4}\-){3}\w{12}))'''
+    _TESTS = [
+        {
+            'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights',
+            'md5': '394ef771ddcd1354f665b471d78ec4c6',
+            'info_dict': {
+                'id': '0ap3000000398478',
+                'ext': 'mp4',
+                'title': 'Week 3: Redskins vs. Eagles highlights',
+                'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478',
+                'upload_date': '20140921',
+                'timestamp': 1411337580,
+                'thumbnail': 're:^https?://.*\.jpg$',
+            }
+        },
+        {
+            'url': 'http://prod.www.steelers.clubs.nfl.com/video-and-audio/videos/LIVE_Post_Game_vs_Browns/9d72f26a-9e2b-4718-84d3-09fb4046c266',
+            'md5': 'cf85bdb4bc49f6e9d3816d130c78279c',
+            'info_dict': {
+                'id': '9d72f26a-9e2b-4718-84d3-09fb4046c266',
+                'ext': 'mp4',
+                'title': 'LIVE: Post Game vs. Browns',
+                'description': 'md5:6a97f7e5ebeb4c0e69a418a89e0636e8',
+                'upload_date': '20131229',
+                'timestamp': 1388354455,
+                'thumbnail': 're:^https?://.*\.jpg$',
+            }
+        }
+    ]
+
+    @staticmethod
+    def prepend_host(host, url):
+        if not url.startswith('http'):
+            if not url.startswith('/'):
+                url = '/%s' % url
+            url = 'http://{0:}{1:}'.format(host, url)
+        return url
+
+    @staticmethod
+    def format_from_stream(stream, protocol, host, path_prefix='',
+                           preference=0, note=None):
+        url = '{protocol:}://{host:}/{prefix:}{path:}'.format(
+            protocol=protocol,
+            host=host,
+            prefix=path_prefix,
+            path=stream.get('path'),
+        )
+        return {
+            'url': url,
+            'vbr': int_or_none(stream.get('rate', 0), 1000),
+            'preference': preference,
+            'format_note': note,
        }
-    }

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id, host = mobj.group('id'), mobj.group('host')

-        config = self._download_json(self._PLAYER_CONFIG_URL, video_id,
+        webpage = self._download_webpage(url, video_id)
+
+        config_url = NFLIE.prepend_host(host, self._search_regex(
+            r'(?:config|configURL)\s*:\s*"([^"]+)"', webpage, 'config URL'))
+        config = self._download_json(config_url, video_id,
                                     note='Downloading player config')
-        url_template = 'http://nfl.com{contentURLTemplate:s}'.format(**config)
-        video_data = self._download_json(url_template.format(id=video_id), video_id)
-
-        cdns = config.get('cdns')
-        if not cdns:
-            raise ExtractorError('Failed to get CDN data', expected=True)
+        url_template = NFLIE.prepend_host(
+            host, '{contentURLTemplate:}'.format(**config))
+        video_data = self._download_json(
+            url_template.format(id=video_id), video_id)

        formats = []
-        streams = video_data.get('cdnData', {}).get('bitrateInfo', [])
-        for name, cdn in cdns.items():
-            # LimeLight streams don't seem to work
-            if cdn.get('name') == 'LIMELIGHT':
-                continue
-
-            protocol = cdn.get('protocol')
-            host = remove_end(cdn.get('host', ''), '/')
-            if not (protocol and host):
-                continue
-
-            path_prefix = cdn.get('pathprefix', '')
-            if path_prefix and not path_prefix.endswith('/'):
-                path_prefix = '%s/' % path_prefix
-
-            get_url = lambda p: '{protocol:s}://{host:s}/{prefix:s}{path:}'.format(
-                protocol=protocol,
-                host=host,
-                prefix=path_prefix,
-                path=p,
-            )
-
-            if protocol == 'rtmp':
-                preference = -2
-            elif 'prog' in name.lower():
-                preference = -1
-            else:
-                preference = 0
-
+        cdn_data = video_data.get('cdnData', {})
+        streams = cdn_data.get('bitrateInfo', [])
+        if cdn_data.get('format') == 'EXTERNAL_HTTP_STREAM':
+            parts = compat_urllib_parse.urlparse(cdn_data.get('uri'))
+            protocol, host = parts.scheme, parts.netloc
            for stream in streams:
-                path = stream.get('path')
-                if not path:
+                formats.append(
+                    NFLIE.format_from_stream(stream, protocol, host))
+        else:
+            cdns = config.get('cdns')
+            if not cdns:
+                raise ExtractorError('Failed to get CDN data', expected=True)
+
+            for name, cdn in cdns.items():
+                # LimeLight streams don't seem to work
+                if cdn.get('name') == 'LIMELIGHT':
                    continue

-                formats.append({
-                    'url': get_url(path),
-                    'vbr': int_or_none(stream.get('rate', 0), 1000),
-                    'preference': preference,
-                    'format_note': name,
-                })
+                protocol = cdn.get('protocol')
+                host = remove_end(cdn.get('host', ''), '/')
+                if not (protocol and host):
+                    continue
+
+                prefix = cdn.get('pathprefix', '')
+                if prefix and not prefix.endswith('/'):
+                    prefix = '%s/' % prefix
+
+                preference = 0
+                if protocol == 'rtmp':
+                    preference = -2
+                elif 'prog' in name.lower():
+                    preference = 1
+
+                for stream in streams:
+                    formats.append(
+                        NFLIE.format_from_stream(stream, protocol, host,
+                                                 prefix, preference, name))

        self._sort_formats(formats)

@@ -94,7 +135,7 @@ class NFLIE(InfoExtractor):

        return {
            'id': video_id,
-            'title': video_data.get('storyHeadline'),
+            'title': video_data.get('headline'),
            'formats': formats,
            'description': video_data.get('caption'),
            'duration': video_data.get('duration'),
@@ -4,6 +4,7 @@ import re

 from .common import InfoExtractor
 from ..utils import (
+    unified_strdate,
    US_RATINGS,
 )

@@ -11,10 +12,10 @@ from ..utils import (
 class PBSIE(InfoExtractor):
    _VALID_URL = r'''(?x)https?://
        (?:
-            # Direct video URL
-            video\.pbs\.org/(?:viralplayer|video)/(?P<id>[0-9]+)/? |
-            # Article with embedded player
-           (?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+)/?(?:$|[?\#]) |
+           # Direct video URL
+           video\.pbs\.org/(?:viralplayer|video)/(?P<id>[0-9]+)/? |
+           # Article with embedded player (or direct video)
+           (?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) |
           # Player
           video\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/
        )
@@ -65,10 +66,25 @@ class PBSIE(InfoExtractor):
                'duration': 6559,
                'thumbnail': 're:^https?://.*\.jpg$',
            }
+        },
+        {
+            'url': 'http://www.pbs.org/wgbh/nova/earth/killer-typhoon.html',
+            'md5': '908f3e5473a693b266b84e25e1cf9703',
+            'info_dict': {
+                'id': '2365160389',
+                'display_id': 'killer-typhoon',
+                'ext': 'mp4',
+                'description': 'md5:c741d14e979fc53228c575894094f157',
+                'title': 'Killer Typhoon',
+                'duration': 3172,
+                'thumbnail': 're:^https?://.*\.jpg$',
+                'upload_date': '20140122',
+            }
        }
+
    ]

-    def _extract_ids(self, url):
+    def _extract_webpage(self, url):
        mobj = re.match(self._VALID_URL, url)

        presumptive_id = mobj.group('presumptive_id')
@@ -76,15 +92,20 @@ class PBSIE(InfoExtractor):
        if presumptive_id:
            webpage = self._download_webpage(url, display_id)

+            upload_date = unified_strdate(self._search_regex(
+                r'<input type="hidden" id="air_date_[0-9]+" value="([^"]+)"',
+                webpage, 'upload date', default=None))
+
            MEDIA_ID_REGEXES = [
                r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'",  # frontline video embed
                r'class="coveplayerid">([^<]+)<',                       # coveplayer
+                r'<input type="hidden" id="pbs_video_id_[0-9]+" value="([0-9]+)"/>',  # jwplayer
            ]

            media_id = self._search_regex(
                MEDIA_ID_REGEXES, webpage, 'media ID', fatal=False, default=None)
            if media_id:
-                return media_id, presumptive_id
+                return media_id, presumptive_id, upload_date

            url = self._search_regex(
                r'<iframe\s+(?:class|id)=["\']partnerPlayer["\'].*?\s+src=["\'](.*?)["\']>',
@@ -104,10 +125,10 @@ class PBSIE(InfoExtractor):
            video_id = mobj.group('id')
            display_id = video_id

-        return video_id, display_id
+        return video_id, display_id, None

    def _real_extract(self, url):
-        video_id, display_id = self._extract_ids(url)
+        video_id, display_id, upload_date = self._extract_webpage(url)

        info_url = 'http://video.pbs.org/videoInfo/%s?format=json' % video_id
        info = self._download_json(info_url, display_id)
@@ -119,6 +140,7 @@ class PBSIE(InfoExtractor):

        return {
            'id': video_id,
+            'display_id': display_id,
            'title': info['title'],
            'url': info['alternate_encoding']['url'],
            'ext': 'mp4',
@@ -126,4 +148,5 @@ class PBSIE(InfoExtractor):
            'thumbnail': info.get('image_url'),
            'duration': info.get('duration'),
            'age_limit': age_limit,
+            'upload_date': upload_date,
        }
@@ -0,0 +1,55 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import os.path
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_urllib_parse,
+    compat_urllib_request,
+)
+
+
+class PlayedIE(InfoExtractor):
+    IE_NAME = 'played.to'
+    _VALID_URL = r'https?://(?:www\.)?played\.to/(?P<id>[a-zA-Z0-9_-]+)'
+
+    _TEST = {
+        'url': 'http://played.to/j2f2sfiiukgt',
+        'md5': 'c2bd75a368e82980e7257bf500c00637',
+        'info_dict': {
+            'id': 'j2f2sfiiukgt',
+            'ext': 'flv',
+            'title': 'youtube-dl_test_video.mp4',
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        orig_webpage = self._download_webpage(url, video_id)
+        fields = re.findall(
+            r'type="hidden" name="([^"]+)"\s+value="([^"]+)">', orig_webpage)
+        data = dict(fields)
+
+        self._sleep(2, video_id)
+
+        post = compat_urllib_parse.urlencode(data)
+        headers = {
+            b'Content-Type': b'application/x-www-form-urlencoded',
+        }
+        req = compat_urllib_request.Request(url, post, headers)
+        webpage = self._download_webpage(
+            req, video_id, note='Downloading video page ...')
+
+        title = os.path.splitext(data['fname'])[0]
+
+        video_url = self._search_regex(
+            r'file: "?(.+?)",', webpage, 'video URL')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'url': video_url,
+        }
@@ -26,8 +26,7 @@ class THVideoIE(InfoExtractor):
    }

    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id = self._match_id(url)

        # extract download link from mobile player page
        webpage_player = self._download_webpage(
@@ -57,3 +56,29 @@ class THVideoIE(InfoExtractor):
            'description': description,
            'upload_date': upload_date
        }
+
+
+class THVideoPlaylistIE(InfoExtractor):
+    _VALID_URL = r'http?://(?:www\.)?thvideo\.tv/mylist(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://thvideo.tv/mylist2',
+        'info_dict': {
+            'id': '2',
+            'title': '幻想万華鏡',
+        },
+        'playlist_mincount': 23,
+    }
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+        list_title = self._html_search_regex(
+            r'<h1 class="show_title">(.*?)<b id', webpage, 'playlist title',
+            fatal=False)
+
+        entries = [
+            self.url_result('http://thvideo.tv/v/th' + id, 'THVideo')
+            for id in re.findall(r'<dd><a href="http://thvideo.tv/v/th(\d+)/" target=', webpage)]
+
+        return self.playlist_result(entries, playlist_id, list_title)
@@ -5,7 +5,6 @@ import xml.etree.ElementTree

 from .common import InfoExtractor
 from ..utils import (
-    compat_HTTPError,
    compat_urllib_request,
    ExtractorError,
 )
@@ -8,17 +8,19 @@ import itertools
 from .common import InfoExtractor
 from .subtitles import SubtitlesInfoExtractor
 from ..utils import (
+    clean_html,
    compat_HTTPError,
    compat_urllib_parse,
    compat_urllib_request,
-    clean_html,
-    get_element_by_attribute,
+    compat_urlparse,
    ExtractorError,
+    get_element_by_attribute,
+    InAdvancePagedList,
+    int_or_none,
    RegexNotFoundError,
    std_headers,
    unsmuggle_url,
    urlencode_postdata,
-    int_or_none,
 )


@@ -529,3 +531,58 @@ class VimeoWatchLaterIE(VimeoBaseInfoExtractor, VimeoChannelIE):

    def _real_extract(self, url):
        return self._extract_videos('watchlater', 'https://vimeo.com/home/watchlater')
+
+
+class VimeoLikesIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?vimeo\.com/user(?P<id>[0-9]+)/likes/?(?:$|[?#]|sort:)'
+    IE_NAME = 'vimeo:likes'
+    IE_DESC = 'Vimeo user likes'
+    _TEST = {
+        'url': 'https://vimeo.com/user755559/likes/',
+        'playlist_mincount': 293,
+        "info_dict": {
+            "description": "See all the videos urza likes",
+            "title": 'Videos urza likes',
+        },
+    }
+
+    def _real_extract(self, url):
+        user_id = self._match_id(url)
+        webpage = self._download_webpage(url, user_id)
+        page_count = self._int(
+            self._search_regex(
+                r'''(?x)<li><a\s+href="[^"]+"\s+data-page="([0-9]+)">
+                    .*?</a></li>\s*<li\s+class="pagination_next">
+                ''', webpage, 'page count'),
+            'page count', fatal=True)
+        PAGE_SIZE = 12
+        title = self._html_search_regex(
+            r'(?s)<h1>(.+?)</h1>', webpage, 'title', fatal=False)
+        description = self._html_search_meta('description', webpage)
+
+        def _get_page(idx):
+            page_url = '%s//vimeo.com/user%s/likes/page:%d/sort:date' % (
+                self.http_scheme(), user_id, idx + 1)
+            webpage = self._download_webpage(
+                page_url, user_id,
+                note='Downloading page %d/%d' % (idx + 1, page_count))
+            video_list = self._search_regex(
+                r'(?s)<ol class="js-browse_list[^"]+"[^>]*>(.*?)</ol>',
+                webpage, 'video content')
+            paths = re.findall(
+                r'<li[^>]*>\s*<a\s+href="([^"]+)"', video_list)
+            for path in paths:
+                yield {
+                    '_type': 'url',
+                    'url': compat_urlparse.urljoin(page_url, path),
+                }
+
+        pl = InAdvancePagedList(_get_page, page_count, PAGE_SIZE)
+
+        return {
+            '_type': 'playlist',
+            'id': 'user%s_likes' % user_id,
+            'title': title,
+            'description': description,
+            'entries': pl,
+        }
@@ -2,7 +2,6 @@

 from __future__ import unicode_literals

-import json
 import math
 import random
 import re
@@ -26,7 +26,7 @@ from ..utils import (
    get_element_by_attribute,
    ExtractorError,
    int_or_none,
-    PagedList,
+    OnDemandPagedList,
    unescapeHTML,
    unified_strdate,
    orderedSet,
@@ -655,6 +655,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):

        # Get video webpage
        url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
+        pref_cookies = [
+            c for c in self._downloader.cookiejar
+            if c.domain == '.youtube.com' and c.name == 'PREF']
+        for pc in pref_cookies:
+            if 'hl=' in pc.value:
+                pc.value = re.sub(r'hl=[^&]+', 'hl=en', pc.value)
+            else:
+                if pc.value:
+                    pc.value += '&'
+                pc.value += 'hl=en'
        video_webpage = self._download_webpage(url, video_id)

        # Attempt to extract SWF player URL
@@ -1341,7 +1351,7 @@ class YoutubeUserIE(InfoExtractor):
                    'id': video_id,
                    'title': title,
                }
-        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)
+        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)

@@ -87,7 +87,7 @@ def parseOpts(overrideArguments=None):
        for private_opt in ['-p', '--password', '-u', '--username', '--video-password']:
            try:
                i = opts.index(private_opt)
-                opts[i+1] = '<PRIVATE>'
+                opts[i+1] = 'PRIVATE'
            except ValueError:
                pass
        return opts
@@ -1384,14 +1384,16 @@ def check_executable(exe, args=[]):


 class PagedList(object):
-    def __init__(self, pagefunc, pagesize):
-        self._pagefunc = pagefunc
-        self._pagesize = pagesize
-
    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

+
+class OnDemandPagedList(PagedList):
+    def __init__(self, pagefunc, pagesize):
+        self._pagefunc = pagefunc
+        self._pagesize = pagesize
+
    def getslice(self, start=0, end=None):
        res = []
        for pagenum in itertools.count(start // self._pagesize):
@@ -1430,6 +1432,35 @@ class PagedList(object):
        return res


+class InAdvancePagedList(PagedList):
+    def __init__(self, pagefunc, pagecount, pagesize):
+        self._pagefunc = pagefunc
+        self._pagecount = pagecount
+        self._pagesize = pagesize
+
+    def getslice(self, start=0, end=None):
+        res = []
+        start_page = start // self._pagesize
+        end_page = (
+            self._pagecount if end is None else (end // self._pagesize + 1))
+        skip_elems = start - start_page * self._pagesize
+        only_more = None if end is None else end - start
+        for pagenum in range(start_page, end_page):
+            page = list(self._pagefunc(pagenum))
+            if skip_elems:
+                page = page[skip_elems:]
+                skip_elems = None
+            if only_more is not None:
+                if len(page) < only_more:
+                    only_more -= len(page)
+                else:
+                    page = page[:only_more]
+                    res.extend(page)
+                    break
+            res.extend(page)
+        return res
+
+
 def uppercase_escape(s):
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
@@ -1,2 +1,2 @@

-__version__ = '2014.09.28'
+__version__ = '2014.09.29.2'
Author	SHA1	Message	Date
Philipp Hagemeister	35d3e63d24	release 2014.09.29.2	2014-09-29 04:49:11 +02:00
Philipp Hagemeister	27aede9074	[pbs] Add support for series/jwplayer type video (Fixes #3849 )	2014-09-29 04:48:50 +02:00
Philipp Hagemeister	f5b7e6a842	release 2014.09.29.1	2014-09-29 02:04:28 +02:00
Philipp Hagemeister	a1f934b171	[youtube] Correct language cookie handling	2014-09-29 02:04:16 +02:00
Philipp Hagemeister	a43ee88c6f	release 2014.09.29	2014-09-29 01:51:53 +02:00
Philipp Hagemeister	e2dce53781	[youtube] Always request webpage in English (Fixes #3844 )	2014-09-29 01:39:26 +02:00
Philipp Hagemeister	1770ed9e86	[thvideo] Simplify (#3848 )	2014-09-29 00:38:37 +02:00
Philipp Hagemeister	457ac58cc7	Merge remote-tracking branch 'diffycat/thvideo-update'	2014-09-29 00:36:55 +02:00
Philipp Hagemeister	9c44d2429b	[vimeo:likes] Support large like lists (Fixes #3847 )	2014-09-29 00:36:06 +02:00
Philipp Hagemeister	d2e32f7df5	Do not use HTML characters in output This messes up the format when people paste it outside of code tags.	2014-09-29 00:23:43 +02:00
Anton Larionov	67077b182b	[thvideo] Add support for playlists	2014-09-28 23:36:55 +04:00
Naglis Jonaitis	5f4c318844	[nfl] Support team micro-sites (fixes #3831 )	2014-09-28 21:48:26 +03:00
Naglis Jonaitis	dfee83234b	[nfl] Prefer progressive downloads	2014-09-28 19:25:28 +03:00
Sergey M․	7f5c0c4a19	[README] Clarify test's md5 filesize (#3846 )	2014-09-28 22:10:20 +07:00
Philipp Hagemeister	4bc77c8417	[README] Use _match_id helper function	2014-09-28 13:52:21 +02:00
Philipp Hagemeister	22dd3fad86	release 2014.09.28.1	2014-09-28 12:14:25 +02:00
Philipp Hagemeister	d6e6a42256	[vimeo:likes] Add new extractor (Fixes #3835 )	2014-09-28 12:14:16 +02:00
Philipp Hagemeister	76e7d1e74b	[played] Remove unused import	2014-09-28 10:56:36 +02:00
Philipp Hagemeister	38c4d41b74	[played] Simplify (#3798 )	2014-09-28 10:55:27 +02:00
Philipp Hagemeister	f0b8e3607d	Merge remote-tracking branch 'r4mos/played'	2014-09-28 10:52:23 +02:00
Philipp Hagemeister	51ee08c4bb	Remove unused imports	2014-09-28 10:50:43 +02:00
Philipp Hagemeister	c841789772	[muenchentv] Add thumbnail	2014-09-28 10:49:58 +02:00
Philipp Hagemeister	c121a75b36	[heise] Add support for description	2014-09-28 10:49:12 +02:00
Philipp Hagemeister	5a8b77551d	[heise] Simplify (#3842 )	2014-09-28 10:47:25 +02:00
Philipp Hagemeister	0217aee154	Merge remote-tracking branch 'd912e3/heise'	2014-09-28 10:36:44 +02:00
Philipp Hagemeister	b14f3a4c1d	[golem] Simplify (#3828 )	2014-09-28 10:35:19 +02:00
Philipp Hagemeister	92f7963f6e	Merge remote-tracking branch 'd912e3/golem'	2014-09-28 10:10:34 +02:00
Mats	7b7518124e	[heise] Don't check string type Before Python 3 could be unicode, so don't check at all.	2014-09-27 21:12:23 +02:00
Mats	70752ccefd	[golem] Don't omit positional argument specifiers Required by Python 2.6.	2014-09-27 19:35:55 +02:00
Mats	0155549d6c	[heise] Add new extractor	2014-09-27 19:28:01 +02:00
Mats	6a5af6acb9	[golem] Add new extractor	2014-09-25 16:25:53 +02:00
Carlos Ramos	5aa38e75b2	[played] Add new extractor	2014-09-19 22:46:57 +02:00