release 2014.01.22

Merge branch 'master' of github.com:rg3/youtube-dl
Conflicts: youtube_dl/extractor/mtv.py
2026-06-21 20:04:50 +00:00 · 2014-01-22 00:21:41 +01:00 · 2014-01-22 00:21:27 +01:00 · 2014-01-22 00:18:52 +01:00 · 2014-01-22 00:18:09 +01:00 · 2014-01-22 00:17:33 +01:00
19 changed files with 324 additions and 85 deletions
@@ -71,6 +71,7 @@ which means you can modify it, redistribute it or use it however you like.
    --download-archive FILE    Download only videos not listed in the archive
                               file. Record the IDs of all downloaded videos in
                               it.
+    --include-ads              Download advertisements as well (experimental)

 ## Download Options:
    -r, --rate-limit LIMIT     maximum download rate in bytes per second (e.g.
@@ -16,6 +16,7 @@ from youtube_dl.utils import (
    DateRange,
    encodeFilename,
    find_xpath_attr,
+    fix_xml_ampersands,
    get_meta_content,
    orderedSet,
    parse_duration,
@@ -200,5 +201,18 @@ class TestUtil(unittest.TestCase):
        self.assertEqual(parse_duration('9:12:43'), 33163)
        self.assertEqual(parse_duration('x:y'), None)

+    def test_fix_xml_ampersands(self):
+        self.assertEqual(
+            fix_xml_ampersands('"&x=y&z=a'), '"&amp;x=y&amp;z=a')
+        self.assertEqual(
+            fix_xml_ampersands('"&amp;x=y&wrong;&z=a'),
+            '"&amp;x=y&amp;wrong;&amp;z=a')
+        self.assertEqual(
+            fix_xml_ampersands('&amp;&apos;&gt;&lt;&quot;'),
+            '&amp;&apos;&gt;&lt;&quot;')
+        self.assertEqual(
+            fix_xml_ampersands('&#1234;&#x1abC;'), '&#1234;&#x1abC;')
+        self.assertEqual(fix_xml_ampersands('&#&#'), '&amp;#&amp;#')
+
 if __name__ == '__main__':
    unittest.main()
@@ -151,6 +151,7 @@ class YoutubeDL(object):
    bidi_workaround:   Work around buggy terminals without bidirectional text
                       support, using fribidi
    debug_printtraffic:Print out sent and received HTTP traffic
+    include_ads:       Download ads as well

    The following parameters are not used by YoutubeDL itself, they are used by
    the FileDownloader:
@@ -238,7 +238,10 @@ def parseOpts(overrideArguments=None):
    selection.add_option('--download-archive', metavar='FILE',
                         dest='download_archive',
                         help='Download only videos not listed in the archive file. Record the IDs of all downloaded videos in it.')
-
+    selection.add_option(
+        '--include-ads', dest='include_ads',
+        action='store_true',
+        help='Download advertisements as well (experimental)')

    authentication.add_option('-u', '--username',
            dest='username', metavar='USERNAME', help='account username')
@@ -716,6 +719,7 @@ def _real_main(argv=None):
        'bidi_workaround': opts.bidi_workaround,
        'debug_printtraffic': opts.debug_printtraffic,
        'prefer_ffmpeg': opts.prefer_ffmpeg,
+        'include_ads': opts.include_ads,
    }

    with YoutubeDL(ydl_opts) as ydl:
@@ -119,7 +119,10 @@ from .mit import TechTVMITIE, MITIE
 from .mixcloud import MixcloudIE
 from .mpora import MporaIE
 from .mofosex import MofosexIE
-from .mtv import MTVIE
+from .mtv import (
+    MTVIE,
+    MTVIggyIE,
+)
 from .muzu import MuzuTVIE
 from .myspace import MySpaceIE
 from .myspass import MySpassIE
@@ -152,6 +155,7 @@ from .rottentomatoes import RottenTomatoesIE
 from .roxwel import RoxwelIE
 from .rtlnow import RTLnowIE
 from .rutube import RutubeIE
+from .servingsys import ServingSysIE
 from .sina import SinaIE
 from .slashdot import SlashdotIE
 from .slideshare import SlideshareIE
@@ -170,6 +174,7 @@ from .southparkstudios import (
 from .space import SpaceIE
 from .spankwire import SpankwireIE
 from .spiegel import SpiegelIE
+from .spike import SpikeIE
 from .stanfordoc import StanfordOpenClassroomIE
 from .statigram import StatigramIE
 from .steam import SteamIE
@@ -9,9 +9,11 @@ from .common import InfoExtractor
 from ..utils import (
    compat_urllib_parse,
    find_xpath_attr,
+    fix_xml_ampersands,
    compat_urlparse,
    compat_str,
    compat_urllib_request,
+    compat_parse_qs,

    ExtractorError,
    unsmuggle_url,
@@ -83,17 +85,33 @@ class BrightcoveIE(InfoExtractor):
                            lambda m: m.group(1) + '/>', object_str)
        # Fix up some stupid XML, see https://github.com/rg3/youtube-dl/issues/1608
        object_str = object_str.replace('<--', '<!--')
+        object_str = fix_xml_ampersands(object_str)

        object_doc = xml.etree.ElementTree.fromstring(object_str)
-        assert 'BrightcoveExperience' in object_doc.attrib['class']
-        params = {
-            'playerID': find_xpath_attr(object_doc, './param', 'name', 'playerID').attrib['value'],
-        }
+
+        fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars')
+        if fv_el is not None:
+            flashvars = dict(
+                (k, v[0])
+                for k, v in compat_parse_qs(fv_el.attrib['value']).items())
+        else:
+            flashvars = {}
+
        def find_param(name):
+            if name in flashvars:
+                return flashvars[name]
            node = find_xpath_attr(object_doc, './param', 'name', name)
            if node is not None:
                return node.attrib['value']
            return None
+
+        params = {}
+
+        playerID = find_param('playerID')
+        if playerID is None:
+            raise ExtractorError('Cannot find player ID')
+        params['playerID'] = playerID
+
        playerKey = find_param('playerKey')
        # Not all pages define this value
        if playerKey is not None:
@@ -114,8 +132,12 @@ class BrightcoveIE(InfoExtractor):
        if it can't be found
        """
        m_brightcove = re.search(
-            r'<object[^>]+?class=([\'"])[^>]*?BrightcoveExperience.*?\1.+?</object>',
-            webpage, re.DOTALL)
+            r'''(?sx)<object
+            (?:
+                [^>]+?class=([\'"])[^>]*?BrightcoveExperience.*?\1 |
+                [^>]*?>\s*<param\s+name="movie"\s+value="https?://[^/]*brightcove\.com/
+            ).+?</object>''',
+            webpage)
        if m_brightcove is not None:
            return cls._build_brighcove_url(m_brightcove.group())
        else:
@@ -156,6 +178,7 @@ class BrightcoveIE(InfoExtractor):
        info = self._search_regex(r'var experienceJSON = ({.*?});', webpage, 'json')
        info = json.loads(info)['data']
        video_info = info['programmedContent']['videoPlayer']['mediaDTO']
+        video_info['_youtubedl_adServerURL'] = info.get('adServerURL')

        return self._extract_video_info(video_info)

@@ -193,6 +216,23 @@ class BrightcoveIE(InfoExtractor):
            info.update({
                'url': video_info['FLVFullLengthURL'],
            })
-        else:
+
+        if self._downloader.params.get('include_ads', False):
+            adServerURL = video_info.get('_youtubedl_adServerURL')
+            if adServerURL:
+                ad_info = {
+                    '_type': 'url',
+                    'url': adServerURL,
+                }
+                if 'url' in info:
+                    return {
+                        '_type': 'playlist',
+                        'title': info['title'],
+                        'entries': [ad_info, info],
+                    }
+                else:
+                    return ad_info
+
+        if 'url' not in info and not info.get('formats'):
            raise ExtractorError('Unable to extract video url for %s' % info['id'])
        return info
@@ -3,7 +3,7 @@ import re
 from .common import InfoExtractor
 from ..utils import (
    find_xpath_attr,
-    fix_xml_all_ampersand,
+    fix_xml_ampersands
 )


@@ -33,7 +33,7 @@ class ClipsyndicateIE(InfoExtractor):
        pdoc = self._download_xml(
            'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars,
            video_id, u'Downloading video info',
-            transform_source=fix_xml_all_ampersand) 
+            transform_source=fix_xml_ampersands)

        track_doc = pdoc.find('trackList/track')
        def find_param(name):
@@ -220,6 +220,8 @@ class InfoExtractor(object):
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
+            elif webpage_bytes.startswith(b'\xff\xfe'):
+                encoding = 'utf-16'
            else:
                encoding = 'utf-8'
        if self._downloader.params.get('dump_intermediate_pages', False):
@@ -236,7 +238,7 @@ class InfoExtractor(object):
            except AttributeError:
                url = url_or_request
            if len(url) > 200:
-                h = hashlib.md5(url).hexdigest()
+                h = u'___' + hashlib.md5(url).hexdigest()
                url = url[:200 - len(h)] + h
            raw_filename = ('%s_%s.dump' % (video_id, url))
            filename = sanitize_filename(raw_filename, restricted=True)
@@ -17,7 +17,12 @@ from ..utils import (
 class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

-    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:[^#?]*#!/)?(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
+    _VALID_URL = r'''(?x)
+        (?:https?://)?(?:\w+\.)?facebook\.com/
+        (?:[^#?]*\#!/)?
+        (?:video/video\.php|photo\.php|video/embed)\?(?:.*?)
+        (?:v|video_id)=(?P<id>[0-9]+)
+        (?:.*)'''
    _LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1'
    _CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1'
    _NETRC_MACHINE = 'facebook'
@@ -90,7 +95,7 @@ class FacebookIE(InfoExtractor):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
-        video_id = mobj.group('ID')
+        video_id = mobj.group('id')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
 import re

 from .mtv import MTVServicesInfoExtractor
@@ -6,12 +8,12 @@ from .mtv import MTVServicesInfoExtractor
 class GametrailersIE(MTVServicesInfoExtractor):
    _VALID_URL = r'http://www\.gametrailers\.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'
    _TEST = {
-        u'url': u'http://www.gametrailers.com/videos/zbvr8i/mirror-s-edge-2-e3-2013--debut-trailer',
-        u'file': u'70e9a5d7-cf25-4a10-9104-6f3e7342ae0d.mp4',
-        u'md5': u'4c8e67681a0ea7ec241e8c09b3ea8cf7',
-        u'info_dict': {
-            u'title': u'E3 2013: Debut Trailer',
-            u'description': u'Faith is back!  Check out the World Premiere trailer for Mirror\'s Edge 2 straight from the EA Press Conference at E3 2013!',
+        'url': 'http://www.gametrailers.com/videos/zbvr8i/mirror-s-edge-2-e3-2013--debut-trailer',
+        'file': '70e9a5d7-cf25-4a10-9104-6f3e7342ae0d.mp4',
+        'md5': '4c8e67681a0ea7ec241e8c09b3ea8cf7',
+        'info_dict': {
+            'title': 'Mirror\'s Edge 2|E3 2013: Debut Trailer',
+            'description': 'Faith is back!  Check out the World Premiere trailer for Mirror\'s Edge 2 straight from the EA Press Conference at E3 2013!',
        },
    }

@@ -23,5 +25,5 @@ class GametrailersIE(MTVServicesInfoExtractor):
        webpage = self._download_webpage(url, video_id)
        mgid = self._search_regex([r'data-video="(?P<mgid>mgid:.*?)"',
                                   r'data-contentId=\'(?P<mgid>mgid:.*?)\''],
-                                  webpage, u'mgid')
+                                  webpage, 'mgid')
        return self._get_videos_info(mgid)
@@ -92,11 +92,12 @@ class GenericIE(InfoExtractor):
        # ooyala video
        {
            'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
+            'file': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ.mp4',
            'md5': '5644c6ca5d5782c1d0d350dad9bd840c',
            'info_dict': {
                'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
                'ext': 'mp4',
-                'title': '2cc213299525360.mov', #that's what we get
+                'title': '2cc213299525360.mov',  # that's what we get
            },
        },
    ]
@@ -318,6 +319,12 @@ class GenericIE(InfoExtractor):
        if mobj is not None:
            return self.url_result(mobj.group('url'), 'Novamov')

+        # Look for embedded Facebook player
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>https://www.facebook.com/video/embed.+?)\1', webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'), 'Facebook')
+
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
@@ -4,7 +4,7 @@ import re

 from .common import InfoExtractor
 from ..utils import (
-    fix_xml_all_ampersand,
+    fix_xml_ampersands,
 )


@@ -27,7 +27,7 @@ class MetacriticIE(InfoExtractor):
        webpage = self._download_webpage(url, video_id)
        # The XML is not well-formed: it contains raw '&' characters
        info = self._download_xml('http://www.metacritic.com/video_data?video=' + video_id,
-            video_id, 'Downloading info xml', transform_source=fix_xml_all_ampersand)
+            video_id, 'Downloading info xml', transform_source=fix_xml_ampersands)

        clip = next(c for c in info.findall('playList/clip') if c.find('id').text == video_id)
        formats = []
@@ -1,12 +1,18 @@
+from __future__ import unicode_literals
+
 import re
-import xml.etree.ElementTree

 from .common import InfoExtractor
 from ..utils import (
    compat_urllib_parse,
    ExtractorError,
+    find_xpath_attr,
+    fix_xml_ampersands,
+    url_basename,
+    RegexNotFoundError,
 )

+
 def _media_xml_tag(tag):
    return '{http://search.yahoo.com/mrss/}%s' % tag

@@ -33,10 +39,9 @@ class MTVServicesInfoExtractor(InfoExtractor):
        else:
            return thumb_node.attrib['url']

-    def _extract_video_formats(self, metadataXml):
-        if '/error_country_block.swf' in metadataXml:
-            raise ExtractorError(u'This video is not available from your country.', expected=True)
-        mdoc = xml.etree.ElementTree.fromstring(metadataXml.encode('utf-8'))
+    def _extract_video_formats(self, mdoc):
+        if re.match(r'.*/error_country_block\.swf$', mdoc.find('.//src').text) is not None:
+            raise ExtractorError('This video is not available from your country.', expected=True)

        formats = []
        for rendition in mdoc.findall('.//rendition'):
@@ -59,11 +64,12 @@ class MTVServicesInfoExtractor(InfoExtractor):
        self.report_extraction(video_id)
        mediagen_url = itemdoc.find('%s/%s' % (_media_xml_tag('group'), _media_xml_tag('content'))).attrib['url']
        # Remove the templates, like &device={device}
-        mediagen_url = re.sub(r'&[^=]*?={.*?}(?=(&|$))', u'', mediagen_url)
+        mediagen_url = re.sub(r'&[^=]*?={.*?}(?=(&|$))', '', mediagen_url)
        if 'acceptMethods' not in mediagen_url:
            mediagen_url += '&acceptMethods=fms'
-        mediagen_page = self._download_webpage(mediagen_url, video_id,
-                                               u'Downloading video urls')
+
+        mediagen_doc = self._download_xml(mediagen_url, video_id,
+            'Downloading video urls')

        description_node = itemdoc.find('description')
        if description_node is not None:
@@ -71,9 +77,22 @@ class MTVServicesInfoExtractor(InfoExtractor):
        else:
            description = None

+        title_el = None
+        if title_el is None:
+            title_el = find_xpath_attr(
+                itemdoc, './/{http://search.yahoo.com/mrss/}category',
+                'scheme', 'urn:mtvn:video_title')
+        if title_el is None:
+            title_el = itemdoc.find('.//{http://search.yahoo.com/mrss/}title')
+        if title_el is None:
+            title_el = itemdoc.find('.//title')
+        title = title_el.text
+        if title is None:
+            raise ExtractorError('Could not find video title')
+
        return {
-            'title': itemdoc.find('title').text,
-            'formats': self._extract_video_formats(mediagen_page),
+            'title': title,
+            'formats': self._extract_video_formats(mediagen_doc),
            'id': video_id,
            'thumbnail': self._get_thumbnail_url(uri, itemdoc),
            'description': description,
@@ -83,14 +102,22 @@ class MTVServicesInfoExtractor(InfoExtractor):
        video_id = self._id_from_uri(uri)
        data = compat_urllib_parse.urlencode({'uri': uri})

-        def fix_ampersand(s):
-            """ Fix unencoded ampersand in XML """
-            return s.replace(u'& ', '&amp; ')
        idoc = self._download_xml(
            self._FEED_URL + '?' + data, video_id,
-            u'Downloading info', transform_source=fix_ampersand)
+            'Downloading info', transform_source=fix_xml_ampersands)
        return [self._get_video_info(item) for item in idoc.findall('.//item')]

+    def _real_extract(self, url):
+        title = url_basename(url)
+        webpage = self._download_webpage(url, title)
+        try:
+            # the url is in the format http://media.mtvnservices.com/fb/{mgid}.swf
+            fb_url = self._og_search_video_url(webpage)
+            mgid = url_basename(fb_url).rpartition('.')[0]
+        except RegexNotFoundError:
+            mgid = self._search_regex(r'data-mgid="(.*?)"', webpage, u'mgid')
+        return self._get_videos_info(mgid)
+

 class MTVIE(MTVServicesInfoExtractor):
    _VALID_URL = r'''(?x)^https?://
@@ -101,25 +128,25 @@ class MTVIE(MTVServicesInfoExtractor):

    _TESTS = [
        {
-            u'url': u'http://www.mtv.com/videos/misc/853555/ours-vh1-storytellers.jhtml',
-            u'file': u'853555.mp4',
-            u'md5': u'850f3f143316b1e71fa56a4edfd6e0f8',
-            u'info_dict': {
-                u'title': u'Taylor Swift - "Ours (VH1 Storytellers)"',
-                u'description': u'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.',
+            'url': 'http://www.mtv.com/videos/misc/853555/ours-vh1-storytellers.jhtml',
+            'file': '853555.mp4',
+            'md5': '850f3f143316b1e71fa56a4edfd6e0f8',
+            'info_dict': {
+                'title': 'Taylor Swift - "Ours (VH1 Storytellers)"',
+                'description': 'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.',
            },
        },
        {
-            u'add_ie': ['Vevo'],
-            u'url': u'http://www.mtv.com/videos/taylor-swift/916187/everything-has-changed-ft-ed-sheeran.jhtml',
-            u'file': u'USCJY1331283.mp4',
-            u'md5': u'73b4e7fcadd88929292fe52c3ced8caf',
-            u'info_dict': {
-                u'title': u'Everything Has Changed',
-                u'upload_date': u'20130606',
-                u'uploader': u'Taylor Swift',
+            'add_ie': ['Vevo'],
+            'url': 'http://www.mtv.com/videos/taylor-swift/916187/everything-has-changed-ft-ed-sheeran.jhtml',
+            'file': 'USCJY1331283.mp4',
+            'md5': '73b4e7fcadd88929292fe52c3ced8caf',
+            'info_dict': {
+                'title': 'Everything Has Changed',
+                'upload_date': '20130606',
+                'uploader': 'Taylor Swift',
            },
-            u'skip': u'VEVO is only available in some countries',
+            'skip': 'VEVO is only available in some countries',
        },
    ]

@@ -138,8 +165,22 @@ class MTVIE(MTVServicesInfoExtractor):
                               webpage, re.DOTALL)
            if m_vevo:
                vevo_id = m_vevo.group(1);
-                self.to_screen(u'Vevo video detected: %s' % vevo_id)
+                self.to_screen('Vevo video detected: %s' % vevo_id)
                return self.url_result('vevo:%s' % vevo_id, ie='Vevo')
    
-            uri = self._html_search_regex(r'/uri/(.*?)\?', webpage, u'uri')
+            uri = self._html_search_regex(r'/uri/(.*?)\?', webpage, 'uri')
        return self._get_videos_info(uri)
+
+
+class MTVIggyIE(MTVServicesInfoExtractor):
+    IE_NAME = 'mtviggy.com'
+    _VALID_URL = r'https?://www\.mtviggy\.com/videos/.+'
+    _TEST = {
+        'url': 'http://www.mtviggy.com/videos/arcade-fire-behind-the-scenes-at-the-biggest-music-experiment-yet/',
+        'info_dict': {
+            'id': '984696',
+            'ext': 'mp4',
+            'title': 'Short',
+        }
+    }
+    _FEED_URL = 'http://all.mtvworldverticals.com/feed-xml/'
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
 import re

 from .common import InfoExtractor
@@ -6,14 +8,14 @@ from .common import InfoExtractor
 class RedTubeIE(InfoExtractor):
    _VALID_URL = r'http://(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
    _TEST = {
-        u'url': u'http://www.redtube.com/66418',
-        u'file': u'66418.mp4',
+        'url': 'http://www.redtube.com/66418',
+        'file': '66418.mp4',
        # md5 varies from time to time, as in
        # https://travis-ci.org/rg3/youtube-dl/jobs/14052463#L295
-        #u'md5': u'7b8c22b5e7098a3e1c09709df1126d2d',
-        u'info_dict': {
-            u"title": u"Sucked on a toilet",
-            u"age_limit": 18,
+        #'md5': u'7b8c22b5e7098a3e1c09709df1126d2d',
+        'info_dict': {
+            "title": "Sucked on a toilet",
+            "age_limit": 18,
        }
    }

@@ -33,14 +35,19 @@ class RedTubeIE(InfoExtractor):
            r'<h1 class="videoTitle[^"]*">(.+?)</h1>',
            webpage, u'title')

+        video_thumbnail = self._html_search_regex(
+            r'playerInnerHTML.+?<img\s+src="(.+?)"',
+            webpage, u'thumbnail', fatal=False)
+
        # No self-labeling, but they describe themselves as
        # "Home of Videos Porno"
        age_limit = 18

        return {
-            'id':        video_id,
-            'url':       video_url,
-            'ext':       video_extension,
-            'title':     video_title,
+            'id': video_id,
+            'url': video_url,
+            'ext': video_extension,
+            'title': video_title,
+            'thumbnail': video_thumbnail,
            'age_limit': age_limit,
        }
@@ -0,0 +1,70 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+)
+
+
+class ServingSysIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:[^.]+\.)?serving-sys\.com/BurstingPipe/adServer\.bs\?.*?&pli=(?P<id>[0-9]+)'
+
+    _TEST = {
+        'url': 'http://bs.serving-sys.com/BurstingPipe/adServer.bs?cn=is&c=23&pl=VAST&pli=5349193&PluID=0&pos=7135&ord=[timestamp]&cim=1?',
+        'playlist': [{
+            'file': '29955898.flv',
+            'md5': 'baed851342df6846eb8677a60a011a0f',
+            'info_dict': {
+                'title': 'AdAPPter_Hyundai_demo (1)',
+                'duration': 74,
+                'tbr': 1378,
+                'width': 640,
+                'height': 400,
+            },
+        }, {
+            'file': '29907998.flv',
+            'md5': '979b4da2655c4bc2d81aeb915a8c5014',
+            'info_dict': {
+                'title': 'AdAPPter_Hyundai_demo (2)',
+                'duration': 34,
+                'width': 854,
+                'height': 480,
+                'tbr': 516,
+            },
+        }],
+        'params': {
+            'playlistend': 2,
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        pl_id = mobj.group('id')
+
+        vast_doc = self._download_xml(url, pl_id)
+        title = vast_doc.find('.//AdTitle').text
+        media = vast_doc.find('.//MediaFile').text
+        info_url = self._search_regex(r'&adData=([^&]+)&', media, 'info URL')
+
+        doc = self._download_xml(info_url, pl_id, 'Downloading video info')
+        entries = [{
+            '_type': 'video',
+            'id': a.attrib['id'],
+            'title': '%s (%s)' % (title, a.attrib['assetID']),
+            'url': a.attrib['URL'],
+            'duration': int_or_none(a.attrib.get('length')),
+            'tbr': int_or_none(a.attrib.get('bitrate')),
+            'height': int_or_none(a.attrib.get('height')),
+            'width': int_or_none(a.attrib.get('width')),
+        } for a in doc.findall('.//AdditionalAssets/asset')]
+
+        return {
+            '_type': 'playlist',
+            'id': pl_id,
+            'title': title,
+            'entries': entries,
+        }
+
+ 
@@ -0,0 +1,19 @@
+from __future__ import unicode_literals
+
+from .mtv import MTVServicesInfoExtractor
+
+
+class SpikeIE(MTVServicesInfoExtractor):
+    _VALID_URL = r'https?://www\.spike\.com/(video-clips|episodes)/.+'
+    _TEST = {
+        'url': 'http://www.spike.com/video-clips/lhtu8m/auction-hunters-can-allen-ride-a-hundred-year-old-motorcycle',
+        'md5': '1a9265f32b0c375793d6c4ce45255256',
+        'info_dict': {
+            'id': 'b9c8221a-4e50-479a-b86d-3333323e38ba',
+            'ext': 'mp4',
+            'title': 'Can Allen Ride A Hundred Year-Old Motorcycle?',
+            'description': 'md5:fbed7e82ed5fad493615b3094a9499cb',
+        },
+    }
+
+    _FEED_URL = 'http://www.spike.com/feeds/mrss/'
@@ -1,4 +1,6 @@
 # encoding: utf-8
+from __future__ import unicode_literals
+
 import re
 import json

@@ -10,19 +12,27 @@ from ..utils import (


 class VKIE(InfoExtractor):
-    IE_NAME = u'vk.com'
+    IE_NAME = 'vk.com'
    _VALID_URL = r'https?://vk\.com/(?:videos.*?\?.*?z=)?video(?P<id>.*?)(?:\?|%2F|$)'

-    _TEST = {
-        u'url': u'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521',
-        u'md5': u'0deae91935c54e00003c2a00646315f0',
-        u'info_dict': {
-            u'id': u'162222515',
-            u'ext': u'flv',
-            u'title': u'ProtivoGunz - Хуёвая песня',
-            u'uploader': u'Noize MC',
+    _TESTS = [{
+        'url': 'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521',
+        'file': '162222515.flv',
+        'md5': '0deae91935c54e00003c2a00646315f0',
+        'info_dict': {
+            'title': 'ProtivoGunz - Хуёвая песня',
+            'uploader': 'Noize MC',
        },
-    }
+    },
+    {
+        'url': 'http://vk.com/video4643923_163339118',
+        'file': '163339118.mp4',
+        'md5': 'f79bccb5cd182b1f43502ca5685b2b36',
+        'info_dict': {
+            'uploader': 'Elvira Dzhonik',
+            'title': 'Dream Theater - Hollow Years Live at Budokan 720*',
+        }
+    }]

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
@@ -33,13 +43,21 @@ class VKIE(InfoExtractor):
        if m_yt is not None:
            self.to_screen(u'Youtube video detected')
            return self.url_result(m_yt.group(1), 'Youtube')
-        vars_json = self._search_regex(r'var vars = ({.*?});', info_page, u'vars')
-        vars = json.loads(vars_json)
+        data_json = self._search_regex(r'var vars = ({.*?});', info_page, 'vars')
+        data = json.loads(data_json)
+
+        formats = [{
+            'format_id': k,
+            'url': v,
+            'width': int(k[len('url'):]),
+        } for k, v in data.items()
+            if k.startswith('url')]
+        self._sort_formats(formats)

        return {
-            'id': compat_str(vars['vid']),
-            'url': vars['url240'],
-            'title': unescapeHTML(vars['md_title']),
-            'thumbnail': vars['jpg'],
-            'uploader': vars['md_author'],
+            'id': compat_str(data['vid']),
+            'formats': formats,
+            'title': unescapeHTML(data['md_title']),
+            'thumbnail': data.get('jpg'),
+            'uploader': data.get('md_author'),
        }
@@ -1092,9 +1092,12 @@ def month_by_name(name):
        return None


-def fix_xml_all_ampersand(xml_str):
+def fix_xml_ampersands(xml_str):
    """Replace each bare '&' by '&amp;' in XML, leaving existing entity and character references untouched"""
-    return xml_str.replace(u'&', u'&amp;')
+    return re.sub(
+        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
+        u'&amp;',
+        xml_str)


 def setproctitle(title):
@@ -1,2 +1,2 @@

-__version__ = '2014.01.20'
+__version__ = '2014.01.22'
Author	SHA1	Message	Date
Philipp Hagemeister	398edd0689	release 2014.01.22	2014-01-22 00:21:41 +01:00
Philipp Hagemeister	6562df768d	Merge branch 'master' of github.com:rg3/youtube-dl Conflicts: youtube_dl/extractor/mtv.py	2014-01-22 00:21:27 +01:00
Philipp Hagemeister	06769acd71	[gametrailers] Use unicode_literals Conflicts: youtube_dl/extractor/gametrailers.py	2014-01-22 00:18:52 +01:00
Philipp Hagemeister	32dac6943d	[mtv] Use unicode_literals	2014-01-22 00:18:09 +01:00
Philipp Hagemeister	90834c78fe	[mtv] Fix title for gametrailers (Fixes #2188 ) We now prefer the title including the category, because that title is what is presented at the actual sites.	2014-01-22 00:17:33 +01:00
Jaime Marquínez Ferrándiz	47917f24c4	[brightcove] Fix extraction of embedded videos There was a leading ‘:’ in the regex. The ‘flashvars’ parameter is not always available.	2014-01-21 22:04:46 +01:00
Jaime Marquínez Ferrándiz	d614aa40e3	[brightcove] Fix check for url in the result It may have the ‘formats’ field instead of ‘url’.	2014-01-21 21:53:10 +01:00
Jaime Marquínez Ferrándiz	bc4ba05fcb	[mtv] Add an extractor for mtviggy.com (#2072 )	2014-01-21 20:59:31 +01:00
Jaime Marquínez Ferrándiz	8d9453b9e8	Add an extractor for spike.com (#2072 ) Added a generic _real_extract to MTVServicesInfoExtractor	2014-01-21 20:54:47 +01:00
Jaime Marquínez Ferrándiz	e4f320a4d0	[mtv] Check for geo-blocked videos in the xml document, not in the xml’s string Allows to use the `_download_xml` method	2014-01-21 19:59:02 +01:00
Jaime Marquínez Ferrándiz	ef9f2ba7af	[mtv] Use unicode_literals	2014-01-21 19:58:21 +01:00
Philipp Hagemeister	4a3b72771f	release 2014.01.21.1	2014-01-21 18:21:53 +01:00
Philipp Hagemeister	913f32929b	[vk] Add support for HQ videos (Fixes #2187 )	2014-01-21 18:21:44 +01:00
Philipp Hagemeister	9834872bf6	[facebook] Add support for embeds Example URL: http://www.hostblogger.de/blog/archives/6181-Auto-jagt-Betonmischer.html	2014-01-21 18:10:17 +01:00
Philipp Hagemeister	94a23d2a1e	[vk] Use unicode_literals	2014-01-21 17:32:03 +01:00
Philipp Hagemeister	608bf69880	[vk] avoid built-in names	2014-01-21 17:29:04 +01:00
Philipp Hagemeister	032b3df5af	[redtube] Use unicode_literals	2014-01-21 14:16:44 +01:00
Mike Col	9d11a41fe4	[redtube] Add support for thumbnails Signed-off-by: Philipp Hagemeister <phihag@phihag.de>	2014-01-21 14:14:55 +01:00
Philipp Hagemeister	2989501131	release 2014.01.21	2014-01-21 14:07:41 +01:00
Philipp Hagemeister	7b0817e8e1	[servingsys] Add support This also adds support for brightcove advertisements. Fixes #2181	2014-01-21 02:09:51 +01:00
Philipp Hagemeister	9d4288b2d4	[extractor/common] Clarify when and when not we generate the filename	2014-01-21 01:41:13 +01:00
Philipp Hagemeister	3486df383b	[generic] Improve testcase	2014-01-21 01:40:34 +01:00
Philipp Hagemeister	b60016e831	Deal with implicitly UTF-16 decoded webpages These webpages don't specify an encoding and rely on the BOM	2014-01-21 01:39:40 +01:00
Philipp Hagemeister	5aafe895fc	Correct XML ampersand fixup	2014-01-20 22:11:34 +01:00