1
0
mirror of https://github.com/ytdl-org/youtube-dl.git synced 2026-06-21 20:04:50 +00:00

Compare commits

..

24 Commits

Author SHA1 Message Date
Philipp Hagemeister 398edd0689 release 2014.01.22 2014-01-22 00:21:41 +01:00
Philipp Hagemeister 6562df768d Merge branch 'master' of github.com:rg3/youtube-dl
Conflicts:
	youtube_dl/extractor/mtv.py
2014-01-22 00:21:27 +01:00
Philipp Hagemeister 06769acd71 [gametrailers] Use unicode_literals
Conflicts:
	youtube_dl/extractor/gametrailers.py
2014-01-22 00:18:52 +01:00
Philipp Hagemeister 32dac6943d [mtv] Use unicode_literals 2014-01-22 00:18:09 +01:00
Philipp Hagemeister 90834c78fe [mtv] Fix title for gametrailers (Fixes #2188)
We now prefer the title including the category, because that title is what is presented at the actual sites.
2014-01-22 00:17:33 +01:00
Jaime Marquínez Ferrándiz 47917f24c4 [brightcove] Fix extraction of embedded videos
There was a leading ‘:’ in the regex.
The ‘flashvars’ parameter is not always available.
2014-01-21 22:04:46 +01:00
Jaime Marquínez Ferrándiz d614aa40e3 [brightcove] Fix check for url in the result
It may have the ‘formats’ field instead of ‘url’.
2014-01-21 21:53:10 +01:00
Jaime Marquínez Ferrándiz bc4ba05fcb [mtv] Add an extractor for mtviggy.com (#2072) 2014-01-21 20:59:31 +01:00
Jaime Marquínez Ferrándiz 8d9453b9e8 Add an extractor for spike.com (#2072)
Added a generic _real_extract to MTVServicesInfoExtractor
2014-01-21 20:54:47 +01:00
Jaime Marquínez Ferrándiz e4f320a4d0 [mtv] Check for geo-blocked videos in the xml document, not in the xml’s string
Allows to use the `_download_xml` method
2014-01-21 19:59:02 +01:00
Jaime Marquínez Ferrándiz ef9f2ba7af [mtv] Use unicode_literals 2014-01-21 19:58:21 +01:00
Philipp Hagemeister 4a3b72771f release 2014.01.21.1 2014-01-21 18:21:53 +01:00
Philipp Hagemeister 913f32929b [vk] Add support for HQ videos (Fixes #2187) 2014-01-21 18:21:44 +01:00
Philipp Hagemeister 9834872bf6 [facebook] Add support for embeds
Example URL: http://www.hostblogger.de/blog/archives/6181-Auto-jagt-Betonmischer.html
2014-01-21 18:10:17 +01:00
Philipp Hagemeister 94a23d2a1e [vk] Use unicode_literals 2014-01-21 17:32:03 +01:00
Philipp Hagemeister 608bf69880 [vk] avoid built-in names 2014-01-21 17:29:04 +01:00
Philipp Hagemeister 032b3df5af [redtube] Use unicode_literals 2014-01-21 14:16:44 +01:00
Mike Col 9d11a41fe4 [redtube] Add support for thumbnails
Signed-off-by: Philipp Hagemeister <phihag@phihag.de>
2014-01-21 14:14:55 +01:00
Philipp Hagemeister 2989501131 release 2014.01.21 2014-01-21 14:07:41 +01:00
Philipp Hagemeister 7b0817e8e1 [servingsys] Add support
This also adds support for brightcove advertisements.
Fixes #2181
2014-01-21 02:09:51 +01:00
Philipp Hagemeister 9d4288b2d4 [extractor/common] Clarify when and when not we generate the filename 2014-01-21 01:41:13 +01:00
Philipp Hagemeister 3486df383b [generic] Improve testcase 2014-01-21 01:40:34 +01:00
Philipp Hagemeister b60016e831 Deal with implicitly UTF-16 decoded webpages
These webpages don't specify an encoding and rely on the BOM
2014-01-21 01:39:40 +01:00
Philipp Hagemeister 5aafe895fc Correct XML ampersand fixup 2014-01-20 22:11:34 +01:00
19 changed files with 324 additions and 85 deletions
+1
View File
@@ -71,6 +71,7 @@ which means you can modify it, redistribute it or use it however you like.
--download-archive FILE Download only videos not listed in the archive
file. Record the IDs of all downloaded videos in
it.
--include-ads Download advertisements as well (experimental)
## Download Options:
-r, --rate-limit LIMIT maximum download rate in bytes per second (e.g.
+14
View File
@@ -16,6 +16,7 @@ from youtube_dl.utils import (
DateRange,
encodeFilename,
find_xpath_attr,
fix_xml_ampersands,
get_meta_content,
orderedSet,
parse_duration,
@@ -200,5 +201,18 @@ class TestUtil(unittest.TestCase):
self.assertEqual(parse_duration('9:12:43'), 33163)
self.assertEqual(parse_duration('x:y'), None)
def test_fix_xml_ampersands(self):
    """Check fix_xml_ampersands against a table of (input, expected) pairs."""
    cases = (
        # Bare ampersands are escaped.
        ('"&x=y&z=a', '"&amp;x=y&amp;z=a'),
        # An existing &amp; is kept; an unknown named entity is escaped.
        ('"&amp;x=y&wrong;&z=a', '"&amp;x=y&amp;wrong;&amp;z=a'),
        # The five predefined entities pass through untouched.
        ('&amp;&apos;&gt;&lt;&quot;', '&amp;&apos;&gt;&lt;&quot;'),
        # Decimal and hexadecimal character references pass through.
        ('&#1234;&#x1abC;', '&#1234;&#x1abC;'),
        # '&#' without digits and a terminating ';' is not a reference.
        ('&#&#', '&amp;#&amp;#'),
    )
    for raw, expected in cases:
        self.assertEqual(fix_xml_ampersands(raw), expected)
if __name__ == '__main__':
unittest.main()
+1
View File
@@ -151,6 +151,7 @@ class YoutubeDL(object):
bidi_workaround: Work around buggy terminals without bidirectional text
support, using fribidi
debug_printtraffic:Print out sent and received HTTP traffic
include_ads: Download ads as well
The following parameters are not used by YoutubeDL itself, they are used by
the FileDownloader:
+5 -1
View File
@@ -238,7 +238,10 @@ def parseOpts(overrideArguments=None):
selection.add_option('--download-archive', metavar='FILE',
dest='download_archive',
help='Download only videos not listed in the archive file. Record the IDs of all downloaded videos in it.')
selection.add_option(
'--include-ads', dest='include_ads',
action='store_true',
help='Download advertisements as well (experimental)')
authentication.add_option('-u', '--username',
dest='username', metavar='USERNAME', help='account username')
@@ -716,6 +719,7 @@ def _real_main(argv=None):
'bidi_workaround': opts.bidi_workaround,
'debug_printtraffic': opts.debug_printtraffic,
'prefer_ffmpeg': opts.prefer_ffmpeg,
'include_ads': opts.include_ads,
}
with YoutubeDL(ydl_opts) as ydl:
+6 -1
View File
@@ -119,7 +119,10 @@ from .mit import TechTVMITIE, MITIE
from .mixcloud import MixcloudIE
from .mpora import MporaIE
from .mofosex import MofosexIE
from .mtv import MTVIE
from .mtv import (
MTVIE,
MTVIggyIE,
)
from .muzu import MuzuTVIE
from .myspace import MySpaceIE
from .myspass import MySpassIE
@@ -152,6 +155,7 @@ from .rottentomatoes import RottenTomatoesIE
from .roxwel import RoxwelIE
from .rtlnow import RTLnowIE
from .rutube import RutubeIE
from .servingsys import ServingSysIE
from .sina import SinaIE
from .slashdot import SlashdotIE
from .slideshare import SlideshareIE
@@ -170,6 +174,7 @@ from .southparkstudios import (
from .space import SpaceIE
from .spankwire import SpankwireIE
from .spiegel import SpiegelIE
from .spike import SpikeIE
from .stanfordoc import StanfordOpenClassroomIE
from .statigram import StatigramIE
from .steam import SteamIE
+47 -7
View File
@@ -9,9 +9,11 @@ from .common import InfoExtractor
from ..utils import (
compat_urllib_parse,
find_xpath_attr,
fix_xml_ampersands,
compat_urlparse,
compat_str,
compat_urllib_request,
compat_parse_qs,
ExtractorError,
unsmuggle_url,
@@ -83,17 +85,33 @@ class BrightcoveIE(InfoExtractor):
lambda m: m.group(1) + '/>', object_str)
# Fix up some stupid XML, see https://github.com/rg3/youtube-dl/issues/1608
object_str = object_str.replace('<--', '<!--')
object_str = fix_xml_ampersands(object_str)
object_doc = xml.etree.ElementTree.fromstring(object_str)
assert 'BrightcoveExperience' in object_doc.attrib['class']
params = {
'playerID': find_xpath_attr(object_doc, './param', 'name', 'playerID').attrib['value'],
}
fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars')
if fv_el is not None:
flashvars = dict(
(k, v[0])
for k, v in compat_parse_qs(fv_el.attrib['value']).items())
else:
flashvars = {}
def find_param(name):
if name in flashvars:
return flashvars[name]
node = find_xpath_attr(object_doc, './param', 'name', name)
if node is not None:
return node.attrib['value']
return None
params = {}
playerID = find_param('playerID')
if playerID is None:
raise ExtractorError('Cannot find player ID')
params['playerID'] = playerID
playerKey = find_param('playerKey')
# Not all pages define this value
if playerKey is not None:
@@ -114,8 +132,12 @@ class BrightcoveIE(InfoExtractor):
if it can't be found
"""
m_brightcove = re.search(
r'<object[^>]+?class=([\'"])[^>]*?BrightcoveExperience.*?\1.+?</object>',
webpage, re.DOTALL)
r'''(?sx)<object
(?:
[^>]+?class=([\'"])[^>]*?BrightcoveExperience.*?\1 |
[^>]*?>\s*<param\s+name="movie"\s+value="https?://[^/]*brightcove\.com/
).+?</object>''',
webpage)
if m_brightcove is not None:
return cls._build_brighcove_url(m_brightcove.group())
else:
@@ -156,6 +178,7 @@ class BrightcoveIE(InfoExtractor):
info = self._search_regex(r'var experienceJSON = ({.*?});', webpage, 'json')
info = json.loads(info)['data']
video_info = info['programmedContent']['videoPlayer']['mediaDTO']
video_info['_youtubedl_adServerURL'] = info.get('adServerURL')
return self._extract_video_info(video_info)
@@ -193,6 +216,23 @@ class BrightcoveIE(InfoExtractor):
info.update({
'url': video_info['FLVFullLengthURL'],
})
else:
if self._downloader.params.get('include_ads', False):
adServerURL = video_info.get('_youtubedl_adServerURL')
if adServerURL:
ad_info = {
'_type': 'url',
'url': adServerURL,
}
if 'url' in info:
return {
'_type': 'playlist',
'title': info['title'],
'entries': [ad_info, info],
}
else:
return ad_info
if 'url' not in info and not info.get('formats'):
raise ExtractorError('Unable to extract video url for %s' % info['id'])
return info
+2 -2
View File
@@ -3,7 +3,7 @@ import re
from .common import InfoExtractor
from ..utils import (
find_xpath_attr,
fix_xml_all_ampersand,
fix_xml_ampersands
)
@@ -33,7 +33,7 @@ class ClipsyndicateIE(InfoExtractor):
pdoc = self._download_xml(
'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars,
video_id, u'Downloading video info',
transform_source=fix_xml_all_ampersand)
transform_source=fix_xml_ampersands)
track_doc = pdoc.find('trackList/track')
def find_param(name):
+3 -1
View File
@@ -220,6 +220,8 @@ class InfoExtractor(object):
webpage_bytes[:1024])
if m:
encoding = m.group(1).decode('ascii')
elif webpage_bytes.startswith(b'\xff\xfe'):
encoding = 'utf-16'
else:
encoding = 'utf-8'
if self._downloader.params.get('dump_intermediate_pages', False):
@@ -236,7 +238,7 @@ class InfoExtractor(object):
except AttributeError:
url = url_or_request
if len(url) > 200:
h = hashlib.md5(url).hexdigest()
h = u'___' + hashlib.md5(url).hexdigest()
url = url[:200 - len(h)] + h
raw_filename = ('%s_%s.dump' % (video_id, url))
filename = sanitize_filename(raw_filename, restricted=True)
+7 -2
View File
@@ -17,7 +17,12 @@ from ..utils import (
class FacebookIE(InfoExtractor):
"""Information Extractor for Facebook"""
_VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:[^#?]*#!/)?(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
_VALID_URL = r'''(?x)
(?:https?://)?(?:\w+\.)?facebook\.com/
(?:[^#?]*\#!/)?
(?:video/video\.php|photo\.php|video/embed)\?(?:.*?)
(?:v|video_id)=(?P<id>[0-9]+)
(?:.*)'''
_LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1'
_CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1'
_NETRC_MACHINE = 'facebook'
@@ -90,7 +95,7 @@ class FacebookIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
if mobj is None:
raise ExtractorError(u'Invalid URL: %s' % url)
video_id = mobj.group('ID')
video_id = mobj.group('id')
url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
webpage = self._download_webpage(url, video_id)
+9 -7
View File
@@ -1,3 +1,5 @@
from __future__ import unicode_literals
import re
from .mtv import MTVServicesInfoExtractor
@@ -6,12 +8,12 @@ from .mtv import MTVServicesInfoExtractor
class GametrailersIE(MTVServicesInfoExtractor):
_VALID_URL = r'http://www\.gametrailers\.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'
_TEST = {
u'url': u'http://www.gametrailers.com/videos/zbvr8i/mirror-s-edge-2-e3-2013--debut-trailer',
u'file': u'70e9a5d7-cf25-4a10-9104-6f3e7342ae0d.mp4',
u'md5': u'4c8e67681a0ea7ec241e8c09b3ea8cf7',
u'info_dict': {
u'title': u'E3 2013: Debut Trailer',
u'description': u'Faith is back! Check out the World Premiere trailer for Mirror\'s Edge 2 straight from the EA Press Conference at E3 2013!',
'url': 'http://www.gametrailers.com/videos/zbvr8i/mirror-s-edge-2-e3-2013--debut-trailer',
'file': '70e9a5d7-cf25-4a10-9104-6f3e7342ae0d.mp4',
'md5': '4c8e67681a0ea7ec241e8c09b3ea8cf7',
'info_dict': {
'title': 'Mirror\'s Edge 2|E3 2013: Debut Trailer',
'description': 'Faith is back! Check out the World Premiere trailer for Mirror\'s Edge 2 straight from the EA Press Conference at E3 2013!',
},
}
@@ -23,5 +25,5 @@ class GametrailersIE(MTVServicesInfoExtractor):
webpage = self._download_webpage(url, video_id)
mgid = self._search_regex([r'data-video="(?P<mgid>mgid:.*?)"',
r'data-contentId=\'(?P<mgid>mgid:.*?)\''],
webpage, u'mgid')
webpage, 'mgid')
return self._get_videos_info(mgid)
+8 -1
View File
@@ -92,11 +92,12 @@ class GenericIE(InfoExtractor):
# ooyala video
{
'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
'file': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ.mp4',
'md5': '5644c6ca5d5782c1d0d350dad9bd840c',
'info_dict': {
'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
'ext': 'mp4',
'title': '2cc213299525360.mov', #that's what we get
'title': '2cc213299525360.mov', # that's what we get
},
},
]
@@ -318,6 +319,12 @@ class GenericIE(InfoExtractor):
if mobj is not None:
return self.url_result(mobj.group('url'), 'Novamov')
# Look for embedded Facebook player
mobj = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>https://www.facebook.com/video/embed.+?)\1', webpage)
if mobj is not None:
return self.url_result(mobj.group('url'), 'Facebook')
# Start with something easy: JW Player in SWFObject
mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
if mobj is None:
+2 -2
View File
@@ -4,7 +4,7 @@ import re
from .common import InfoExtractor
from ..utils import (
fix_xml_all_ampersand,
fix_xml_ampersands,
)
@@ -27,7 +27,7 @@ class MetacriticIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
# The xml is not well formatted, there are raw '&'
info = self._download_xml('http://www.metacritic.com/video_data?video=' + video_id,
video_id, 'Downloading info xml', transform_source=fix_xml_all_ampersand)
video_id, 'Downloading info xml', transform_source=fix_xml_ampersands)
clip = next(c for c in info.findall('playList/clip') if c.find('id').text == video_id)
formats = []
+72 -31
View File
@@ -1,12 +1,18 @@
from __future__ import unicode_literals
import re
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
compat_urllib_parse,
ExtractorError,
find_xpath_attr,
fix_xml_ampersands,
url_basename,
RegexNotFoundError,
)
def _media_xml_tag(tag):
return '{http://search.yahoo.com/mrss/}%s' % tag
@@ -33,10 +39,9 @@ class MTVServicesInfoExtractor(InfoExtractor):
else:
return thumb_node.attrib['url']
def _extract_video_formats(self, metadataXml):
if '/error_country_block.swf' in metadataXml:
raise ExtractorError(u'This video is not available from your country.', expected=True)
mdoc = xml.etree.ElementTree.fromstring(metadataXml.encode('utf-8'))
def _extract_video_formats(self, mdoc):
if re.match(r'.*/error_country_block\.swf$', mdoc.find('.//src').text) is not None:
raise ExtractorError('This video is not available from your country.', expected=True)
formats = []
for rendition in mdoc.findall('.//rendition'):
@@ -59,11 +64,12 @@ class MTVServicesInfoExtractor(InfoExtractor):
self.report_extraction(video_id)
mediagen_url = itemdoc.find('%s/%s' % (_media_xml_tag('group'), _media_xml_tag('content'))).attrib['url']
# Remove the templates, like &device={device}
mediagen_url = re.sub(r'&[^=]*?={.*?}(?=(&|$))', u'', mediagen_url)
mediagen_url = re.sub(r'&[^=]*?={.*?}(?=(&|$))', '', mediagen_url)
if 'acceptMethods' not in mediagen_url:
mediagen_url += '&acceptMethods=fms'
mediagen_page = self._download_webpage(mediagen_url, video_id,
u'Downloading video urls')
mediagen_doc = self._download_xml(mediagen_url, video_id,
'Downloading video urls')
description_node = itemdoc.find('description')
if description_node is not None:
@@ -71,9 +77,22 @@ class MTVServicesInfoExtractor(InfoExtractor):
else:
description = None
title_el = None
if title_el is None:
title_el = find_xpath_attr(
itemdoc, './/{http://search.yahoo.com/mrss/}category',
'scheme', 'urn:mtvn:video_title')
if title_el is None:
title_el = itemdoc.find('.//{http://search.yahoo.com/mrss/}title')
if title_el is None:
title_el = itemdoc.find('.//title')
title = title_el.text
if title is None:
raise ExtractorError('Could not find video title')
return {
'title': itemdoc.find('title').text,
'formats': self._extract_video_formats(mediagen_page),
'title': title,
'formats': self._extract_video_formats(mediagen_doc),
'id': video_id,
'thumbnail': self._get_thumbnail_url(uri, itemdoc),
'description': description,
@@ -83,14 +102,22 @@ class MTVServicesInfoExtractor(InfoExtractor):
video_id = self._id_from_uri(uri)
data = compat_urllib_parse.urlencode({'uri': uri})
def fix_ampersand(s):
""" Fix unencoded ampersand in XML """
return s.replace(u'& ', '&amp; ')
idoc = self._download_xml(
self._FEED_URL + '?' + data, video_id,
u'Downloading info', transform_source=fix_ampersand)
'Downloading info', transform_source=fix_xml_ampersands)
return [self._get_video_info(item) for item in idoc.findall('.//item')]
def _real_extract(self, url):
    """Download the page at *url*, find its mgid and return the videos' info.

    The mgid is read from the URL returned by ``_og_search_video_url``;
    if that lookup raises ``RegexNotFoundError`` it is read from a
    ``data-mgid`` attribute in the page instead.
    """
    # The URL's basename doubles as the identifier passed to the download.
    page_name = url_basename(url)
    page = self._download_webpage(url, page_name)
    try:
        # the url is in the format http://media.mtvnservices.com/fb/{mgid}.swf
        og_video_url = self._og_search_video_url(page)
        swf_name = url_basename(og_video_url)
        # Keep everything before the last '.', i.e. drop the extension.
        mgid = swf_name.rpartition('.')[0]
    except RegexNotFoundError:
        mgid = self._search_regex(r'data-mgid="(.*?)"', page, u'mgid')
    return self._get_videos_info(mgid)
class MTVIE(MTVServicesInfoExtractor):
_VALID_URL = r'''(?x)^https?://
@@ -101,25 +128,25 @@ class MTVIE(MTVServicesInfoExtractor):
_TESTS = [
{
u'url': u'http://www.mtv.com/videos/misc/853555/ours-vh1-storytellers.jhtml',
u'file': u'853555.mp4',
u'md5': u'850f3f143316b1e71fa56a4edfd6e0f8',
u'info_dict': {
u'title': u'Taylor Swift - "Ours (VH1 Storytellers)"',
u'description': u'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.',
'url': 'http://www.mtv.com/videos/misc/853555/ours-vh1-storytellers.jhtml',
'file': '853555.mp4',
'md5': '850f3f143316b1e71fa56a4edfd6e0f8',
'info_dict': {
'title': 'Taylor Swift - "Ours (VH1 Storytellers)"',
'description': 'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.',
},
},
{
u'add_ie': ['Vevo'],
u'url': u'http://www.mtv.com/videos/taylor-swift/916187/everything-has-changed-ft-ed-sheeran.jhtml',
u'file': u'USCJY1331283.mp4',
u'md5': u'73b4e7fcadd88929292fe52c3ced8caf',
u'info_dict': {
u'title': u'Everything Has Changed',
u'upload_date': u'20130606',
u'uploader': u'Taylor Swift',
'add_ie': ['Vevo'],
'url': 'http://www.mtv.com/videos/taylor-swift/916187/everything-has-changed-ft-ed-sheeran.jhtml',
'file': 'USCJY1331283.mp4',
'md5': '73b4e7fcadd88929292fe52c3ced8caf',
'info_dict': {
'title': 'Everything Has Changed',
'upload_date': '20130606',
'uploader': 'Taylor Swift',
},
u'skip': u'VEVO is only available in some countries',
'skip': 'VEVO is only available in some countries',
},
]
@@ -138,8 +165,22 @@ class MTVIE(MTVServicesInfoExtractor):
webpage, re.DOTALL)
if m_vevo:
vevo_id = m_vevo.group(1);
self.to_screen(u'Vevo video detected: %s' % vevo_id)
self.to_screen('Vevo video detected: %s' % vevo_id)
return self.url_result('vevo:%s' % vevo_id, ie='Vevo')
uri = self._html_search_regex(r'/uri/(.*?)\?', webpage, u'uri')
uri = self._html_search_regex(r'/uri/(.*?)\?', webpage, 'uri')
return self._get_videos_info(uri)
class MTVIggyIE(MTVServicesInfoExtractor):
    """MTV-services extractor for video pages on www.mtviggy.com."""

    IE_NAME = 'mtviggy.com'

    # Any page below /videos/ on www.mtviggy.com.
    _VALID_URL = r'https?://www\.mtviggy\.com/videos/.+'

    _TEST = {
        'url': 'http://www.mtviggy.com/videos/arcade-fire-behind-the-scenes-at-the-biggest-music-experiment-yet/',
        'info_dict': {
            'id': '984696',
            'ext': 'mp4',
            'title': 'Short',
        }
    }

    # Feed endpoint that the inherited code queries with the video's uri.
    _FEED_URL = 'http://all.mtvworldverticals.com/feed-xml/'
+17 -10
View File
@@ -1,3 +1,5 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
@@ -6,14 +8,14 @@ from .common import InfoExtractor
class RedTubeIE(InfoExtractor):
_VALID_URL = r'http://(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
_TEST = {
u'url': u'http://www.redtube.com/66418',
u'file': u'66418.mp4',
'url': 'http://www.redtube.com/66418',
'file': '66418.mp4',
# md5 varies from time to time, as in
# https://travis-ci.org/rg3/youtube-dl/jobs/14052463#L295
#u'md5': u'7b8c22b5e7098a3e1c09709df1126d2d',
u'info_dict': {
u"title": u"Sucked on a toilet",
u"age_limit": 18,
#'md5': u'7b8c22b5e7098a3e1c09709df1126d2d',
'info_dict': {
"title": "Sucked on a toilet",
"age_limit": 18,
}
}
@@ -33,14 +35,19 @@ class RedTubeIE(InfoExtractor):
r'<h1 class="videoTitle[^"]*">(.+?)</h1>',
webpage, u'title')
video_thumbnail = self._html_search_regex(
r'playerInnerHTML.+?<img\s+src="(.+?)"',
webpage, u'thumbnail', fatal=False)
# No self-labeling, but they describe themselves as
# "Home of Videos Porno"
age_limit = 18
return {
'id': video_id,
'url': video_url,
'ext': video_extension,
'title': video_title,
'id': video_id,
'url': video_url,
'ext': video_extension,
'title': video_title,
'thumbnail': video_thumbnail,
'age_limit': age_limit,
}
+70
View File
@@ -0,0 +1,70 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
int_or_none,
)
class ServingSysIE(InfoExtractor):
    """Extractor for serving-sys.com ``adServer.bs`` URLs.

    The URL is downloaded as XML to obtain the ad title and a media URL;
    an ``adData`` parameter inside that media URL points to a second XML
    document whose ``AdditionalAssets/asset`` elements become the
    playlist entries.
    """

    _VALID_URL = r'https?://(?:[^.]+\.)?serving-sys\.com/BurstingPipe/adServer\.bs\?.*?&pli=(?P<id>[0-9]+)'
    _TEST = {
        'url': 'http://bs.serving-sys.com/BurstingPipe/adServer.bs?cn=is&c=23&pl=VAST&pli=5349193&PluID=0&pos=7135&ord=[timestamp]&cim=1?',
        'playlist': [{
            'file': '29955898.flv',
            'md5': 'baed851342df6846eb8677a60a011a0f',
            'info_dict': {
                'title': 'AdAPPter_Hyundai_demo (1)',
                'duration': 74,
                'tbr': 1378,
                'width': 640,
                'height': 400,
            },
        }, {
            'file': '29907998.flv',
            'md5': '979b4da2655c4bc2d81aeb915a8c5014',
            'info_dict': {
                'title': 'AdAPPter_Hyundai_demo (2)',
                'duration': 34,
                'width': 854,
                'height': 480,
                'tbr': 516,
            },
        }],
        'params': {
            'playlistend': 2,
        }
    }

    def _real_extract(self, url):
        """Return a playlist dict with one video entry per asset element."""
        playlist_id = re.match(self._VALID_URL, url).group('id')

        # First document: carries the ad title and the media file URL.
        vast_doc = self._download_xml(url, playlist_id)
        ad_title = vast_doc.find('.//AdTitle').text
        media_url = vast_doc.find('.//MediaFile').text

        # The media URL embeds the location of the asset description.
        info_url = self._search_regex(
            r'&adData=([^&]+)&', media_url, 'info URL')
        info_doc = self._download_xml(
            info_url, playlist_id, 'Downloading video info')

        entries = []
        for asset in info_doc.findall('.//AdditionalAssets/asset'):
            attrs = asset.attrib
            entries.append({
                '_type': 'video',
                'id': attrs['id'],
                'title': '%s (%s)' % (ad_title, attrs['assetID']),
                'url': attrs['URL'],
                # Numeric attributes are optional; int_or_none handles None.
                'duration': int_or_none(attrs.get('length')),
                'tbr': int_or_none(attrs.get('bitrate')),
                'height': int_or_none(attrs.get('height')),
                'width': int_or_none(attrs.get('width')),
            })

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': ad_title,
            'entries': entries,
        }
+19
View File
@@ -0,0 +1,19 @@
from __future__ import unicode_literals
from .mtv import MTVServicesInfoExtractor
class SpikeIE(MTVServicesInfoExtractor):
    """MTV-services extractor for www.spike.com video clips and episodes."""

    # Pages below /video-clips/ or /episodes/ on www.spike.com.
    _VALID_URL = r'https?://www\.spike\.com/(video-clips|episodes)/.+'

    _TEST = {
        'url': 'http://www.spike.com/video-clips/lhtu8m/auction-hunters-can-allen-ride-a-hundred-year-old-motorcycle',
        'md5': '1a9265f32b0c375793d6c4ce45255256',
        'info_dict': {
            'id': 'b9c8221a-4e50-479a-b86d-3333323e38ba',
            'ext': 'mp4',
            'title': 'Can Allen Ride A Hundred Year-Old Motorcycle?',
            'description': 'md5:fbed7e82ed5fad493615b3094a9499cb',
        },
    }

    # MRSS feed endpoint used by the inherited MTV-services code.
    _FEED_URL = 'http://www.spike.com/feeds/mrss/'
+35 -17
View File
@@ -1,4 +1,6 @@
# encoding: utf-8
from __future__ import unicode_literals
import re
import json
@@ -10,19 +12,27 @@ from ..utils import (
class VKIE(InfoExtractor):
IE_NAME = u'vk.com'
IE_NAME = 'vk.com'
_VALID_URL = r'https?://vk\.com/(?:videos.*?\?.*?z=)?video(?P<id>.*?)(?:\?|%2F|$)'
_TEST = {
u'url': u'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521',
u'md5': u'0deae91935c54e00003c2a00646315f0',
u'info_dict': {
u'id': u'162222515',
u'ext': u'flv',
u'title': u'ProtivoGunz - Хуёвая песня',
u'uploader': u'Noize MC',
_TESTS = [{
'url': 'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521',
'file': '162222515.flv',
'md5': '0deae91935c54e00003c2a00646315f0',
'info_dict': {
'title': 'ProtivoGunz - Хуёвая песня',
'uploader': 'Noize MC',
},
}
},
{
'url': 'http://vk.com/video4643923_163339118',
'file': '163339118.mp4',
'md5': 'f79bccb5cd182b1f43502ca5685b2b36',
'info_dict': {
'uploader': 'Elvira Dzhonik',
'title': 'Dream Theater - Hollow Years Live at Budokan 720*',
}
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -33,13 +43,21 @@ class VKIE(InfoExtractor):
if m_yt is not None:
self.to_screen(u'Youtube video detected')
return self.url_result(m_yt.group(1), 'Youtube')
vars_json = self._search_regex(r'var vars = ({.*?});', info_page, u'vars')
vars = json.loads(vars_json)
data_json = self._search_regex(r'var vars = ({.*?});', info_page, 'vars')
data = json.loads(data_json)
formats = [{
'format_id': k,
'url': v,
'width': int(k[len('url'):]),
} for k, v in data.items()
if k.startswith('url')]
self._sort_formats(formats)
return {
'id': compat_str(vars['vid']),
'url': vars['url240'],
'title': unescapeHTML(vars['md_title']),
'thumbnail': vars['jpg'],
'uploader': vars['md_author'],
'id': compat_str(data['vid']),
'formats': formats,
'title': unescapeHTML(data['md_title']),
'thumbnail': data.get('jpg'),
'uploader': data.get('md_author'),
}
+5 -2
View File
@@ -1092,9 +1092,12 @@ def month_by_name(name):
return None
def fix_xml_all_ampersand(xml_str):
    """Replace all the '&' by '&amp;' in XML.

    Legacy variant: every ampersand is escaped, including those that
    already start a valid entity.
    """
    return xml_str.replace(u'&', u'&amp;')


def fix_xml_ampersands(xml_str):
    """Escape every '&' in *xml_str* that does not start a valid reference.

    An ampersand is left untouched when it begins one of the five
    predefined entities (``&amp;``, ``&lt;``, ``&gt;``, ``&apos;``,
    ``&quot;``) or a numeric character reference (``&#DDD;`` or
    ``&#xHHH;``) with at least one digit.  Any other ampersand is
    replaced by ``&amp;``.

    Returns the fixed string.
    """
    # Numeric references need one or more digits: '&#;' is not a
    # reference, while references longer than four digits (code points
    # above U+FFFF, or zero-padded values) are valid and must be kept.
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]+;|#[0-9]+;)',
        u'&amp;',
        xml_str)
def setproctitle(title):
+1 -1
View File
@@ -1,2 +1,2 @@
__version__ = '2014.01.20'
__version__ = '2014.01.22'