1
0
mirror of https://github.com/ytdl-org/youtube-dl.git synced 2026-06-12 07:30:09 +00:00

Compare commits

..

32 Commits

Author SHA1 Message Date
Philipp Hagemeister 35d3e63d24 release 2014.09.29.2 2014-09-29 04:49:11 +02:00
Philipp Hagemeister 27aede9074 [pbs] Add support for series/jwplayer type video (Fixes #3849) 2014-09-29 04:48:50 +02:00
Philipp Hagemeister f5b7e6a842 release 2014.09.29.1 2014-09-29 02:04:28 +02:00
Philipp Hagemeister a1f934b171 [youtube] Correct language cookie handling 2014-09-29 02:04:16 +02:00
Philipp Hagemeister a43ee88c6f release 2014.09.29 2014-09-29 01:51:53 +02:00
Philipp Hagemeister e2dce53781 [youtube] Always request webpage in English (Fixes #3844) 2014-09-29 01:39:26 +02:00
Philipp Hagemeister 1770ed9e86 [thvideo] Simplify (#3848) 2014-09-29 00:38:37 +02:00
Philipp Hagemeister 457ac58cc7 Merge remote-tracking branch 'diffycat/thvideo-update' 2014-09-29 00:36:55 +02:00
Philipp Hagemeister 9c44d2429b [vimeo:likes] Support large like lists (Fixes #3847) 2014-09-29 00:36:06 +02:00
Philipp Hagemeister d2e32f7df5 Do not use HTML characters in output
This messes up the format when people paste it outside of code tags.
2014-09-29 00:23:43 +02:00
Anton Larionov 67077b182b [thvideo] Add support for playlists 2014-09-28 23:36:55 +04:00
Naglis Jonaitis 5f4c318844 [nfl] Support team micro-sites (fixes #3831) 2014-09-28 21:48:26 +03:00
Naglis Jonaitis dfee83234b [nfl] Prefer progressive downloads 2014-09-28 19:25:28 +03:00
Sergey M․ 7f5c0c4a19 [README] Clarify test's md5 filesize (#3846) 2014-09-28 22:10:20 +07:00
Philipp Hagemeister 4bc77c8417 [README] Use _match_id helper function 2014-09-28 13:52:21 +02:00
Philipp Hagemeister 22dd3fad86 release 2014.09.28.1 2014-09-28 12:14:25 +02:00
Philipp Hagemeister d6e6a42256 [vimeo:likes] Add new extractor (Fixes #3835) 2014-09-28 12:14:16 +02:00
Philipp Hagemeister 76e7d1e74b [played] Remove unused import 2014-09-28 10:56:36 +02:00
Philipp Hagemeister 38c4d41b74 [played] Simplify (#3798) 2014-09-28 10:55:27 +02:00
Philipp Hagemeister f0b8e3607d Merge remote-tracking branch 'r4mos/played' 2014-09-28 10:52:23 +02:00
Philipp Hagemeister 51ee08c4bb Remove unused imports 2014-09-28 10:50:43 +02:00
Philipp Hagemeister c841789772 [muenchentv] Add thumbnail 2014-09-28 10:49:58 +02:00
Philipp Hagemeister c121a75b36 [heise] Add support for description 2014-09-28 10:49:12 +02:00
Philipp Hagemeister 5a8b77551d [heise] Simplify (#3842) 2014-09-28 10:47:25 +02:00
Philipp Hagemeister 0217aee154 Merge remote-tracking branch 'd912e3/heise' 2014-09-28 10:36:44 +02:00
Philipp Hagemeister b14f3a4c1d [golem] Simplify (#3828) 2014-09-28 10:35:19 +02:00
Philipp Hagemeister 92f7963f6e Merge remote-tracking branch 'd912e3/golem' 2014-09-28 10:10:34 +02:00
Mats 7b7518124e [heise] Don't check string type
Before Python 3 could be unicode, so don't check at all.
2014-09-27 21:12:23 +02:00
Mats 70752ccefd [golem] Don't omit positional argument specifiers
Required by Python 2.6.
2014-09-27 19:35:55 +02:00
Mats 0155549d6c [heise] Add new extractor 2014-09-27 19:28:01 +02:00
Mats 6a5af6acb9 [golem] Add new extractor 2014-09-25 16:25:53 +02:00
Carlos Ramos 5aa38e75b2 [played] Add new extractor 2014-09-19 22:46:57 +02:00
21 changed files with 526 additions and 102 deletions
+2 -5
View File
@@ -442,8 +442,6 @@ If you want to add support for a new site, you can follow this quick list (assum
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
@@ -451,7 +449,7 @@ If you want to add support for a new site, you can follow this quick list (assum
_VALID_URL = r'https?://(?:www\.)?yourextractor\.com/watch/(?P<id>[0-9]+)'
_TEST = {
'url': 'http://yourextractor.com/watch/42',
'md5': 'TODO: md5 sum of the first 10KiB of the video file',
'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)',
'info_dict': {
'id': '42',
'ext': 'mp4',
@@ -466,8 +464,7 @@ If you want to add support for a new site, you can follow this quick list (assum
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
video_id = self._match_id(url)
# TODO more code goes here, for example ...
webpage = self._download_webpage(url, video_id)
+3 -1
View File
@@ -139,7 +139,9 @@ def generator(test_case):
if is_playlist:
self.assertEqual(res_dict['_type'], 'playlist')
self.assertTrue('entries' in res_dict)
expect_info_dict(self, test_case.get('info_dict', {}), res_dict)
if 'playlist_mincount' in test_case:
assertGreaterEqual(
self,
@@ -188,7 +190,7 @@ def generator(test_case):
expect_info_dict(self, tc.get('info_dict', {}), info_dict)
finally:
try_rm_tcs_files()
if is_playlist and res_dict is not None:
if is_playlist and res_dict is not None and res_dict.get('entries'):
# Remove all other files that may have been extracted if the
# extractor returns full results even with extract_flat
res_tcs = [{'info_dict': e} for e in res_dict['entries']]
+7 -2
View File
@@ -22,7 +22,8 @@ from youtube_dl.utils import (
fix_xml_ampersands,
get_meta_content,
orderedSet,
PagedList,
OnDemandPagedList,
InAdvancePagedList,
parse_duration,
read_batch_urls,
sanitize_filename,
@@ -246,10 +247,14 @@ class TestUtil(unittest.TestCase):
for i in range(firstid, upto):
yield i
pl = PagedList(get_page, pagesize)
pl = OnDemandPagedList(get_page, pagesize)
got = pl.getslice(*sliceargs)
self.assertEqual(got, expected)
iapl = InAdvancePagedList(get_page, size // pagesize + 1, pagesize)
got = iapl.getslice(*sliceargs)
self.assertEqual(got, expected)
testPL(5, 2, (), [0, 1, 2, 3, 4])
testPL(5, 2, (1,), [1, 2, 3, 4])
testPL(5, 2, (2,), [2, 3, 4])
+10 -3
View File
@@ -135,12 +135,14 @@ from .gametrailers import GametrailersIE
from .gdcvault import GDCVaultIE
from .generic import GenericIE
from .godtube import GodTubeIE
from .golem import GolemIE
from .googleplus import GooglePlusIE
from .googlesearch import GoogleSearchIE
from .gorillavid import GorillaVidIE
from .goshgay import GoshgayIE
from .grooveshark import GroovesharkIE
from .hark import HarkIE
from .heise import HeiseIE
from .helsinki import HelsinkiIE
from .hentaistigma import HentaiStigmaIE
from .hornbunny import HornBunnyIE
@@ -272,6 +274,7 @@ from .parliamentliveuk import ParliamentLiveUKIE
from .patreon import PatreonIE
from .pbs import PBSIE
from .photobucket import PhotobucketIE
from .played import PlayedIE
from .playfm import PlayFMIE
from .playvid import PlayvidIE
from .podomatic import PodomaticIE
@@ -368,7 +371,10 @@ from .thisav import ThisAVIE
from .tinypic import TinyPicIE
from .tlc import TlcIE, TlcDeIE
from .tnaflix import TNAFlixIE
from .thvideo import THVideoIE
from .thvideo import (
THVideoIE,
THVideoPlaylistIE
)
from .toutv import TouTvIE
from .toypics import ToypicsUserIE, ToypicsIE
from .traileraddict import TrailerAddictIE
@@ -409,11 +415,12 @@ from .videoweed import VideoWeedIE
from .vidme import VidmeIE
from .vimeo import (
VimeoIE,
VimeoChannelIE,
VimeoUserIE,
VimeoAlbumIE,
VimeoChannelIE,
VimeoGroupsIE,
VimeoLikesIE,
VimeoReviewIE,
VimeoUserIE,
VimeoWatchLaterIE,
)
from .vimple import VimpleIE
-2
View File
@@ -8,8 +8,6 @@ from ..utils import (
determine_ext,
ExtractorError,
qualities,
compat_urllib_parse_urlparse,
compat_urllib_parse,
int_or_none,
parse_duration,
unified_strdate,
+23
View File
@@ -22,6 +22,7 @@ from ..utils import (
clean_html,
compiled_regex_type,
ExtractorError,
float_or_none,
int_or_none,
RegexNotFoundError,
sanitize_filename,
@@ -720,6 +721,28 @@ class InfoExtractor(object):
now_str = now.strftime("%Y-%m-%d %H:%M")
return name + ' ' + now_str
def _int(self, v, name, fatal=False, **kwargs):
res = int_or_none(v, **kwargs)
if 'get_attr' in kwargs:
print(getattr(v, kwargs['get_attr']))
if res is None:
msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
if fatal:
raise ExtractorError(msg)
else:
self._downloader.report_warning(msg)
return res
def _float(self, v, name, fatal=False, **kwargs):
res = float_or_none(v, **kwargs)
if res is None:
msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
if fatal:
raise ExtractorError(msg)
else:
self._downloader.report_warning(msg)
return res
class SearchInfoExtractor(InfoExtractor):
"""
+4 -6
View File
@@ -397,12 +397,6 @@ class GenericIE(InfoExtractor):
},
]
def report_download_webpage(self, video_id):
"""Report webpage download."""
if not self._downloader.params.get('test', False):
self._downloader.report_warning('Falling back on generic information extractor.')
super(GenericIE, self).report_download_webpage(video_id)
def report_following_redirect(self, new_url):
"""Report information extraction."""
self._downloader.to_screen('[redirect] Following redirect to %s' % new_url)
@@ -502,6 +496,7 @@ class GenericIE(InfoExtractor):
url, smuggled_data = unsmuggle_url(url)
force_videoid = None
is_intentional = smuggled_data and smuggled_data.get('to_generic')
if smuggled_data and 'force_videoid' in smuggled_data:
force_videoid = smuggled_data['force_videoid']
video_id = force_videoid
@@ -544,6 +539,9 @@ class GenericIE(InfoExtractor):
'upload_date': upload_date,
}
if not self._downloader.params.get('test', False) and not is_intentional:
self._downloader.report_warning('Falling back on generic information extractor.')
try:
webpage = self._download_webpage(url, video_id)
except ValueError:
+71
View File
@@ -0,0 +1,71 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
compat_urlparse,
determine_ext,
)
class GolemIE(InfoExtractor):
_VALID_URL = r'^https?://video\.golem\.de/.+?/(?P<id>.+?)/'
_TEST = {
'url': 'http://video.golem.de/handy/14095/iphone-6-und-6-plus-test.html',
'md5': 'c1a2c0a3c863319651c7c992c5ee29bf',
'info_dict': {
'id': '14095',
'format_id': 'high',
'ext': 'mp4',
'title': 'iPhone 6 und 6 Plus - Test',
'duration': 300.44,
'filesize': 65309548,
}
}
_PREFIX = 'http://video.golem.de'
def _real_extract(self, url):
video_id = self._match_id(url)
config = self._download_xml(
'https://video.golem.de/xml/{0}.xml'.format(video_id), video_id)
info = {
'id': video_id,
'title': config.findtext('./title', 'golem'),
'duration': self._float(config.findtext('./playtime'), 'duration'),
}
formats = []
for e in config.findall('./*[url]'):
url = e.findtext('./url')
if not url:
self._downloader.report_warning(
"{0}: url: empty, skipping".format(e.tag))
continue
formats.append({
'format_id': e.tag,
'url': compat_urlparse.urljoin(self._PREFIX, url),
'height': self._int(e.get('height'), 'height'),
'width': self._int(e.get('width'), 'width'),
'filesize': self._int(e.findtext('filesize'), 'filesize'),
'ext': determine_ext(e.findtext('./filename')),
})
self._sort_formats(formats)
info['formats'] = formats
thumbnails = []
for e in config.findall('.//teaser[url]'):
url = e.findtext('./url')
if not url:
continue
thumbnails.append({
'url': compat_urlparse.urljoin(self._PREFIX, url),
'width': self._int(e.get('width'), 'thumbnail width'),
'height': self._int(e.get('height'), 'thumbnail height'),
})
info['thumbnails'] = thumbnails
return info
+81
View File
@@ -0,0 +1,81 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
get_meta_content,
parse_iso8601,
)
class HeiseIE(InfoExtractor):
_VALID_URL = r'''(?x)
https?://(?:www\.)?heise\.de/video/artikel/
.+?(?P<id>[0-9]+)\.html(?:$|[?#])
'''
_TEST = {
'url': (
'http://www.heise.de/video/artikel/Podcast-c-t-uplink-3-3-Owncloud-Tastaturen-Peilsender-Smartphone-2404147.html'
),
'md5': 'ffed432483e922e88545ad9f2f15d30e',
'info_dict': {
'id': '2404147',
'ext': 'mp4',
'title': (
"Podcast: c't uplink 3.3 Owncloud / Tastaturen / Peilsender Smartphone"
),
'format_id': 'mp4_720',
'timestamp': 1411812600,
'upload_date': '20140927',
'description': 'In uplink-Episode 3.3 geht es darum, wie man sich von Cloud-Anbietern emanzipieren kann, worauf man beim Kauf einer Tastatur achten sollte und was Smartphones über uns verraten.',
}
}
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
json_url = self._search_regex(
r'json_url:\s*"([^"]+)"', webpage, 'json URL')
config = self._download_json(json_url, video_id)
info = {
'id': video_id,
'thumbnail': config.get('poster'),
'timestamp': parse_iso8601(get_meta_content('date', webpage)),
'description': self._og_search_description(webpage),
}
title = get_meta_content('fulltitle', webpage)
if title:
info['title'] = title
elif config.get('title'):
info['title'] = config['title']
else:
info['title'] = self._og_search_title(webpage)
formats = []
for t, rs in config['formats'].items():
if not rs or not hasattr(rs, 'items'):
self._downloader.report_warning(
'formats: {0}: no resolutions'.format(t))
continue
for height_str, obj in rs.items():
format_id = '{0}_{1}'.format(t, height_str)
if not obj or not obj.get('url'):
self._downloader.report_warning(
'formats: {0}: no url'.format(format_id))
continue
formats.append({
'url': obj['url'],
'format_id': format_id,
'height': self._int(height_str, 'height'),
})
self._sort_formats(formats)
info['formats'] = formats
return info
+2
View File
@@ -22,6 +22,7 @@ class MuenchenTVIE(InfoExtractor):
'ext': 'mp4',
'title': 're:^münchen.tv-Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'is_live': True,
'thumbnail': 're:^https?://.*\.jpg$'
},
'params': {
'skip_download': True,
@@ -70,5 +71,6 @@ class MuenchenTVIE(InfoExtractor):
'title': title,
'formats': formats,
'is_live': True,
'thumbnail': thumbnail,
}
+101 -60
View File
@@ -6,6 +6,7 @@ import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
compat_urllib_parse,
int_or_none,
remove_end,
)
@@ -13,76 +14,116 @@ from ..utils import (
class NFLIE(InfoExtractor):
IE_NAME = 'nfl.com'
_VALID_URL = r'(?x)https?://(?:www\.)?nfl\.com/(?:videos/(?:.+)/|.*?\#video=)(?P<id>\d..[0-9]+)'
_PLAYER_CONFIG_URL = 'http://www.nfl.com/static/content/static/config/video/config.json'
_TEST = {
'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights',
# 'md5': '5eb8c40a727dda106d510e5d6ffa79e5', # md5 checksum fluctuates
'info_dict': {
'id': '0ap3000000398478',
'ext': 'mp4',
'title': 'Week 3: Washington Redskins vs. Philadelphia Eagles highlights',
'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478',
'upload_date': '20140921',
'timestamp': 1411337580,
'thumbnail': 're:^https?://.*\.jpg$',
_VALID_URL = r'''(?x)https?://
(?P<host>(?:www\.)?(?:nfl\.com|.*?\.clubs\.nfl\.com))/
(?:.+?/)*
(?P<id>(?:\d[a-z]{2}\d{13}|\w{8}\-(?:\w{4}\-){3}\w{12}))'''
_TESTS = [
{
'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights',
'md5': '394ef771ddcd1354f665b471d78ec4c6',
'info_dict': {
'id': '0ap3000000398478',
'ext': 'mp4',
'title': 'Week 3: Redskins vs. Eagles highlights',
'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478',
'upload_date': '20140921',
'timestamp': 1411337580,
'thumbnail': 're:^https?://.*\.jpg$',
}
},
{
'url': 'http://prod.www.steelers.clubs.nfl.com/video-and-audio/videos/LIVE_Post_Game_vs_Browns/9d72f26a-9e2b-4718-84d3-09fb4046c266',
'md5': 'cf85bdb4bc49f6e9d3816d130c78279c',
'info_dict': {
'id': '9d72f26a-9e2b-4718-84d3-09fb4046c266',
'ext': 'mp4',
'title': 'LIVE: Post Game vs. Browns',
'description': 'md5:6a97f7e5ebeb4c0e69a418a89e0636e8',
'upload_date': '20131229',
'timestamp': 1388354455,
'thumbnail': 're:^https?://.*\.jpg$',
}
}
]
@staticmethod
def prepend_host(host, url):
if not url.startswith('http'):
if not url.startswith('/'):
url = '/%s' % url
url = 'http://{0:}{1:}'.format(host, url)
return url
@staticmethod
def format_from_stream(stream, protocol, host, path_prefix='',
preference=0, note=None):
url = '{protocol:}://{host:}/{prefix:}{path:}'.format(
protocol=protocol,
host=host,
prefix=path_prefix,
path=stream.get('path'),
)
return {
'url': url,
'vbr': int_or_none(stream.get('rate', 0), 1000),
'preference': preference,
'format_note': note,
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
video_id, host = mobj.group('id'), mobj.group('host')
config = self._download_json(self._PLAYER_CONFIG_URL, video_id,
webpage = self._download_webpage(url, video_id)
config_url = NFLIE.prepend_host(host, self._search_regex(
r'(?:config|configURL)\s*:\s*"([^"]+)"', webpage, 'config URL'))
config = self._download_json(config_url, video_id,
note='Downloading player config')
url_template = 'http://nfl.com{contentURLTemplate:s}'.format(**config)
video_data = self._download_json(url_template.format(id=video_id), video_id)
cdns = config.get('cdns')
if not cdns:
raise ExtractorError('Failed to get CDN data', expected=True)
url_template = NFLIE.prepend_host(
host, '{contentURLTemplate:}'.format(**config))
video_data = self._download_json(
url_template.format(id=video_id), video_id)
formats = []
streams = video_data.get('cdnData', {}).get('bitrateInfo', [])
for name, cdn in cdns.items():
# LimeLight streams don't seem to work
if cdn.get('name') == 'LIMELIGHT':
continue
protocol = cdn.get('protocol')
host = remove_end(cdn.get('host', ''), '/')
if not (protocol and host):
continue
path_prefix = cdn.get('pathprefix', '')
if path_prefix and not path_prefix.endswith('/'):
path_prefix = '%s/' % path_prefix
get_url = lambda p: '{protocol:s}://{host:s}/{prefix:s}{path:}'.format(
protocol=protocol,
host=host,
prefix=path_prefix,
path=p,
)
if protocol == 'rtmp':
preference = -2
elif 'prog' in name.lower():
preference = -1
else:
preference = 0
cdn_data = video_data.get('cdnData', {})
streams = cdn_data.get('bitrateInfo', [])
if cdn_data.get('format') == 'EXTERNAL_HTTP_STREAM':
parts = compat_urllib_parse.urlparse(cdn_data.get('uri'))
protocol, host = parts.scheme, parts.netloc
for stream in streams:
path = stream.get('path')
if not path:
formats.append(
NFLIE.format_from_stream(stream, protocol, host))
else:
cdns = config.get('cdns')
if not cdns:
raise ExtractorError('Failed to get CDN data', expected=True)
for name, cdn in cdns.items():
# LimeLight streams don't seem to work
if cdn.get('name') == 'LIMELIGHT':
continue
formats.append({
'url': get_url(path),
'vbr': int_or_none(stream.get('rate', 0), 1000),
'preference': preference,
'format_note': name,
})
protocol = cdn.get('protocol')
host = remove_end(cdn.get('host', ''), '/')
if not (protocol and host):
continue
prefix = cdn.get('pathprefix', '')
if prefix and not prefix.endswith('/'):
prefix = '%s/' % prefix
preference = 0
if protocol == 'rtmp':
preference = -2
elif 'prog' in name.lower():
preference = 1
for stream in streams:
formats.append(
NFLIE.format_from_stream(stream, protocol, host,
prefix, preference, name))
self._sort_formats(formats)
@@ -94,7 +135,7 @@ class NFLIE(InfoExtractor):
return {
'id': video_id,
'title': video_data.get('storyHeadline'),
'title': video_data.get('headline'),
'formats': formats,
'description': video_data.get('caption'),
'duration': video_data.get('duration'),
+31 -8
View File
@@ -4,6 +4,7 @@ import re
from .common import InfoExtractor
from ..utils import (
unified_strdate,
US_RATINGS,
)
@@ -11,10 +12,10 @@ from ..utils import (
class PBSIE(InfoExtractor):
_VALID_URL = r'''(?x)https?://
(?:
# Direct video URL
video\.pbs\.org/(?:viralplayer|video)/(?P<id>[0-9]+)/? |
# Article with embedded player
(?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+)/?(?:$|[?\#]) |
# Direct video URL
video\.pbs\.org/(?:viralplayer|video)/(?P<id>[0-9]+)/? |
# Article with embedded player (or direct video)
(?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) |
# Player
video\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/
)
@@ -65,10 +66,25 @@ class PBSIE(InfoExtractor):
'duration': 6559,
'thumbnail': 're:^https?://.*\.jpg$',
}
},
{
'url': 'http://www.pbs.org/wgbh/nova/earth/killer-typhoon.html',
'md5': '908f3e5473a693b266b84e25e1cf9703',
'info_dict': {
'id': '2365160389',
'display_id': 'killer-typhoon',
'ext': 'mp4',
'description': 'md5:c741d14e979fc53228c575894094f157',
'title': 'Killer Typhoon',
'duration': 3172,
'thumbnail': 're:^https?://.*\.jpg$',
'upload_date': '20140122',
}
}
]
def _extract_ids(self, url):
def _extract_webpage(self, url):
mobj = re.match(self._VALID_URL, url)
presumptive_id = mobj.group('presumptive_id')
@@ -76,15 +92,20 @@ class PBSIE(InfoExtractor):
if presumptive_id:
webpage = self._download_webpage(url, display_id)
upload_date = unified_strdate(self._search_regex(
r'<input type="hidden" id="air_date_[0-9]+" value="([^"]+)"',
webpage, 'upload date', default=None))
MEDIA_ID_REGEXES = [
r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'", # frontline video embed
r'class="coveplayerid">([^<]+)<', # coveplayer
r'<input type="hidden" id="pbs_video_id_[0-9]+" value="([0-9]+)"/>', # jwplayer
]
media_id = self._search_regex(
MEDIA_ID_REGEXES, webpage, 'media ID', fatal=False, default=None)
if media_id:
return media_id, presumptive_id
return media_id, presumptive_id, upload_date
url = self._search_regex(
r'<iframe\s+(?:class|id)=["\']partnerPlayer["\'].*?\s+src=["\'](.*?)["\']>',
@@ -104,10 +125,10 @@ class PBSIE(InfoExtractor):
video_id = mobj.group('id')
display_id = video_id
return video_id, display_id
return video_id, display_id, None
def _real_extract(self, url):
video_id, display_id = self._extract_ids(url)
video_id, display_id, upload_date = self._extract_webpage(url)
info_url = 'http://video.pbs.org/videoInfo/%s?format=json' % video_id
info = self._download_json(info_url, display_id)
@@ -119,6 +140,7 @@ class PBSIE(InfoExtractor):
return {
'id': video_id,
'display_id': display_id,
'title': info['title'],
'url': info['alternate_encoding']['url'],
'ext': 'mp4',
@@ -126,4 +148,5 @@ class PBSIE(InfoExtractor):
'thumbnail': info.get('image_url'),
'duration': info.get('duration'),
'age_limit': age_limit,
'upload_date': upload_date,
}
+55
View File
@@ -0,0 +1,55 @@
# coding: utf-8
from __future__ import unicode_literals
import re
import os.path
from .common import InfoExtractor
from ..utils import (
compat_urllib_parse,
compat_urllib_request,
)
class PlayedIE(InfoExtractor):
IE_NAME = 'played.to'
_VALID_URL = r'https?://(?:www\.)?played\.to/(?P<id>[a-zA-Z0-9_-]+)'
_TEST = {
'url': 'http://played.to/j2f2sfiiukgt',
'md5': 'c2bd75a368e82980e7257bf500c00637',
'info_dict': {
'id': 'j2f2sfiiukgt',
'ext': 'flv',
'title': 'youtube-dl_test_video.mp4',
},
}
def _real_extract(self, url):
video_id = self._match_id(url)
orig_webpage = self._download_webpage(url, video_id)
fields = re.findall(
r'type="hidden" name="([^"]+)"\s+value="([^"]+)">', orig_webpage)
data = dict(fields)
self._sleep(2, video_id)
post = compat_urllib_parse.urlencode(data)
headers = {
b'Content-Type': b'application/x-www-form-urlencoded',
}
req = compat_urllib_request.Request(url, post, headers)
webpage = self._download_webpage(
req, video_id, note='Downloading video page ...')
title = os.path.splitext(data['fname'])[0]
video_url = self._search_regex(
r'file: "?(.+?)",', webpage, 'video URL')
return {
'id': video_id,
'title': title,
'url': video_url,
}
+27 -2
View File
@@ -26,8 +26,7 @@ class THVideoIE(InfoExtractor):
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
video_id = self._match_id(url)
# extract download link from mobile player page
webpage_player = self._download_webpage(
@@ -57,3 +56,29 @@ class THVideoIE(InfoExtractor):
'description': description,
'upload_date': upload_date
}
class THVideoPlaylistIE(InfoExtractor):
_VALID_URL = r'http?://(?:www\.)?thvideo\.tv/mylist(?P<id>[0-9]+)'
_TEST = {
'url': 'http://thvideo.tv/mylist2',
'info_dict': {
'id': '2',
'title': '幻想万華鏡',
},
'playlist_mincount': 23,
}
def _real_extract(self, url):
playlist_id = self._match_id(url)
webpage = self._download_webpage(url, playlist_id)
list_title = self._html_search_regex(
r'<h1 class="show_title">(.*?)<b id', webpage, 'playlist title',
fatal=False)
entries = [
self.url_result('http://thvideo.tv/v/th' + id, 'THVideo')
for id in re.findall(r'<dd><a href="http://thvideo.tv/v/th(\d+)/" target=', webpage)]
return self.playlist_result(entries, playlist_id, list_title)
-1
View File
@@ -5,7 +5,6 @@ import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
compat_HTTPError,
compat_urllib_request,
ExtractorError,
)
+60 -3
View File
@@ -8,17 +8,19 @@ import itertools
from .common import InfoExtractor
from .subtitles import SubtitlesInfoExtractor
from ..utils import (
clean_html,
compat_HTTPError,
compat_urllib_parse,
compat_urllib_request,
clean_html,
get_element_by_attribute,
compat_urlparse,
ExtractorError,
get_element_by_attribute,
InAdvancePagedList,
int_or_none,
RegexNotFoundError,
std_headers,
unsmuggle_url,
urlencode_postdata,
int_or_none,
)
@@ -529,3 +531,58 @@ class VimeoWatchLaterIE(VimeoBaseInfoExtractor, VimeoChannelIE):
def _real_extract(self, url):
return self._extract_videos('watchlater', 'https://vimeo.com/home/watchlater')
class VimeoLikesIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?vimeo\.com/user(?P<id>[0-9]+)/likes/?(?:$|[?#]|sort:)'
IE_NAME = 'vimeo:likes'
IE_DESC = 'Vimeo user likes'
_TEST = {
'url': 'https://vimeo.com/user755559/likes/',
'playlist_mincount': 293,
"info_dict": {
"description": "See all the videos urza likes",
"title": 'Videos urza likes',
},
}
def _real_extract(self, url):
user_id = self._match_id(url)
webpage = self._download_webpage(url, user_id)
page_count = self._int(
self._search_regex(
r'''(?x)<li><a\s+href="[^"]+"\s+data-page="([0-9]+)">
.*?</a></li>\s*<li\s+class="pagination_next">
''', webpage, 'page count'),
'page count', fatal=True)
PAGE_SIZE = 12
title = self._html_search_regex(
r'(?s)<h1>(.+?)</h1>', webpage, 'title', fatal=False)
description = self._html_search_meta('description', webpage)
def _get_page(idx):
page_url = '%s//vimeo.com/user%s/likes/page:%d/sort:date' % (
self.http_scheme(), user_id, idx + 1)
webpage = self._download_webpage(
page_url, user_id,
note='Downloading page %d/%d' % (idx + 1, page_count))
video_list = self._search_regex(
r'(?s)<ol class="js-browse_list[^"]+"[^>]*>(.*?)</ol>',
webpage, 'video content')
paths = re.findall(
r'<li[^>]*>\s*<a\s+href="([^"]+)"', video_list)
for path in paths:
yield {
'_type': 'url',
'url': compat_urlparse.urljoin(page_url, path),
}
pl = InAdvancePagedList(_get_page, page_count, PAGE_SIZE)
return {
'_type': 'playlist',
'id': 'user%s_likes' % user_id,
'title': title,
'description': description,
'entries': pl,
}
-1
View File
@@ -2,7 +2,6 @@
from __future__ import unicode_literals
import json
import math
import random
import re
+12 -2
View File
@@ -26,7 +26,7 @@ from ..utils import (
get_element_by_attribute,
ExtractorError,
int_or_none,
PagedList,
OnDemandPagedList,
unescapeHTML,
unified_strdate,
orderedSet,
@@ -655,6 +655,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
# Get video webpage
url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
pref_cookies = [
c for c in self._downloader.cookiejar
if c.domain == '.youtube.com' and c.name == 'PREF']
for pc in pref_cookies:
if 'hl=' in pc.value:
pc.value = re.sub(r'hl=[^&]+', 'hl=en', pc.value)
else:
if pc.value:
pc.value += '&'
pc.value += 'hl=en'
video_webpage = self._download_webpage(url, video_id)
# Attempt to extract SWF player URL
@@ -1341,7 +1351,7 @@ class YoutubeUserIE(InfoExtractor):
'id': video_id,
'title': title,
}
url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)
url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)
return self.playlist_result(url_results, playlist_title=username)
+1 -1
View File
@@ -87,7 +87,7 @@ def parseOpts(overrideArguments=None):
for private_opt in ['-p', '--password', '-u', '--username', '--video-password']:
try:
i = opts.index(private_opt)
opts[i+1] = '<PRIVATE>'
opts[i+1] = 'PRIVATE'
except ValueError:
pass
return opts
+35 -4
View File
@@ -1384,14 +1384,16 @@ def check_executable(exe, args=[]):
class PagedList(object):
def __init__(self, pagefunc, pagesize):
self._pagefunc = pagefunc
self._pagesize = pagesize
def __len__(self):
# This is only useful for tests
return len(self.getslice())
class OnDemandPagedList(PagedList):
def __init__(self, pagefunc, pagesize):
self._pagefunc = pagefunc
self._pagesize = pagesize
def getslice(self, start=0, end=None):
res = []
for pagenum in itertools.count(start // self._pagesize):
@@ -1430,6 +1432,35 @@ class PagedList(object):
return res
class InAdvancePagedList(PagedList):
def __init__(self, pagefunc, pagecount, pagesize):
self._pagefunc = pagefunc
self._pagecount = pagecount
self._pagesize = pagesize
def getslice(self, start=0, end=None):
res = []
start_page = start // self._pagesize
end_page = (
self._pagecount if end is None else (end // self._pagesize + 1))
skip_elems = start - start_page * self._pagesize
only_more = None if end is None else end - start
for pagenum in range(start_page, end_page):
page = list(self._pagefunc(pagenum))
if skip_elems:
page = page[skip_elems:]
skip_elems = None
if only_more is not None:
if len(page) < only_more:
only_more -= len(page)
else:
page = page[:only_more]
res.extend(page)
break
res.extend(page)
return res
def uppercase_escape(s):
unicode_escape = codecs.getdecoder('unicode_escape')
return re.sub(
+1 -1
View File
@@ -1,2 +1,2 @@
__version__ = '2014.09.28'
__version__ = '2014.09.29.2'