From cd3a3ff93bd5d6866d3822cb438b0e172ffe4e39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 28 Jul 2018 22:09:53 +0700 Subject: [PATCH] [ted] Improve extraction and update tests --- youtube_dl/extractor/ted.py | 111 ++++++++++++++++++++---------------- 1 file changed, 61 insertions(+), 50 deletions(-) diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index dc9c5ce8e..212ac80ab 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -7,8 +7,10 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( + float_or_none, int_or_none, try_get, + url_or_none, ) @@ -30,7 +32,7 @@ class TEDIE(InfoExtractor): ''' _TESTS = [{ 'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html', - 'md5': '0de43ac406aa3e4ea74b66c9c7789b13', + 'md5': 'b0ce2b05ca215042124fbc9e3886493a', 'info_dict': { 'id': '102', 'ext': 'mp4', @@ -42,24 +44,30 @@ class TEDIE(InfoExtractor): 'uploader': 'Dan Dennett', 'width': 853, 'duration': 1308, - } - }, { - 'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms', - 'md5': 'b899ac15e345fb39534d913f7606082b', - 'info_dict': { - 'id': 'tSVI8ta_P4w', - 'ext': 'mp4', - 'title': 'Vishal Sikka: The beauty and power of algorithms', - 'thumbnail': r're:^https?://.+\.jpg', - 'description': 'md5:6261fdfe3e02f4f579cbbfc00aff73f4', - 'upload_date': '20140122', - 'uploader_id': 'TEDInstitute', - 'uploader': 'TED Institute', + 'view_count': int, + 'comment_count': int, + 'tags': list, + }, + 'params': { + 'skip_download': True, + }, + }, { + # missing HTTP bitrates + 'url': 'https://www.ted.com/talks/vishal_sikka_the_beauty_and_power_of_algorithms', + 'info_dict': { + 'id': '6069', + 'ext': 'mp4', + 'title': 'The beauty and power of algorithms', + 'thumbnail': r're:^https?://.+\.jpg', + 'description': 'md5:734e352710fb00d840ab87ae31aaf688', + 'uploader': 'Vishal Sikka', + }, + 'params': { + 'skip_download': True, }, - 'add_ie': ['Youtube'], }, { 'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best', - 'md5': '71b3ab2f4233012dce09d515c9c39ce2', + 'md5': 'e6b9617c01a7970ceac8bb2c92c346c0', 'info_dict': { 'id': '1972', 'ext': 'mp4', @@ -68,6 +76,9 @@ class TEDIE(InfoExtractor): 'description': 'md5:5174aed4d0f16021b704120360f72b92', 'duration': 1128, }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://www.ted.com/playlists/who_are_the_hackers', 'info_dict': { @@ -91,22 +102,6 @@ class TEDIE(InfoExtractor): 'params': { 'skip_download': True, }, - }, { - # YouTube video - 'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond', - 'add_ie': ['Youtube'], - 'info_dict': { - 'id': 'aFBIPO-P7LM', - 'ext': 'mp4', - 'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville', - 'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1', - 'uploader': 'TEDx Talks', - 'uploader_id': 'TEDxTalks', - 'upload_date': '20111216', - }, - 'params': { - 'skip_download': True, - }, }, { # no nativeDownloads 'url': 'https://www.ted.com/talks/tom_thum_the_orchestra_in_my_mouth', @@ -116,6 +111,9 @@ class TEDIE(InfoExtractor): 'title': 'The orchestra in my mouth', 'description': 'md5:5d1d78650e2f8dfcbb8ebee2951ac29a', 'uploader': 'Tom Thum', + 'view_count': int, + 'comment_count': int, + 'tags': list, }, 'params': { 'skip_download': True, @@ -174,24 +172,11 @@ class TEDIE(InfoExtractor): info = self._extract_info(webpage) - talk_info = try_get( - info, lambda x: x['__INITIAL_DATA__']['talks'][0], - dict) or info['talks'][0] + data = try_get(info, lambda x: x['__INITIAL_DATA__'], dict) or info + talk_info = data['talks'][0] title = talk_info['title'].strip() - external = talk_info.get('external') - if external: - service = external['service'] - self.to_screen('Found video from %s' % service) - ext_url = None - if service.lower() == 'youtube': - ext_url = external.get('code') - return { - '_type': 'url', - 'url': ext_url or external['uri'], - } - native_downloads = try_get( talk_info, (lambda x: x['downloads']['nativeDownloads'], @@ -211,10 +196,24 @@ class TEDIE(InfoExtractor): player_talk = talk_info['player_talks'][0] + external = player_talk.get('external') + if isinstance(external, dict): + service = external.get('service') + if isinstance(service, compat_str): + ext_url = None + if service.lower() == 'youtube': + ext_url = external.get('code') + return { + '_type': 'url', + 'url': ext_url or external['uri'], + } + resources_ = player_talk.get('resources') or talk_info.get('resources') http_url = None for format_id, resources in resources_.items(): + if not isinstance(resources, dict): + continue if format_id == 'h264': for resource in resources: h264_url = resource.get('file') @@ -243,8 +242,12 @@ class TEDIE(InfoExtractor): 'tbr': int_or_none(resource.get('bitrate')), }) elif format_id == 'hls': + stream_url = url_or_none(resources.get('stream')) + if not stream_url: + continue formats.extend(self._extract_m3u8_formats( - resources.get('stream'), video_name, 'mp4', m3u8_id=format_id, fatal=False)) + stream_url, video_name, 'mp4', m3u8_id=format_id, + fatal=False)) m3u8_formats = list(filter( lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none', @@ -254,9 +257,13 @@ class TEDIE(InfoExtractor): bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None) if not bitrate: continue + bitrate_url = re.sub(r'\d+k', bitrate, http_url) + if not self._is_valid_url( + bitrate_url, video_name, '%s bitrate' % bitrate): + continue f = m3u8_format.copy() f.update({ - 'url': re.sub(r'\d+k', bitrate, http_url), + 'url': bitrate_url, 'format_id': m3u8_format['format_id'].replace('hls', 'http'), 'protocol': 'http', }) @@ -282,7 +289,11 @@ class TEDIE(InfoExtractor): 'description': self._og_search_description(webpage), 'subtitles': self._get_subtitles(video_id, talk_info), 'formats': formats, - 'duration': talk_info.get('duration'), + 'duration': float_or_none(talk_info.get('duration')), + 'view_count': int_or_none(data.get('viewed_count')), + 'comment_count': int_or_none( + try_get(data, lambda x: x['comments']['count'])), + 'tags': try_get(talk_info, lambda x: x['tags'], list), } def _get_subtitles(self, video_id, talk_info):