From de4144a4aedd6ab9f24ffa1a777bce99e019468e Mon Sep 17 00:00:00 2001
From: lkho
Date: Sat, 29 Aug 2020 15:04:16 +0800
Subject: [PATCH] [duboku] add playlist extractor

Add DubokuPlaylistIE, which handles duboku.co "voddetail" series pages and
returns every episode link of one playlist as a url result. The playlist
is chosen by the URL fragment (e.g. #playlist1); without a fragment the
first playlist found on the page is used.
---
 youtube_dl/extractor/duboku.py     | 100 +++++++++++++++++++++++++++++
 youtube_dl/extractor/extractors.py |   5 +-
 2 files changed, 104 insertions(+), 1 deletion(-)

diff --git a/youtube_dl/extractor/duboku.py b/youtube_dl/extractor/duboku.py
index 3e4cf8d5b..4db81a665 100644
--- a/youtube_dl/extractor/duboku.py
+++ b/youtube_dl/extractor/duboku.py
@@ -4,10 +4,56 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..compat import compat_urlparse
 from ..utils import *
 
 
+def _get_elements_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
+    """Return regex match objects for the HTML elements matching tag/attribute/value
+
+    Every match exposes the named groups 'tag' and 'content'; the groups
+    'attribute' and 'value' exist only when the respective argument is given.
+    tag defaults to any tag name and is used as a regex fragment. value is
+    matched literally unless escape_value is False, then it is used as a regex.
+    """
+
+    if tag is None:
+        tag = '[a-zA-Z0-9:._-]+'
+    if attribute is None:
+        attribute = ''
+    else:
+        attribute = r'\s+(?P<attribute>%s)' % re.escape(attribute)
+    if value is None:
+        value = ''
+    else:
+        value = re.escape(value) if escape_value else value
+        value = '=[\'"]?(?P<value>%s)[\'"]?' % value
+
+    retlist = []
+    for m in re.finditer(r'''(?xs)
+        <(?P<tag>%s)
+         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
+         %s%s
+         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
+        \s*>
+        (?P<content>.*?)
+        </(?P=tag)>
+    ''' % (tag, attribute, value), html):
+        retlist.append(m)
+
+    return retlist
+
+
+def _get_element_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
+    """Return the first match of _get_elements_by_tag_and_attrib, or None if there is none"""
+    retval = _get_elements_by_tag_and_attrib(html, tag, attribute, value, escape_value)
+    return retval[0] if retval else None
+
+
 class DubokuIE(InfoExtractor):
+    IE_NAME = 'duboku'
+    IE_DESC = 'www.duboku.co'
+
     _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/vodplay/)(?P<id>[0-9\-]+)\.html.*'
     _TESTS = [{
         'url': 'https://www.duboku.co/vodplay/1575-1-1.html',
@@ -90,3 +136,57 @@ class DubokuIE(InfoExtractor):
             'episode_id': episode_id,
             'formats': formats,
         }
+
+
+class DubokuPlaylistIE(InfoExtractor):
+    IE_NAME = 'duboku:list'
+    IE_DESC = 'www.duboku.co entire series'
+
+    _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/voddetail/)(?P<id>[0-9]+)\.html.*'
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        if mobj is None:
+            raise ExtractorError('Invalid URL: %s' % url)
+        series_id = mobj.group('id')
+        fragment = compat_urlparse.urlparse(url).fragment
+
+        webpage_url = 'https://www.duboku.co/voddetail/%s.html' % series_id
+        webpage_html = self._download_webpage(webpage_url, series_id)
+
+        # extract title: <h1 class="title">, then meta keywords, then <title>
+
+        title = _get_element_by_tag_and_attrib(webpage_html, 'h1', 'class', 'title')
+        title = unescapeHTML(title.group('content')) if title else None
+        if not title:
+            title = self._html_search_meta('keywords', webpage_html)
+        if not title:
+            title = _get_element_by_tag_and_attrib(webpage_html, 'title')
+            title = unescapeHTML(title.group('content')) if title else None
+
+        # extract playlists: element id (playlist<N>) -> list of episode links
+
+        playlists = {}
+        for div in _get_elements_by_tag_and_attrib(
+                webpage_html, attribute='id', value='playlist\\d+', escape_value=False):
+            playlist_id = div.group('value')
+            playlist = []
+            for a in _get_elements_by_tag_and_attrib(
+                    div.group('content'), 'a', 'href', value='[^\'"]+?', escape_value=False):
+                playlist.append({
+                    'href': unescapeHTML(a.group('value')),
+                    'title': unescapeHTML(a.group('content')),
+                })
+            playlists[playlist_id] = playlist
+
+        # select the playlist named by the url fragment, else the first one found
+        playlist = playlists.get(fragment) if fragment else next(iter(playlists.values()), None)
+        if not playlist:
+            raise ExtractorError(
+                ('Cannot find %s' % fragment) if fragment else 'Cannot extract playlist')
+
+        # return url results; hrefs are resolved against the series page url
+        return self.playlist_result([
+            self.url_result(
+                compat_urlparse.urljoin(webpage_url, x['href']), video_title=x.get('title'))
+            for x in playlist], series_id, title)
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index e6c008b6f..407701717 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -282,7 +282,10 @@ from .drtv import (
 )
 from .dtube import DTubeIE
 from .dvtv import DVTVIE
-from .duboku import DubokuIE
+from .duboku import (
+    DubokuIE,
+    DubokuPlaylistIE,
+)
 from .dumpert import DumpertIE
 from .defense import DefenseGouvFrIE
 from .discovery import DiscoveryIE