From 001d5b8395b5b1eccfb829867c2e36e2ee7241b8 Mon Sep 17 00:00:00 2001
From: dirkf
Date: Thu, 17 Nov 2022 19:23:43 +0000
Subject: [PATCH] Fix extraction

---
 youtube_dl/extractor/voe.py | 37 +++++++++++++++++++++++++++----------
 1 file changed, 27 insertions(+), 10 deletions(-)

diff --git a/youtube_dl/extractor/voe.py b/youtube_dl/extractor/voe.py
index 4f24e50bb..c4512154d 100644
--- a/youtube_dl/extractor/voe.py
+++ b/youtube_dl/extractor/voe.py
@@ -2,7 +2,11 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-
+from ..utils import (
+    int_or_none,
+    js_to_json,
+    url_or_none,
+)
 
 class VOEIE(InfoExtractor):
     IE_NAME = 'voe'
@@ -12,7 +16,7 @@ class VOEIE(InfoExtractor):
         'url': 'https://voe.sx/e/ng7ja5n5n2y8',
         'info_dict': {
             'id': 'ng7ja5n5n2y8',
-            'title': 'md5:05ab15eb43a32f0f5918755156c5fb34',
+            'title': 'md5:a86687fb962742f04652aee19ad34e06',
             'thumbnail': r're:^https?://.*\.jpg$',
             'ext': 'm3u8',
         },
@@ -24,21 +28,34 @@ class VOEIE(InfoExtractor):
         webpage = self._download_webpage(
             'https://voe.sx/e/%s' % video_id, video_id)
 
-        m3u8 = self._search_regex(
-            r'(https.+m3u8)',
-            webpage, 'm3u8')
+        sources = self._parse_json(
+            self._search_regex(r'\bsources\s*=\s*(\{[^}]+\})', webpage, 'sources'),
+            video_id, transform_source=js_to_json)
 
         title = self._search_regex(
-            r'Watch (?P<title>.+)<\/title>',
+            r'<title>(?:Watch\s+)?(?P<title>.+?)(?:-\s+VOE\s+\|.+)?</title>',
             webpage, 'title', group='title')
 
-        thumbnail = self._search_regex(
-            r'VOEPlayer.poster="(?P<thumbnail>https.+)"',
-            webpage, 'thumbnail', group='thumbnail')
+        formats = []
+
+        f_url = url_or_none(sources.get('hls'))
+        if f_url:
+            formats.extend(self._extract_m3u8_formats(
+                f_url, video_id, entry_protocol='m3u8_native', fatal=False))
+        f_url = url_or_none(sources.get('mp4'))
+        if f_url:
+            formats.append({
+                'url': f_url,
+                'ext': 'mp4',
+                'height': int_or_none(sources.get('video_height')),
+            })
 
-        formats = self._extract_m3u8_formats(m3u8, video_id)
         self._sort_formats(formats)
 
+        thumbnail = url_or_none(self._search_regex(
+            r'(?:VOEPlayer.|data-)poster\s*=\s*(["\'])(?P<thumbnail>(?:(?!\1)\S)+)\1',
+            webpage, 'thumbnail', group='thumbnail', default=None))
+
         return {
             'id': video_id,
             'title': title,