1
0
mirror of https://github.com/ytdl-org/youtube-dl.git synced 2025-01-09 19:18:47 +00:00

Possibly working version without storyboards (permanently?) and sttls (for now)

This commit is contained in:
dirkf 2022-11-14 16:00:04 +00:00 committed by GitHub
parent a98ff43ac2
commit 6468249594
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -1,14 +1,20 @@
import re # coding: utf-8
from __future__ import unicode_literals
import calendar import calendar
import json
import functools
from datetime import datetime from datetime import datetime
import functools
import json
import itertools
from random import random from random import random
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import (
compat_map as map,
compat_parse_qs as parse_qs,
compat_str,
compat_urllib_parse_urlparse, compat_urllib_parse_urlparse,
compat_urlparse
) )
from ..utils import ( from ..utils import (
@ -16,12 +22,46 @@ from ..utils import (
ExtractorError, ExtractorError,
get_first, get_first,
int_or_none, int_or_none,
merge_dicts,
OnDemandPagedList, OnDemandPagedList,
parse_qs, orderedSet,
srt_subtitles_timecode, srt_subtitles_timecode,
traverse_obj, traverse_obj,
try_get,
update_url_query,
) )
import inspect
if len(try_get(InfoExtractor.report_warning,
(lambda x: inspect.getfullargspec(x).FullArgs,
lambda x: inspect.getargspec(x).Args, ), list) or []) <= 2:
BaseInfoExtractor = InfoExtractor
class InfoExtractor(BaseInfoExtractor):
def report_warning(self, warning, only_once=True, _memo=set()):
from hashlib import md5
if only_once:
w_hash = md5(warning).hexdigest()
if w_hash in _memo:
return
_memo.add(w_hash)
super(InfoExtractor, self).report_warning(self, warning)
@classmethod
def _match_valid_url(cls, url):
return re.match(cls._VALID_URL, url)
@staticmethod
def _merge_subtitle_items(subtitle_list1, subtitle_list2):
""" Merge subtitle items for one language. Items with duplicated URLs/data
will be dropped. """
list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
ret = list(subtitle_list1)
ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
return ret
class PanoptoBaseIE(InfoExtractor): class PanoptoBaseIE(InfoExtractor):
BASE_URL_RE = r'(?P<base_url>https?://[\w.-]+\.panopto.(?:com|eu)/Panopto)' BASE_URL_RE = r'(?P<base_url>https?://[\w.-]+\.panopto.(?:com|eu)/Panopto)'
@ -62,7 +102,7 @@ class PanoptoBaseIE(InfoExtractor):
if error_code == 2: if error_code == 2:
self.raise_login_required(method='cookies') self.raise_login_required(method='cookies')
elif error_code is not None: elif error_code is not None:
msg = f'Panopto said: {response.get("ErrorMessage")}' msg = '%s said: %s' % (self.IE_NAME, response.get('ErrorMessage') or '[no message]')
if fatal: if fatal:
raise ExtractorError(msg, video_id=video_id, expected=True) raise ExtractorError(msg, video_id=video_id, expected=True)
else: else:
@ -71,17 +111,14 @@ class PanoptoBaseIE(InfoExtractor):
@staticmethod @staticmethod
def _parse_fragment(url): def _parse_fragment(url):
return {k: json.loads(v[0]) for k, v in compat_urlparse.parse_qs(compat_urllib_parse_urlparse(url).fragment).items()} return dict((k, json.loads(v[0])) for k, v in parse_qs(compat_urllib_parse_urlparse(url).fragment).items())
@staticmethod
def _extract_urls(webpage):
return [m.group('url') for m in re.finditer(
r'<iframe[^>]+src=["\'](?P<url>%s/Pages/(Viewer|Embed|Sessions/List)\.aspx[^"\']+)' % PanoptoIE.BASE_URL_RE,
webpage)]
class PanoptoIE(PanoptoBaseIE): class PanoptoIE(PanoptoBaseIE):
_VALID_URL = PanoptoBaseIE.BASE_URL_RE + r'/Pages/(Viewer|Embed)\.aspx.*(?:\?|&)id=(?P<id>[a-f0-9-]+)' _VALID_URL = PanoptoBaseIE.BASE_URL_RE + r'/Pages/(Viewer|Embed)\.aspx.*(?:\?|&)id=(?P<id>[a-f0-9-]+)'
__EMBED_REGEX = [
r'''<iframe[^>]+src=["\'](?P<url>%s/Pages/(Viewer|Embed|Sessions/List)\.aspx[^"']+)'''
% (PanoptoBaseIE.BASE_URL_RE, )]
_TESTS = [ _TESTS = [
{ {
'url': 'https://demo.hosted.panopto.com/Panopto/Pages/Viewer.aspx?id=26b3ae9e-4a48-4dcc-96ba-0befba08a0fb', 'url': 'https://demo.hosted.panopto.com/Panopto/Pages/Viewer.aspx?id=26b3ae9e-4a48-4dcc-96ba-0befba08a0fb',
@ -184,7 +221,8 @@ class PanoptoIE(PanoptoBaseIE):
'chapters': 'count:28', 'chapters': 'count:28',
'thumbnail': r're:https://demo\.hosted\.panopto\.com/.+', 'thumbnail': r're:https://demo\.hosted\.panopto\.com/.+',
}, },
'params': {'format': 'mhtml', 'skip_download': True} 'params': {'format': 'mhtml', 'skip_download': True},
'skip': 'Not yet implemented',
}, },
{ {
'url': 'https://na-training-1.hosted.panopto.com/Panopto/Pages/Viewer.aspx?id=8285224a-9a2b-4957-84f2-acb0000c4ea9', 'url': 'https://na-training-1.hosted.panopto.com/Panopto/Pages/Viewer.aspx?id=8285224a-9a2b-4957-84f2-acb0000c4ea9',
@ -203,8 +241,8 @@ class PanoptoIE(PanoptoBaseIE):
'uploader': 'Cait M.', 'uploader': 'Cait M.',
'upload_date': '20210306', 'upload_date': '20210306',
'cast': ['Cait M.'], 'cast': ['Cait M.'],
'subtitles': {'en-US': [{'ext': 'srt', 'data': 'md5:a3f4d25963fdeace838f327097c13265'}], # 'subtitles': {'en-US': [{'ext': 'srt', 'data': 'md5:a3f4d25963fdeace838f327097c13265'}],
'es-ES': [{'ext': 'srt', 'data': 'md5:57e9dad365fd0fbaf0468eac4949f189'}]}, # 'es-ES': [{'ext': 'srt', 'data': 'md5:57e9dad365fd0fbaf0468eac4949f189'}]},
}, },
'params': {'writesubtitles': True, 'skip_download': True} 'params': {'writesubtitles': True, 'skip_download': True}
}, { }, {
@ -244,12 +282,19 @@ class PanoptoIE(PanoptoBaseIE):
def suitable(cls, url): def suitable(cls, url):
return False if PanoptoPlaylistIE.suitable(url) else super().suitable(url) return False if PanoptoPlaylistIE.suitable(url) else super().suitable(url)
@classmethod
def _extract_from_webpage(cls, url, webpage):
return map(
lambda u: cls.url_result(u, cls.ie_key()),
orderedSet(m.group('url') for m in itertools.chain(
re.finditer(embed_re, webpage) for embed_re in cls._EMBED_REGEX)))
def _mark_watched(self, base_url, video_id, delivery_info): def _mark_watched(self, base_url, video_id, delivery_info):
duration = traverse_obj(delivery_info, ('Delivery', 'Duration'), expected_type=float) duration = traverse_obj(delivery_info, ('Delivery', 'Duration'), expected_type=float)
invocation_id = delivery_info.get('InvocationId') invocation_id = delivery_info.get('InvocationId')
stream_id = traverse_obj(delivery_info, ('Delivery', 'Streams', ..., 'PublicID'), get_all=False, expected_type=str) stream_id = traverse_obj(delivery_info, ('Delivery', 'Streams', Ellipsis, 'PublicID'), get_all=False, expected_type=str)
if invocation_id and stream_id and duration: if invocation_id and stream_id and duration:
timestamp_str = f'/Date({calendar.timegm(datetime.utcnow().timetuple())}000)/' timestamp_str = '/Date(%s000)/' % (calendar.timegm(datetime.utcnow().timetuple()), )
data = { data = {
'streamRequests': [ 'streamRequests': [
{ {
@ -296,14 +341,24 @@ class PanoptoIE(PanoptoBaseIE):
obj_id, obj_sn = timestamp.get('ObjectIdentifier'), timestamp.get('ObjectSequenceNumber'), obj_id, obj_sn = timestamp.get('ObjectIdentifier'), timestamp.get('ObjectSequenceNumber'),
if timestamp.get('EventTargetType') == 'PowerPoint' and obj_id is not None and obj_sn is not None: if timestamp.get('EventTargetType') == 'PowerPoint' and obj_id is not None and obj_sn is not None:
image_frags.setdefault('slides', []).append({ image_frags.setdefault('slides', []).append({
'url': base_url + f'/Pages/Viewer/Image.aspx?id={obj_id}&number={obj_sn}', 'url': update_url_query(
base_url + '/Pages/Viewer/Image.aspx', {
'id': obj_id,
'number': obj_sn,
}),
'duration': duration 'duration': duration
}) })
obj_pid, session_id, abs_time = timestamp.get('ObjectPublicIdentifier'), timestamp.get('SessionID'), timestamp.get('AbsoluteTime') obj_pid, session_id, abs_time = timestamp.get('ObjectPublicIdentifier'), timestamp.get('SessionID'), timestamp.get('AbsoluteTime')
if None not in (obj_pid, session_id, abs_time): if None not in (obj_pid, session_id, abs_time):
image_frags.setdefault('chapter', []).append({ image_frags.setdefault('chapter', []).append({
'url': base_url + f'/Pages/Viewer/Thumb.aspx?eventTargetPID={obj_pid}&sessionPID={session_id}&number={obj_sn}&isPrimary=false&absoluteTime={abs_time}', 'url': update_url_query(
base_url + '/Pages/Viewer/Thumb.aspx?isPrimary=false', {
'eventTargetPID': obj_pid,
'sessionPID': session_id,
'number': obj_sn,
'absoluteTime': abs_time,
}),
'duration': duration, 'duration': duration,
}) })
for name, fragments in image_frags.items(): for name, fragments in image_frags.items():
@ -319,16 +374,19 @@ class PanoptoIE(PanoptoBaseIE):
@staticmethod @staticmethod
def _json2srt(data, delivery): def _json2srt(data, delivery):
def _gen_lines(): SRT_CAPTION_FMT = '{0}\n{1} --> {2}\n{3}'
for i, line in enumerate(data):
def gen_lines(dat, deliv):
for i, line in enumerate(dat):
start_time = line['Time'] start_time = line['Time']
duration = line.get('Duration') duration = line.get('Duration')
if duration: if duration:
end_time = start_time + duration end_time = start_time + duration
else: else:
end_time = traverse_obj(data, (i + 1, 'Time')) or delivery['Duration'] end_time = traverse_obj(dat, (i + 1, 'Time')) or deliv['Duration']
yield f'{i + 1}\n{srt_subtitles_timecode(start_time)} --> {srt_subtitles_timecode(end_time)}\n{line["Caption"]}' yield SRT_CAPTION_FMT.format(
return '\n\n'.join(_gen_lines()) i + 1, srt_subtitles_timecode(start_time), srt_subtitles_timecode(end_time), line['Caption'])
return '\n\n'.join(gen_lines(data, delivery))
def _get_subtitles(self, base_url, video_id, delivery): def _get_subtitles(self, base_url, video_id, delivery):
subtitles = {} subtitles = {}
@ -338,7 +396,7 @@ class PanoptoIE(PanoptoBaseIE):
note='Downloading captions JSON metadata', query={ note='Downloading captions JSON metadata', query={
'deliveryId': video_id, 'deliveryId': video_id,
'getCaptions': True, 'getCaptions': True,
'language': str(lang), 'language': compat_str(lang),
'responseType': 'json' 'responseType': 'json'
} }
) )
@ -364,18 +422,17 @@ class PanoptoIE(PanoptoBaseIE):
if stream_url: if stream_url:
media_type = stream.get('ViewerMediaFileTypeName') media_type = stream.get('ViewerMediaFileTypeName')
if media_type in ('hls', ): if media_type in ('hls', ):
m3u8_formats, stream_subtitles = self._extract_m3u8_formats_and_subtitles(stream_url, video_id) # m3u8_formats, stream_subtitles = self._extract_m3u8_formats_and_subtitles(stream_url, video_id)
m3u8_formats = self._extract_m3u8_formats(stream_url, video_id)
stream_formats.extend(m3u8_formats) stream_formats.extend(m3u8_formats)
subtitles = self._merge_subtitles(subtitles, stream_subtitles) # subtitles = self._merge_subtitles(subtitles, stream_subtitles)
else: else:
stream_formats.append({ stream_formats.append({
'url': stream_url 'url': stream_url
}) })
for fmt in stream_formats: for fmt in stream_formats:
fmt.update({ fmt.update({'format_note': stream.get('Tag'), })
'format_note': stream.get('Tag'), fmt.update(fmt_kwargs)
**fmt_kwargs
})
formats.extend(stream_formats) formats.extend(stream_formats)
return formats, subtitles return formats, subtitles
@ -410,8 +467,8 @@ class PanoptoIE(PanoptoBaseIE):
formats = podcast_formats + streams_formats formats = podcast_formats + streams_formats
formats.extend(self._extract_mhtml_formats(base_url, timestamps)) formats.extend(self._extract_mhtml_formats(base_url, timestamps))
subtitles = self._merge_subtitles( subtitles = self._merge_subtitles(podcast_subtitles, streams_subtitles)
podcast_subtitles, streams_subtitles, self.extract_subtitles(base_url, video_id, delivery)) subtitles = self._merge_subtitles(subtitles, self.extract_subtitles(base_url, video_id, delivery))
self._sort_formats(formats) self._sort_formats(formats)
self.mark_watched(base_url, video_id, delivery_info) self.mark_watched(base_url, video_id, delivery_info)
@ -419,16 +476,20 @@ class PanoptoIE(PanoptoBaseIE):
return { return {
'id': video_id, 'id': video_id,
'title': delivery.get('SessionName'), 'title': delivery.get('SessionName'),
'cast': traverse_obj(delivery, ('Contributors', ..., 'DisplayName'), default=[], expected_type=lambda x: x or None), 'cast': traverse_obj(delivery, ('Contributors', Ellipsis, 'DisplayName'), default=[], expected_type=lambda x: x or None),
'timestamp': session_start_time - 11640000000 if session_start_time else None, 'timestamp': session_start_time - 11640000000 if session_start_time else None,
'duration': delivery.get('Duration'), 'duration': delivery.get('Duration'),
'thumbnail': base_url + f'/Services/FrameGrabber.svc/FrameRedirect?objectId={video_id}&mode=Delivery&random={random()}', 'thumbnail': update_url_query(
base_url + '/Services/FrameGrabber.svc/FrameRedirect?mode=Delivery', {
'objectId': video_id,
'random': random(),
}),
'average_rating': delivery.get('AverageRating'), 'average_rating': delivery.get('AverageRating'),
'chapters': self._extract_chapters(timestamps), 'chapters': self._extract_chapters(timestamps),
'uploader': delivery.get('OwnerDisplayName') or None, 'uploader': delivery.get('OwnerDisplayName') or None,
'uploader_id': delivery.get('OwnerId'), 'uploader_id': delivery.get('OwnerId'),
'description': delivery.get('SessionAbstract'), 'description': delivery.get('SessionAbstract'),
'tags': traverse_obj(delivery, ('Tags', ..., 'Content')), 'tags': traverse_obj(delivery, ('Tags', Ellipsis, 'Content')),
'channel_id': delivery.get('SessionGroupPublicID'), 'channel_id': delivery.get('SessionGroupPublicID'),
'channel': traverse_obj(delivery, 'SessionGroupLongName', 'SessionGroupShortName', get_all=False), 'channel': traverse_obj(delivery, 'SessionGroupLongName', 'SessionGroupShortName', get_all=False),
'formats': formats, 'formats': formats,
@ -444,9 +505,8 @@ class PanoptoPlaylistIE(PanoptoBaseIE):
'info_dict': { 'info_dict': {
'title': 'Featured Video Tutorials', 'title': 'Featured Video Tutorials',
'id': 'f3b39fcf-882f-4849-93d6-a9f401236d36', 'id': 'f3b39fcf-882f-4849-93d6-a9f401236d36',
'description': '',
}, },
'playlist_mincount': 36 'playlist_mincount': 34, # was 36
}, },
{ {
'url': 'https://utsa.hosted.panopto.com/Panopto/Pages/Viewer.aspx?pid=e2900555-3ad4-4bdb-854d-ad2401686190', 'url': 'https://utsa.hosted.panopto.com/Panopto/Pages/Viewer.aspx?pid=e2900555-3ad4-4bdb-854d-ad2401686190',
@ -462,7 +522,9 @@ class PanoptoPlaylistIE(PanoptoBaseIE):
def _entries(self, base_url, playlist_id, session_list_id): def _entries(self, base_url, playlist_id, session_list_id):
session_list_info = self._call_api( session_list_info = self._call_api(
base_url, f'/Api/SessionLists/{session_list_id}?collections[0].maxCount=500&collections[0].name=items', playlist_id) base_url,
'/Api/SessionLists/%s?collections[0].maxCount=500&collections[0].name=items' % (session_list_id, ),
playlist_id)
items = session_list_info['Items'] items = session_list_info['Items']
for item in items: for item in items:
@ -487,11 +549,11 @@ class PanoptoPlaylistIE(PanoptoBaseIE):
if video_id: if video_id:
if self.get_param('noplaylist'): if self.get_param('noplaylist'):
self.to_screen('Downloading just video %s because of --no-playlist' % video_id) self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
return self.url_result(base_url + f'/Pages/Viewer.aspx?id={video_id}', ie_key=PanoptoIE.ie_key(), video_id=video_id) return self.url_result(update_url_query(base_url + '/Pages/Viewer.aspx', {'id': video_id}), ie_key=PanoptoIE.ie_key(), video_id=video_id)
else: else:
self.to_screen(f'Downloading playlist {playlist_id}; add --no-playlist to just download video {video_id}') self.to_screen('Downloading playlist {playlist_id}; add --no-playlist to just download video {video_id}'.format(**locals()))
playlist_info = self._call_api(base_url, f'/Api/Playlists/{playlist_id}', playlist_id) playlist_info = self._call_api(base_url, '/Api/Playlists/' + playlist_id, playlist_id)
return self.playlist_result( return self.playlist_result(
self._entries(base_url, playlist_id, playlist_info['SessionListId']), self._entries(base_url, playlist_id, playlist_info['SessionListId']),
playlist_id=playlist_id, playlist_title=playlist_info.get('Name'), playlist_id=playlist_id, playlist_title=playlist_info.get('Name'),
@ -533,17 +595,17 @@ class PanoptoListIE(PanoptoBaseIE):
def _fetch_page(self, base_url, query_params, display_id, page): def _fetch_page(self, base_url, query_params, display_id, page):
params = { params = merge_dicts({
'page': page,
'maxResults': self._PAGE_SIZE,
}, query_params, {
'sortColumn': 1, 'sortColumn': 1,
'getFolderData': True, 'getFolderData': True,
'includePlaylists': True, 'includePlaylists': True,
**query_params, })
'page': page,
'maxResults': self._PAGE_SIZE,
}
response = self._call_api( response = self._call_api(
base_url, '/Services/Data.svc/GetSessions', f'{display_id} page {page+1}', base_url, '/Services/Data.svc/GetSessions', '%s page %d' % (display_id, page + 1),
data={'queryParameters': params}, fatal=False) data={'queryParameters': params}, fatal=False)
for result in get_first(response, 'Results', default=[]): for result in get_first(response, 'Results', default=[]):
@ -553,7 +615,9 @@ class PanoptoListIE(PanoptoBaseIE):
'_type': 'url', '_type': 'url',
'id': item_id, 'id': item_id,
'title': result.get('SessionName'), 'title': result.get('SessionName'),
'url': traverse_obj(result, 'ViewerUrl', 'EmbedUrl', get_all=False) or (base_url + f'/Pages/Viewer.aspx?id={item_id}'), 'url': (
traverse_obj(result, 'ViewerUrl', 'EmbedUrl', get_all=False)
or update_url_query(base_url + '/Pages/Viewer.aspx', {'id': item_id})),
'duration': result.get('Duration'), 'duration': result.get('Duration'),
'channel': result.get('FolderName'), 'channel': result.get('FolderName'),
'channel_id': result.get('FolderID'), 'channel_id': result.get('FolderID'),
@ -562,7 +626,7 @@ class PanoptoListIE(PanoptoBaseIE):
for folder in get_first(response, 'Subfolders', default=[]): for folder in get_first(response, 'Subfolders', default=[]):
folder_id = folder.get('ID') folder_id = folder.get('ID')
yield self.url_result( yield self.url_result(
base_url + f'/Pages/Sessions/List.aspx#folderID="{folder_id}"', '%s/Pages/Sessions/List.aspx#folderID=%s' % (base_url, folder_id),
ie_key=PanoptoListIE.ie_key(), video_id=folder_id, title=folder.get('Name')) ie_key=PanoptoListIE.ie_key(), video_id=folder_id, title=folder.get('Name'))
def _extract_folder_metadata(self, base_url, folder_id): def _extract_folder_metadata(self, base_url, folder_id):
@ -591,7 +655,7 @@ class PanoptoListIE(PanoptoBaseIE):
query = query_params.get('query') query = query_params.get('query')
if query: if query:
display_id += f': query "{query}"' display_id += ': query "%s"' % (query, )
info = { info = {
'_type': 'playlist', '_type': 'playlist',