1
0
mirror of https://github.com/ytdl-org/youtube-dl.git synced 2024-06-26 00:27:57 +00:00

Compare commits

...

15 Commits

Author SHA1 Message Date
Olivier Trichet
5fb09daf08
Merge 72db217289 into 0153b387e5 2024-06-14 12:01:11 +00:00
Paper
0153b387e5
[VidLii] Add 720p support (#30924)
* [VidLii] Add HD support  (yt-dlp backport-ish)

* Also fix a bug with the view count

---------

Co-authored-by: dirkf <fieldhouse@gmx.net>
2024-06-11 13:21:39 +01:00
dirkf
a48fe7491d [ORF] Skip tests with limited availability 2024-06-11 12:52:13 +01:00
dirkf
e20ca543f0 [ORF] Re-factor and updateORFFM4StoryIE
* fix getting media via DASH instead of inaccessible mp4
* also get in-page YT media
2024-06-11 12:52:13 +01:00
dirkf
e39466051f [ORF] Support sound.orf.at, updating ORFRadioIE
* maintain support for xx.orf.at/player/... URLs
* add `ORFRadioCollectionIE` to support playlists in ORF Sound
* back-port and re-work `ORFPodcastIE` from https://github.com/yt-dlp/yt-dlp/pull/8486, thx Esokrates
2024-06-11 12:52:13 +01:00
dirkf
d95c0d203f [ORF] Support on.orf.at, replacing ORFTVthekIE
* add `ORFONIE`, back-porting yt-dlp PR https://github.com/yt-dlp/yt-dlp/pull/9113 and friends: thx HobbyistDev, TuxCoder, seproDev
* re-factor to support livestreams via new `ORFONliveIE`
2024-06-11 12:52:13 +01:00
dirkf
3bde6a5752 [test] Improve download test
* skip reason can't be unicode in Py2
* remove duplicate assert...Equal functions
2024-06-11 12:52:13 +01:00
dirkf
50f6c5668a [core] Re-factor with _fill_common_fields() as used in yt-dlp 2024-06-11 12:52:13 +01:00
dirkf
b4ff08bd2d [core] Safer handling of nested playlist data 2024-06-11 12:52:13 +01:00
kmnx
88bd8b9f87
[mixcloud] updated mixcloud API server address (#32557)
* updated mixcloud API server address
* fix tests
* etc

---------

Co-authored-by: dirkf <fieldhouse@gmx.net>
2024-06-11 12:38:24 +01:00
Olivier Trichet
72db217289 [RadioFrance] Extractor fo thematic webradios 2022-12-22 14:22:19 -05:00
Olivier Trichet
fc933e686b [RadioFrance] Refactoring 2022-12-22 13:01:10 -05:00
Olivier Trichet
ea02c40539 [RadioFrance] Extractor for podcast playlists 2022-12-22 13:00:54 -05:00
Olivier Trichet
7270ecf3d6 [RadioFrance] Extractor for podcast of Radio France stations 2022-12-22 13:00:17 -05:00
Olivier Trichet
dade9111f1 [RadioFrance] Remove old Radio France stations extractors
These are not working anymore after their respectives websites were
merged into www.radiofrance.fr.
2022-12-22 13:00:08 -05:00
10 changed files with 1065 additions and 749 deletions

View File

@ -5,9 +5,9 @@ import hashlib
import json
import os.path
import re
import types
import ssl
import sys
import types
import unittest
import youtube_dl.extractor
@ -181,18 +181,18 @@ def expect_value(self, got, expected, field):
op, _, expected_num = expected.partition(':')
expected_num = int(expected_num)
if op == 'mincount':
assert_func = assertGreaterEqual
assert_func = self.assertGreaterEqual
msg_tmpl = 'Expected %d items in field %s, but only got %d'
elif op == 'maxcount':
assert_func = assertLessEqual
assert_func = self.assertLessEqual
msg_tmpl = 'Expected maximum %d items in field %s, but got %d'
elif op == 'count':
assert_func = assertEqual
assert_func = self.assertEqual
msg_tmpl = 'Expected exactly %d items in field %s, but got %d'
else:
assert False
assert_func(
self, len(got), expected_num,
len(got), expected_num,
msg_tmpl % (expected_num, field, len(got)))
return
self.assertEqual(
@ -262,27 +262,6 @@ def assertRegexpMatches(self, text, regexp, msg=None):
self.assertTrue(m, msg)
def assertGreaterEqual(self, got, expected, msg=None):
if not (got >= expected):
if msg is None:
msg = '%r not greater than or equal to %r' % (got, expected)
self.assertTrue(got >= expected, msg)
def assertLessEqual(self, got, expected, msg=None):
if not (got <= expected):
if msg is None:
msg = '%r not less than or equal to %r' % (got, expected)
self.assertTrue(got <= expected, msg)
def assertEqual(self, got, expected, msg=None):
if not (got == expected):
if msg is None:
msg = '%r not equal to %r' % (got, expected)
self.assertTrue(got == expected, msg)
def expect_warnings(ydl, warnings_re):
real_warning = ydl.report_warning

View File

@ -9,8 +9,6 @@ import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from test.helper import (
assertGreaterEqual,
assertLessEqual,
expect_warnings,
get_params,
gettestcases,
@ -36,12 +34,20 @@ from youtube_dl.utils import (
ExtractorError,
error_to_compat_str,
format_bytes,
IDENTITY,
preferredencoding,
UnavailableVideoError,
)
from youtube_dl.extractor import get_info_extractor
RETRIES = 3
# Some unittest APIs require actual str
if not isinstance('TEST', str):
_encode_str = lambda s: s.encode(preferredencoding())
else:
_encode_str = IDENTITY
class YoutubeDL(youtube_dl.YoutubeDL):
def __init__(self, *args, **kwargs):
@ -102,7 +108,7 @@ def generator(test_case, tname):
def print_skipping(reason):
print('Skipping %s: %s' % (test_case['name'], reason))
self.skipTest(reason)
self.skipTest(_encode_str(reason))
if not ie.working():
print_skipping('IE marked as not _WORKING')
@ -187,16 +193,14 @@ def generator(test_case, tname):
expect_info_dict(self, res_dict, test_case.get('info_dict', {}))
if 'playlist_mincount' in test_case:
assertGreaterEqual(
self,
self.assertGreaterEqual(
len(res_dict['entries']),
test_case['playlist_mincount'],
'Expected at least %d in playlist %s, but got only %d' % (
test_case['playlist_mincount'], test_case['url'],
len(res_dict['entries'])))
if 'playlist_maxcount' in test_case:
assertLessEqual(
self,
self.assertLessEqual(
len(res_dict['entries']),
test_case['playlist_maxcount'],
'Expected at most %d in playlist %s, but got %d' % (
@ -243,8 +247,8 @@ def generator(test_case, tname):
if params.get('test'):
expected_minsize = max(expected_minsize, 10000)
got_fsize = os.path.getsize(tc_filename)
assertGreaterEqual(
self, got_fsize, expected_minsize,
self.assertGreaterEqual(
got_fsize, expected_minsize,
'Expected %s to be at least %s, but it\'s only %s ' %
(tc_filename, format_bytes(expected_minsize),
format_bytes(got_fsize)))

View File

@ -1039,8 +1039,8 @@ class YoutubeDL(object):
elif result_type in ('playlist', 'multi_video'):
# Protect from infinite recursion due to recursively nested playlists
# (see https://github.com/ytdl-org/youtube-dl/issues/27833)
webpage_url = ie_result['webpage_url']
if webpage_url in self._playlist_urls:
webpage_url = ie_result.get('webpage_url') # not all pl/mv have this
if webpage_url and webpage_url in self._playlist_urls:
self.to_screen(
'[download] Skipping already downloaded playlist: %s'
% ie_result.get('title') or ie_result.get('id'))
@ -1048,6 +1048,10 @@ class YoutubeDL(object):
self._playlist_level += 1
self._playlist_urls.add(webpage_url)
new_result = dict((k, v) for k, v in extra_info.items() if k not in ie_result)
if new_result:
new_result.update(ie_result)
ie_result = new_result
try:
return self.__process_playlist(ie_result, download)
finally:
@ -1593,6 +1597,28 @@ class YoutubeDL(object):
self.cookiejar.add_cookie_header(pr)
return pr.get_header('Cookie')
def _fill_common_fields(self, info_dict, final=True):
for ts_key, date_key in (
('timestamp', 'upload_date'),
('release_timestamp', 'release_date'),
):
if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None:
# Working around out-of-range timestamp values (e.g. negative ones on Windows,
# see http://bugs.python.org/issue1646728)
try:
upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key])
info_dict[date_key] = compat_str(upload_date.strftime('%Y%m%d'))
except (ValueError, OverflowError, OSError):
pass
# Auto generate title fields corresponding to the *_number fields when missing
# in order to always have clean titles. This is very common for TV series.
if final:
for field in ('chapter', 'season', 'episode'):
if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])
def process_video_result(self, info_dict, download=True):
assert info_dict.get('_type', 'video') == 'video'
@ -1660,24 +1686,7 @@ class YoutubeDL(object):
if 'display_id' not in info_dict and 'id' in info_dict:
info_dict['display_id'] = info_dict['id']
for ts_key, date_key in (
('timestamp', 'upload_date'),
('release_timestamp', 'release_date'),
):
if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None:
# Working around out-of-range timestamp values (e.g. negative ones on Windows,
# see http://bugs.python.org/issue1646728)
try:
upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key])
info_dict[date_key] = compat_str(upload_date.strftime('%Y%m%d'))
except (ValueError, OverflowError, OSError):
pass
# Auto generate title fields corresponding to the *_number fields when missing
# in order to always have clean titles. This is very common for TV series.
for field in ('chapter', 'season', 'episode'):
if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])
self._fill_common_fields(info_dict)
for cc_kind in ('subtitles', 'automatic_captions'):
cc = info_dict.get(cc_kind)

View File

@ -413,8 +413,6 @@ from .foxnews import (
FoxNewsArticleIE,
)
from .foxsports import FoxSportsIE
from .franceculture import FranceCultureIE
from .franceinter import FranceInterIE
from .francetv import (
FranceTVIE,
FranceTVSiteIE,
@ -898,21 +896,13 @@ from .ooyala import (
)
from .ora import OraTVIE
from .orf import (
ORFTVthekIE,
ORFFM4IE,
ORFONIE,
ORFONLiveIE,
ORFFM4StoryIE,
ORFOE1IE,
ORFOE3IE,
ORFNOEIE,
ORFWIEIE,
ORFBGLIE,
ORFOOEIE,
ORFSTMIE,
ORFKTNIE,
ORFSBGIE,
ORFTIRIE,
ORFVBGIE,
ORFIPTVIE,
ORFPodcastIE,
ORFRadioIE,
ORFRadioCollectionIE,
)
from .outsidetv import OutsideTVIE
from .packtpub import (
@ -1019,7 +1009,11 @@ from .radiocanada import (
from .radiode import RadioDeIE
from .radiojavan import RadioJavanIE
from .radiobremen import RadioBremenIE
from .radiofrance import RadioFranceIE
from .radiofrance import (
RadioFrancePodcastEpisodeIE,
RadioFrancePodcastPlaylistIE,
RadioFranceWebradioIE,
)
from .rai import (
RaiPlayIE,
RaiPlayLiveIE,

View File

@ -1,73 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
determine_ext,
extract_attributes,
int_or_none,
)
class FranceCultureIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?franceculture\.fr/emissions/(?:[^/]+/)*(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'http://www.franceculture.fr/emissions/carnet-nomade/rendez-vous-au-pays-des-geeks',
'info_dict': {
'id': 'rendez-vous-au-pays-des-geeks',
'display_id': 'rendez-vous-au-pays-des-geeks',
'ext': 'mp3',
'title': 'Rendez-vous au pays des geeks',
'thumbnail': r're:^https?://.*\.jpg$',
'upload_date': '20140301',
'timestamp': 1393700400,
'vcodec': 'none',
}
}, {
# no thumbnail
'url': 'https://www.franceculture.fr/emissions/la-recherche-montre-en-main/la-recherche-montre-en-main-du-mercredi-10-octobre-2018',
'only_matching': True,
}]
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
video_data = extract_attributes(self._search_regex(
r'''(?sx)
(?:
</h1>|
<div[^>]+class="[^"]*?(?:title-zone-diffusion|heading-zone-(?:wrapper|player-button))[^"]*?"[^>]*>
).*?
(<button[^>]+data-(?:url|asset-source)="[^"]+"[^>]+>)
''',
webpage, 'video data'))
video_url = video_data.get('data-url') or video_data['data-asset-source']
title = video_data.get('data-asset-title') or video_data.get('data-diffusion-title') or self._og_search_title(webpage)
description = self._html_search_regex(
r'(?s)<div[^>]+class="intro"[^>]*>.*?<h2>(.+?)</h2>',
webpage, 'description', default=None)
thumbnail = self._search_regex(
r'(?s)<figure[^>]+itemtype="https://schema.org/ImageObject"[^>]*>.*?<img[^>]+(?:data-dejavu-)?src="([^"]+)"',
webpage, 'thumbnail', default=None)
uploader = self._html_search_regex(
r'(?s)<span class="author">(.*?)</span>',
webpage, 'uploader', default=None)
ext = determine_ext(video_url.lower())
return {
'id': display_id,
'display_id': display_id,
'url': video_url,
'title': title,
'description': description,
'thumbnail': thumbnail,
'ext': ext,
'vcodec': 'none' if ext == 'mp3' else None,
'uploader': uploader,
'timestamp': int_or_none(video_data.get('data-start-time')) or int_or_none(video_data.get('data-asset-created-date')),
'duration': int_or_none(video_data.get('data-duration')),
}

View File

@ -1,59 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import month_by_name
class FranceInterIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?franceinter\.fr/emissions/(?P<id>[^?#]+)'
_TEST = {
'url': 'https://www.franceinter.fr/emissions/affaires-sensibles/affaires-sensibles-07-septembre-2016',
'md5': '9e54d7bdb6fdc02a841007f8a975c094',
'info_dict': {
'id': 'affaires-sensibles/affaires-sensibles-07-septembre-2016',
'ext': 'mp3',
'title': 'Affaire Cahuzac : le contentieux du compte en Suisse',
'description': 'md5:401969c5d318c061f86bda1fa359292b',
'thumbnail': r're:^https?://.*\.jpg',
'upload_date': '20160907',
},
}
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
video_url = self._search_regex(
r'(?s)<div[^>]+class=["\']page-diffusion["\'][^>]*>.*?<button[^>]+data-url=(["\'])(?P<url>(?:(?!\1).)+)\1',
webpage, 'video url', group='url')
title = self._og_search_title(webpage)
description = self._og_search_description(webpage)
thumbnail = self._html_search_meta(['og:image', 'twitter:image'], webpage)
upload_date_str = self._search_regex(
r'class=["\']\s*cover-emission-period\s*["\'][^>]*>[^<]+\s+(\d{1,2}\s+[^\s]+\s+\d{4})<',
webpage, 'upload date', fatal=False)
if upload_date_str:
upload_date_list = upload_date_str.split()
upload_date_list.reverse()
upload_date_list[1] = '%02d' % (month_by_name(upload_date_list[1], lang='fr') or 0)
upload_date_list[2] = '%02d' % int(upload_date_list[2])
upload_date = ''.join(upload_date_list)
else:
upload_date = None
return {
'id': video_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'upload_date': upload_date,
'formats': [{
'url': video_url,
'vcodec': 'none',
}],
}

View File

@ -1,3 +1,4 @@
# coding: utf-8
from __future__ import unicode_literals
import itertools
@ -10,7 +11,7 @@ from ..compat import (
compat_ord,
compat_str,
compat_urllib_parse_unquote,
compat_zip
compat_zip as zip,
)
from ..utils import (
int_or_none,
@ -24,7 +25,7 @@ class MixcloudBaseIE(InfoExtractor):
def _call_api(self, object_type, object_fields, display_id, username, slug=None):
lookup_key = object_type + 'Lookup'
return self._download_json(
'https://www.mixcloud.com/graphql', display_id, query={
'https://app.mixcloud.com/graphql', display_id, query={
'query': '''{
%s(lookup: {username: "%s"%s}) {
%s
@ -44,7 +45,7 @@ class MixcloudIE(MixcloudBaseIE):
'ext': 'm4a',
'title': 'Cryptkeeper',
'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.',
'uploader': 'Daniel Holbach',
'uploader': 'dholbach', # was: 'Daniel Holbach',
'uploader_id': 'dholbach',
'thumbnail': r're:https?://.*\.jpg',
'view_count': int,
@ -57,7 +58,7 @@ class MixcloudIE(MixcloudBaseIE):
'id': 'gillespeterson_caribou-7-inch-vinyl-mix-chat',
'ext': 'mp3',
'title': 'Caribou 7 inch Vinyl Mix & Chat',
'description': 'md5:2b8aec6adce69f9d41724647c65875e8',
'description': r're:Last week Dan Snaith aka Caribou swung by the Brownswood.{136}',
'uploader': 'Gilles Peterson Worldwide',
'uploader_id': 'gillespeterson',
'thumbnail': 're:https?://.*',
@ -65,6 +66,23 @@ class MixcloudIE(MixcloudBaseIE):
'timestamp': 1422987057,
'upload_date': '20150203',
},
'params': {
'skip_download': '404 not found',
},
}, {
'url': 'https://www.mixcloud.com/gillespeterson/carnival-m%C3%BAsica-popular-brasileira-mix/',
'info_dict': {
'id': 'gillespeterson_carnival-música-popular-brasileira-mix',
'ext': 'm4a',
'title': 'Carnival Música Popular Brasileira Mix',
'description': r're:Gilles was recently in Brazil to play at Boiler Room.{208}',
'timestamp': 1454347174,
'upload_date': '20160201',
'uploader': 'Gilles Peterson Worldwide',
'uploader_id': 'gillespeterson',
'thumbnail': 're:https?://.*',
'view_count': int,
},
}, {
'url': 'https://beta.mixcloud.com/RedLightRadio/nosedrip-15-red-light-radio-01-18-2016/',
'only_matching': True,
@ -76,10 +94,10 @@ class MixcloudIE(MixcloudBaseIE):
"""Encrypt/Decrypt XOR cipher. Both ways are possible because it's XOR."""
return ''.join([
compat_chr(compat_ord(ch) ^ compat_ord(k))
for ch, k in compat_zip(ciphertext, itertools.cycle(key))])
for ch, k in zip(ciphertext, itertools.cycle(key))])
def _real_extract(self, url):
username, slug = re.match(self._VALID_URL, url).groups()
username, slug = self._match_valid_url(url).groups()
username, slug = compat_urllib_parse_unquote(username), compat_urllib_parse_unquote(slug)
track_id = '%s_%s' % (username, slug)

File diff suppressed because it is too large Load Diff

View File

@ -4,56 +4,284 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
get_element_by_attribute,
int_or_none,
parse_iso8601,
strip_or_none,
url_or_none
)
class RadioFranceIE(InfoExtractor):
_VALID_URL = r'^https?://maison\.radiofrance\.fr/radiovisions/(?P<id>[^?#]+)'
IE_NAME = 'radiofrance'
class RadioFranceBaseIE(InfoExtractor):
_BASE_URL = r'https://www.radiofrance.fr/'
_TEST = {
'url': 'http://maison.radiofrance.fr/radiovisions/one-one',
'md5': 'bdbb28ace95ed0e04faab32ba3160daf',
def extract_api_data(self, api_path, id, html):
pattern = r'<script [^>]*sveltekit:data-url="https://www\.radiofrance\.fr/api/v[\d.]+/%s[^>]*>(?P<json>.*)</script>' % api_path
json = self._search_regex(pattern, html, 'API data', flags=re.DOTALL, group='json')
if not json:
raise ExtractorError('%s: JSON data not found' % id)
try:
json = self._parse_json(json, id)
json = self._parse_json(json['body'], id)
if api_path == 'path':
return json['content']
elif api_path == 'stations':
return json
else:
raise ExtractorError('Coding error')
except KeyError:
raise ExtractorError('%s: Invalid JSON' % id)
def get_title(self, api_data, webpage=None):
title = strip_or_none(api_data.get('title'))
if not title and webpage:
title = strip_or_none(get_element_by_attribute('h1', None, webpage, False)) or strip_or_none(self._og_search_title(webpage))
return title
def get_description(self, api_data, webpage=None):
description = strip_or_none(api_data.get('standFirst'))
if not description and webpage:
description = strip_or_none(self._og_search_description(webpage))
return description
def get_thumbnail(self, api_data, webpage=None):
thumbnail = None
visual = api_data.get('visual')
if visual:
thumbnail = url_or_none(visual.get('src'))
if not thumbnail and webpage:
thumbnail = self._og_search_thumbnail(webpage)
return thumbnail
def get_timestamp(self, api_data, webpage=None):
timestamp = api_data.get('publishedDate')
if not timestamp and webpage:
timestamp = parse_iso8601(self._html_search_meta('article:published_time', webpage, 'publication time', ))
return timestamp
def get_brand(self, api_data, webpage=None):
brand = strip_or_none(api_data.get('brand'))
if not brand and webpage:
brand = self._og_search_property('site_name', webpage, 'Station name', fatal=False)
return brand
def extract_episode(self, episode_id, api_data):
manifestations = api_data.get('manifestations')
if manifestations is None or len(manifestations) == 0:
return None, None
url = url_or_none(manifestations[0]['url'])
duration = int_or_none(manifestations[0].get('duration'))
return url, duration
def get_playlist_entries(self, playlist_url, playlist_id, api_data, direction):
playlist_data = api_data['expressions']
entries = []
items = playlist_data.get('items')
for item in items:
episode_path = item.get('path')
if episode_path is None:
self.report_warning('No path found for episode "%s"', item.get('title'))
continue
episode_id = RadioFrancePodcastEpisodeIE._match_id(self._BASE_URL + episode_path)
if episode_id is None:
self.report_warning('Could not parse id of episode from path: "%s"' % episode_path)
continue
episode_url, duration = self.extract_episode(episode_id, item)
if episode_url is None:
self.to_screen('Episode "%s" is not available' % episode_path)
continue
entry = {
'id': episode_id,
'url': episode_url,
'title': self.get_title(item),
'description': self.get_description(item),
'timestamp': self.get_timestamp(item),
'thumbnail': self.get_thumbnail(item),
'duration': duration,
}
entries.append(entry)
page_number = int_or_none(playlist_data.get('pageNumber'))
if page_number:
if direction in ['both', 'prev'] and playlist_data.get('prev') is not None:
webpage, other_api_data = self.get_data(playlist_url, 'path', playlist_id, page=page_number - 1)
entries = self.get_playlist_entries(playlist_url, playlist_id, other_api_data, direction='prev') + entries
if direction in ['both', 'next'] and playlist_data.get('next') is not None:
webpage, other_api_data = self.get_data(playlist_url, 'path', playlist_id, page=page_number + 1)
entries = entries + self.get_playlist_entries(playlist_url, playlist_id, other_api_data, direction='next')
return entries
def get_data(self, url, api_path, id, page=None):
query = {}
note = None
if page:
query['p'] = page
note = "Downloading page %i" % page
webpage = self._download_webpage(url, id, query=query, note=note)
api_data = self.extract_api_data(api_path, id, webpage)
return webpage, api_data
class RadioFrancePodcastEpisodeIE(RadioFranceBaseIE):
_VALID_URL = r'https?://www\.radiofrance\.fr/(?:francemusique|franceinter|franceculture|franceinfo|mouv|fip)/podcasts/.+/.+-(?P<id>\d+)$'
_TESTS = [{
'note': 'Podcast episode with audio from France Info',
'url': 'https://www.radiofrance.fr/franceinfo/podcasts/le-brief-eco/le-brief-eco-du-lundi-05-septembre-2022-8310713',
'info_dict': {
'id': 'one-one',
'ext': 'ogg',
'title': 'One to one',
'description': "Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.",
'uploader': 'Thomas Hercouët',
},
}
'id': '8310713',
'ext': 'mp3',
'url': r're:^https?://.*\.mp3$',
'title': 'Pour la première fois en vingt ans, leuro passe sous les 0,99\u00a0dollar',
'description': str,
'thumbnail': r're:^https?://.*\.jpg$',
'timestamp': int,
'duration': int,
'upload_date': str
}
}, {
'note': 'Podcast episode from France Musique',
'url': 'https://www.radiofrance.fr/francemusique/podcasts/allegretto/lever-du-jour-9233228',
'only_matching': True
}, {
'note': 'Podcast episode from FranceInter',
'url': 'https://www.radiofrance.fr/franceinter/podcasts/rendez-vous-avec-x/un-mysterieux-echange-digne-de-la-guerre-froide-9343281',
'only_matching': True
}, {
'note': 'Podcast episode from France Culture',
'url': 'https://www.radiofrance.fr/franceculture/podcasts/la-science-cqfd/teotihuacan-la-plus-mysterieuse-des-cites-d-or-9224610',
'only_matching': True
}, {
'note': 'Podcast episode from Le Mouv',
'url': 'https://www.radiofrance.fr/mouv/podcasts/mouv-dj-la-caution/ncr2a-ne-cherche-rien-d-autre-ailleurs-1197950',
'only_matching': True
}, {
'note': 'Podcast episode from FIP',
'url': 'https://www.radiofrance.fr/fip/podcasts/certains-l-aiment-fip/hommage-au-cinema-de-vangelis-4734742',
'only_matching': True
}]
def _real_extract(self, url):
m = re.match(self._VALID_URL, url)
video_id = m.group('id')
webpage = self._download_webpage(url, video_id)
title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, 'title')
description = self._html_search_regex(
r'<div class="bloc_page_wrapper"><div class="text">(.*?)</div>',
webpage, 'description', fatal=False)
uploader = self._html_search_regex(
r'<div class="credit">&nbsp;&nbsp;&copy;&nbsp;(.*?)</div>',
webpage, 'uploader', fatal=False)
formats_str = self._html_search_regex(
r'class="jp-jplayer[^"]*" data-source="([^"]+)">',
webpage, 'audio URLs')
formats = [
{
'format_id': fm[0],
'url': fm[1],
'vcodec': 'none',
'preference': i,
}
for i, fm in
enumerate(re.findall(r"([a-z0-9]+)\s*:\s*'([^']+)'", formats_str))
]
self._sort_formats(formats)
id = self._match_id(url)
webpage, api_data = self.get_data(url, 'path', id)
url, duration = self.extract_episode(id, api_data)
if url is None:
msg = 'Podcast file is not available. If the show is too recent, the file may not have been uploaded yet: try again later.'
raise ExtractorError(msg, expected=True, video_id=id)
return {
'id': video_id,
'title': title,
'formats': formats,
'description': description,
'uploader': uploader,
'id': id,
'url': url,
'title': self.get_title(api_data, webpage),
'description': self.get_description(api_data, webpage),
'timestamp': self.get_timestamp(api_data, webpage),
'thumbnail': self.get_thumbnail(api_data, webpage),
'channel_id': self.get_brand(api_data, webpage),
'duration': duration,
}
class RadioFrancePodcastPlaylistIE(RadioFranceBaseIE):
_VALID_URL = r'https?://www\.radiofrance\.fr/(?:francemusique|franceinter|franceculture|franceinfo|mouv|fip)/podcasts/(?P<id>[^/]+?)(?:[?#].*)?$'
_TESTS = [{
'note': 'Podcast show with multiple pages of episodes and some of them are missing',
'url': 'https://www.radiofrance.fr/franceculture/podcasts/une-semaine-dans-le-monde-10-11?p=2',
'info_dict': {
'id': 'une-semaine-dans-le-monde-10-11',
'title': 'Une semaine dans le monde | 10-11',
'description': str,
'timestamp': int
},
'playlist_count': 23,
}]
def _real_extract(self, url):
id = self._match_id(url)
webpage, api_data = self.get_data(url, 'path', id)
entries = self.get_playlist_entries(url, id, api_data, direction='both')
entries.reverse()
return {
'id': id,
'_type': 'playlist',
'entries': entries,
'title': self.get_title(api_data, webpage),
'description': self.get_description(api_data, webpage),
'timestamp': self.get_timestamp(api_data, webpage),
'thumbnail': self.get_thumbnail(api_data, webpage),
'channel_id': self.get_brand(api_data, webpage),
}
class RadioFranceWebradioIE(RadioFranceBaseIE):
_VALID_URL = r'https?://www\.radiofrance\.fr/(?:francemusique|franceinter|franceculture|franceinfo|mouv|fip)/(?P<id>radio-[^/]+)$'
_TESTS = [{
'note': 'Full list of webradios available at https://www.radiofrance.fr/ecouter-musique',
'url': 'https://www.radiofrance.fr/fip/radio-metal',
'info_dict': {
'id': 'radio-metal',
'ext': 'aac',
'title': str,
},
'params': {
'format': 'aac',
'skip_download': True,
}
}]
def get_livestream_formats(self, id, api_data):
sources = api_data['media']['sources']
formats = []
for source in sources:
url = source.get('url')
if not url:
continue
format_id = source.get('format')
format = {
'url': url,
'format_id': format_id,
'asr': 48000,
'vcodec': 'none'
}
if format_id == 'mp3':
format['preference'] = 1
format['acodec'] = 'mp3'
format['abr'] = source.get('bitrate')
elif format_id == 'aac':
format['preference'] = 2
format['acodec'] = 'aac'
format['abr'] = source.get('bitrate')
elif format_id == 'hls':
format['preference'] = 0
format['manifest_url'] = url
formats.append(format)
if len(formats) == 0:
raise ExtractorError('No live streaming URL found')
return formats
def _real_extract(self, url):
id = self._match_id(url)
webpage, api_data = self.get_data(url, 'stations', id)
return {
'id': id,
'title': self.get_title(api_data, webpage),
'formats': self.get_livestream_formats(id, api_data),
'thumbnail': self.get_thumbnail(api_data, webpage),
'channel_id': self.get_brand(api_data, webpage),
'is_live': True
}

View File

@ -4,6 +4,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
float_or_none,
get_element_by_id,
@ -11,6 +12,7 @@ from ..utils import (
strip_or_none,
unified_strdate,
urljoin,
str_to_int,
)
@ -35,6 +37,26 @@ class VidLiiIE(InfoExtractor):
'categories': ['News & Politics'],
'tags': ['Vidlii', 'Jan', 'Videogames'],
}
}, {
# HD
'url': 'https://www.vidlii.com/watch?v=2Ng8Abj2Fkl',
'md5': '450e7da379c884788c3a4fa02a3ce1a4',
'info_dict': {
'id': '2Ng8Abj2Fkl',
'ext': 'mp4',
'title': 'test',
'description': 'md5:cc55a86032a7b6b3cbfd0f6b155b52e9',
'thumbnail': 'https://www.vidlii.com/usfi/thmp/2Ng8Abj2Fkl.jpg',
'uploader': 'VidLii',
'uploader_url': 'https://www.vidlii.com/user/VidLii',
'upload_date': '20200927',
'duration': 5,
'view_count': int,
'comment_count': int,
'average_rating': float,
'categories': ['Film & Animation'],
'tags': list,
},
}, {
'url': 'https://www.vidlii.com/embed?v=tJluaH4BJ3v&a=0',
'only_matching': True,
@ -46,11 +68,32 @@ class VidLiiIE(InfoExtractor):
webpage = self._download_webpage(
'https://www.vidlii.com/watch?v=%s' % video_id, video_id)
video_url = self._search_regex(
r'src\s*:\s*(["\'])(?P<url>(?:https?://)?(?:(?!\1).)+)\1', webpage,
'video url', group='url')
formats = []
title = self._search_regex(
def add_format(format_url, height=None):
height = int(self._search_regex(r'(\d+)\.mp4',
format_url, 'height', default=360))
formats.append({
'url': format_url,
'format_id': '%dp' % height if height else None,
'height': height,
})
sources = re.findall(
r'src\s*:\s*(["\'])(?P<url>(?:https?://)?(?:(?!\1).)+)\1',
webpage)
formats = []
if len(sources) > 1:
add_format(sources[1][1])
self._check_formats(formats, video_id)
if len(sources) > 0:
add_format(sources[0][1])
self._sort_formats(formats)
title = self._html_search_regex(
(r'<h1>([^<]+)</h1>', r'<title>([^<]+) - VidLii<'), webpage,
'title')
@ -82,9 +125,9 @@ class VidLiiIE(InfoExtractor):
default=None) or self._search_regex(
r'duration\s*:\s*(\d+)', webpage, 'duration', fatal=False))
view_count = int_or_none(self._search_regex(
(r'<strong>(\d+)</strong> views',
r'Views\s*:\s*<strong>(\d+)</strong>'),
view_count = str_to_int(self._html_search_regex(
(r'<strong>([\d,.]+)</strong> views',
r'Views\s*:\s*<strong>([\d,.]+)</strong>'),
webpage, 'view count', fatal=False))
comment_count = int_or_none(self._search_regex(
@ -109,7 +152,7 @@ class VidLiiIE(InfoExtractor):
return {
'id': video_id,
'url': video_url,
'formats': formats,
'title': title,
'description': description,
'thumbnail': thumbnail,