Merge 72db217289 into 0153b387e5

[VidLii] Add 720p support (#30924)
* [VidLii] Add HD support (yt-dlp backport-ish)
* Also fix a bug with the view count

---------

Co-authored-by: dirkf <fieldhouse@gmx.net>
2024-06-26 00:27:57 +00:00 · 2024-06-14 12:01:11 +00:00 · 2024-06-11 13:21:39 +01:00 · 2024-06-11 12:52:13 +01:00 · 2024-06-11 12:52:13 +01:00 · 2024-06-11 12:52:13 +01:00
10 changed files with 1065 additions and 749 deletions
--- a/test/helper.py
+++ b/test/helper.py
@ -5,9 +5,9 @@ import hashlib
 import json
 import os.path
 import re
-import types
 import ssl
 import sys
+import types
 import unittest

 import youtube_dl.extractor
@ -181,18 +181,18 @@ def expect_value(self, got, expected, field):
            op, _, expected_num = expected.partition(':')
            expected_num = int(expected_num)
            if op == 'mincount':
-                assert_func = assertGreaterEqual
+                assert_func = self.assertGreaterEqual
                msg_tmpl = 'Expected %d items in field %s, but only got %d'
            elif op == 'maxcount':
-                assert_func = assertLessEqual
+                assert_func = self.assertLessEqual
                msg_tmpl = 'Expected maximum %d items in field %s, but got %d'
            elif op == 'count':
-                assert_func = assertEqual
+                assert_func = self.assertEqual
                msg_tmpl = 'Expected exactly %d items in field %s, but got %d'
            else:
                assert False
            assert_func(
-                self, len(got), expected_num,
+                len(got), expected_num,
                msg_tmpl % (expected_num, field, len(got)))
            return
        self.assertEqual(
@ -262,27 +262,6 @@ def assertRegexpMatches(self, text, regexp, msg=None):
            self.assertTrue(m, msg)


-def assertGreaterEqual(self, got, expected, msg=None):
-    if not (got >= expected):
-        if msg is None:
-            msg = '%r not greater than or equal to %r' % (got, expected)
-        self.assertTrue(got >= expected, msg)
-
-
-def assertLessEqual(self, got, expected, msg=None):
-    if not (got <= expected):
-        if msg is None:
-            msg = '%r not less than or equal to %r' % (got, expected)
-        self.assertTrue(got <= expected, msg)
-
-
-def assertEqual(self, got, expected, msg=None):
-    if not (got == expected):
-        if msg is None:
-            msg = '%r not equal to %r' % (got, expected)
-        self.assertTrue(got == expected, msg)
-
-
 def expect_warnings(ydl, warnings_re):
    real_warning = ydl.report_warning

--- a/test/test_download.py
+++ b/test/test_download.py
@ -9,8 +9,6 @@ import unittest
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

 from test.helper import (
-    assertGreaterEqual,
-    assertLessEqual,
    expect_warnings,
    get_params,
    gettestcases,
@ -36,12 +34,20 @@ from youtube_dl.utils import (
    ExtractorError,
    error_to_compat_str,
    format_bytes,
+    IDENTITY,
+    preferredencoding,
    UnavailableVideoError,
 )
 from youtube_dl.extractor import get_info_extractor

 RETRIES = 3

+# Some unittest APIs require actual str, i.e. the interpreter's native `str`
+# type. When a plain literal in this module is not a `str` (presumably Python 2
+# with unicode literals in effect -- TODO confirm against the file header),
+# text is encoded with the preferred encoding before being passed to unittest;
+# otherwise it is passed through unchanged.
+if not isinstance('TEST', str):
+    _encode_str = lambda s: s.encode(preferredencoding())
+else:
+    _encode_str = IDENTITY
+

 class YoutubeDL(youtube_dl.YoutubeDL):
    def __init__(self, *args, **kwargs):
@ -102,7 +108,7 @@ def generator(test_case, tname):

        def print_skipping(reason):
            print('Skipping %s: %s' % (test_case['name'], reason))
-            self.skipTest(reason)
+            self.skipTest(_encode_str(reason))

        if not ie.working():
            print_skipping('IE marked as not _WORKING')
@ -187,16 +193,14 @@ def generator(test_case, tname):
                expect_info_dict(self, res_dict, test_case.get('info_dict', {}))

            if 'playlist_mincount' in test_case:
-                assertGreaterEqual(
-                    self,
+                self.assertGreaterEqual(
                    len(res_dict['entries']),
                    test_case['playlist_mincount'],
                    'Expected at least %d in playlist %s, but got only %d' % (
                        test_case['playlist_mincount'], test_case['url'],
                        len(res_dict['entries'])))
            if 'playlist_maxcount' in test_case:
-                assertLessEqual(
-                    self,
+                self.assertLessEqual(
                    len(res_dict['entries']),
                    test_case['playlist_maxcount'],
                    'Expected at most %d in playlist %s, but got %d' % (
@ -243,8 +247,8 @@ def generator(test_case, tname):
                        if params.get('test'):
                            expected_minsize = max(expected_minsize, 10000)
                        got_fsize = os.path.getsize(tc_filename)
-                        assertGreaterEqual(
-                            self, got_fsize, expected_minsize,
+                        self.assertGreaterEqual(
+                            got_fsize, expected_minsize,
                            'Expected %s to be at least %s, but it\'s only %s ' %
                            (tc_filename, format_bytes(expected_minsize),
                                format_bytes(got_fsize)))
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@ -1039,8 +1039,8 @@ class YoutubeDL(object):
        elif result_type in ('playlist', 'multi_video'):
            # Protect from infinite recursion due to recursively nested playlists
            # (see https://github.com/ytdl-org/youtube-dl/issues/27833)
-            webpage_url = ie_result['webpage_url']
-            if webpage_url in self._playlist_urls:
+            webpage_url = ie_result.get('webpage_url')  # not all pl/mv have this
+            if webpage_url and webpage_url in self._playlist_urls:
                self.to_screen(
                    '[download] Skipping already downloaded playlist: %s'
                    % ie_result.get('title') or ie_result.get('id'))
@ -1048,6 +1048,10 @@ class YoutubeDL(object):

            self._playlist_level += 1
            self._playlist_urls.add(webpage_url)
+            new_result = dict((k, v) for k, v in extra_info.items() if k not in ie_result)
+            if new_result:
+                new_result.update(ie_result)
+                ie_result = new_result
            try:
                return self.__process_playlist(ie_result, download)
            finally:
@ -1593,6 +1597,28 @@ class YoutubeDL(object):
        self.cookiejar.add_cookie_header(pr)
        return pr.get_header('Cookie')

+    def _fill_common_fields(self, info_dict, final=True):
+        """Fill in derivable date and title fields of info_dict, in place.
+
+        For each (timestamp key, date key) pair, a missing date is computed
+        from the timestamp as a UTC 'YYYYMMDD' string; timestamps that cannot
+        be converted are silently left alone.
+        When `final` is true, missing 'chapter', 'season' and 'episode' titles
+        are generated from the corresponding '*_number' fields.
+        """
+
+        for ts_key, date_key in (
+                ('timestamp', 'upload_date'),
+                ('release_timestamp', 'release_date'),
+        ):
+            if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None:
+                # Working around out-of-range timestamp values (e.g. negative ones on Windows,
+                # see http://bugs.python.org/issue1646728)
+                try:
+                    upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key])
+                    info_dict[date_key] = compat_str(upload_date.strftime('%Y%m%d'))
+                except (ValueError, OverflowError, OSError):
+                    pass
+
+        # Auto generate title fields corresponding to the *_number fields when missing
+        # in order to always have clean titles. This is very common for TV series.
+        if final:
+            for field in ('chapter', 'season', 'episode'):
+                if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
+                    # e.g. season_number 3 gives the title 'Season 3'
+                    info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])
+
    def process_video_result(self, info_dict, download=True):
        assert info_dict.get('_type', 'video') == 'video'

@ -1660,24 +1686,7 @@ class YoutubeDL(object):
        if 'display_id' not in info_dict and 'id' in info_dict:
            info_dict['display_id'] = info_dict['id']

-        for ts_key, date_key in (
-                ('timestamp', 'upload_date'),
-                ('release_timestamp', 'release_date'),
-        ):
-            if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None:
-                # Working around out-of-range timestamp values (e.g. negative ones on Windows,
-                # see http://bugs.python.org/issue1646728)
-                try:
-                    upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key])
-                    info_dict[date_key] = compat_str(upload_date.strftime('%Y%m%d'))
-                except (ValueError, OverflowError, OSError):
-                    pass
-
-        # Auto generate title fields corresponding to the *_number fields when missing
-        # in order to always have clean titles. This is very common for TV series.
-        for field in ('chapter', 'season', 'episode'):
-            if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
-                info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])
+        self._fill_common_fields(info_dict)

        for cc_kind in ('subtitles', 'automatic_captions'):
            cc = info_dict.get(cc_kind)
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -413,8 +413,6 @@ from .foxnews import (
    FoxNewsArticleIE,
 )
 from .foxsports import FoxSportsIE
-from .franceculture import FranceCultureIE
-from .franceinter import FranceInterIE
 from .francetv import (
    FranceTVIE,
    FranceTVSiteIE,
@ -898,21 +896,13 @@ from .ooyala import (
 )
 from .ora import OraTVIE
 from .orf import (
-    ORFTVthekIE,
-    ORFFM4IE,
+    ORFONIE,
+    ORFONLiveIE,
    ORFFM4StoryIE,
-    ORFOE1IE,
-    ORFOE3IE,
-    ORFNOEIE,
-    ORFWIEIE,
-    ORFBGLIE,
-    ORFOOEIE,
-    ORFSTMIE,
-    ORFKTNIE,
-    ORFSBGIE,
-    ORFTIRIE,
-    ORFVBGIE,
    ORFIPTVIE,
+    ORFPodcastIE,
+    ORFRadioIE,
+    ORFRadioCollectionIE,
 )
 from .outsidetv import OutsideTVIE
 from .packtpub import (
@ -1019,7 +1009,11 @@ from .radiocanada import (
 from .radiode import RadioDeIE
 from .radiojavan import RadioJavanIE
 from .radiobremen import RadioBremenIE
-from .radiofrance import RadioFranceIE
+from .radiofrance import (
+    RadioFrancePodcastEpisodeIE,
+    RadioFrancePodcastPlaylistIE,
+    RadioFranceWebradioIE,
+)
 from .rai import (
    RaiPlayIE,
    RaiPlayLiveIE,
--- a/youtube_dl/extractor/franceculture.py
+++ b/youtube_dl/extractor/franceculture.py
@ -1,73 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-from ..utils import (
-    determine_ext,
-    extract_attributes,
-    int_or_none,
-)
-
-
-class FranceCultureIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?franceculture\.fr/emissions/(?:[^/]+/)*(?P<id>[^/?#&]+)'
-    _TESTS = [{
-        'url': 'http://www.franceculture.fr/emissions/carnet-nomade/rendez-vous-au-pays-des-geeks',
-        'info_dict': {
-            'id': 'rendez-vous-au-pays-des-geeks',
-            'display_id': 'rendez-vous-au-pays-des-geeks',
-            'ext': 'mp3',
-            'title': 'Rendez-vous au pays des geeks',
-            'thumbnail': r're:^https?://.*\.jpg$',
-            'upload_date': '20140301',
-            'timestamp': 1393700400,
-            'vcodec': 'none',
-        }
-    }, {
-        # no thumbnail
-        'url': 'https://www.franceculture.fr/emissions/la-recherche-montre-en-main/la-recherche-montre-en-main-du-mercredi-10-octobre-2018',
-        'only_matching': True,
-    }]
-
-    def _real_extract(self, url):
-        display_id = self._match_id(url)
-
-        webpage = self._download_webpage(url, display_id)
-
-        video_data = extract_attributes(self._search_regex(
-            r'''(?sx)
-                (?:
-                    </h1>|
-                    <div[^>]+class="[^"]*?(?:title-zone-diffusion|heading-zone-(?:wrapper|player-button))[^"]*?"[^>]*>
-                ).*?
-                (<button[^>]+data-(?:url|asset-source)="[^"]+"[^>]+>)
-            ''',
-            webpage, 'video data'))
-
-        video_url = video_data.get('data-url') or video_data['data-asset-source']
-        title = video_data.get('data-asset-title') or video_data.get('data-diffusion-title') or self._og_search_title(webpage)
-
-        description = self._html_search_regex(
-            r'(?s)<div[^>]+class="intro"[^>]*>.*?<h2>(.+?)</h2>',
-            webpage, 'description', default=None)
-        thumbnail = self._search_regex(
-            r'(?s)<figure[^>]+itemtype="https://schema.org/ImageObject"[^>]*>.*?<img[^>]+(?:data-dejavu-)?src="([^"]+)"',
-            webpage, 'thumbnail', default=None)
-        uploader = self._html_search_regex(
-            r'(?s)<span class="author">(.*?)</span>',
-            webpage, 'uploader', default=None)
-        ext = determine_ext(video_url.lower())
-
-        return {
-            'id': display_id,
-            'display_id': display_id,
-            'url': video_url,
-            'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
-            'ext': ext,
-            'vcodec': 'none' if ext == 'mp3' else None,
-            'uploader': uploader,
-            'timestamp': int_or_none(video_data.get('data-start-time')) or int_or_none(video_data.get('data-asset-created-date')),
-            'duration': int_or_none(video_data.get('data-duration')),
-        }
--- a/youtube_dl/extractor/franceinter.py
+++ b/youtube_dl/extractor/franceinter.py
@ -1,59 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-from ..utils import month_by_name
-
-
-class FranceInterIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?franceinter\.fr/emissions/(?P<id>[^?#]+)'
-
-    _TEST = {
-        'url': 'https://www.franceinter.fr/emissions/affaires-sensibles/affaires-sensibles-07-septembre-2016',
-        'md5': '9e54d7bdb6fdc02a841007f8a975c094',
-        'info_dict': {
-            'id': 'affaires-sensibles/affaires-sensibles-07-septembre-2016',
-            'ext': 'mp3',
-            'title': 'Affaire Cahuzac : le contentieux du compte en Suisse',
-            'description': 'md5:401969c5d318c061f86bda1fa359292b',
-            'thumbnail': r're:^https?://.*\.jpg',
-            'upload_date': '20160907',
-        },
-    }
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        webpage = self._download_webpage(url, video_id)
-
-        video_url = self._search_regex(
-            r'(?s)<div[^>]+class=["\']page-diffusion["\'][^>]*>.*?<button[^>]+data-url=(["\'])(?P<url>(?:(?!\1).)+)\1',
-            webpage, 'video url', group='url')
-
-        title = self._og_search_title(webpage)
-        description = self._og_search_description(webpage)
-        thumbnail = self._html_search_meta(['og:image', 'twitter:image'], webpage)
-
-        upload_date_str = self._search_regex(
-            r'class=["\']\s*cover-emission-period\s*["\'][^>]*>[^<]+\s+(\d{1,2}\s+[^\s]+\s+\d{4})<',
-            webpage, 'upload date', fatal=False)
-        if upload_date_str:
-            upload_date_list = upload_date_str.split()
-            upload_date_list.reverse()
-            upload_date_list[1] = '%02d' % (month_by_name(upload_date_list[1], lang='fr') or 0)
-            upload_date_list[2] = '%02d' % int(upload_date_list[2])
-            upload_date = ''.join(upload_date_list)
-        else:
-            upload_date = None
-
-        return {
-            'id': video_id,
-            'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
-            'upload_date': upload_date,
-            'formats': [{
-                'url': video_url,
-                'vcodec': 'none',
-            }],
-        }
--- a/youtube_dl/extractor/mixcloud.py
+++ b/youtube_dl/extractor/mixcloud.py
@ -1,3 +1,4 @@
+# coding: utf-8
 from __future__ import unicode_literals

 import itertools
@ -10,7 +11,7 @@ from ..compat import (
    compat_ord,
    compat_str,
    compat_urllib_parse_unquote,
-    compat_zip
+    compat_zip as zip,
 )
 from ..utils import (
    int_or_none,
@ -24,7 +25,7 @@ class MixcloudBaseIE(InfoExtractor):
    def _call_api(self, object_type, object_fields, display_id, username, slug=None):
        lookup_key = object_type + 'Lookup'
        return self._download_json(
-            'https://www.mixcloud.com/graphql', display_id, query={
+            'https://app.mixcloud.com/graphql', display_id, query={
                'query': '''{
  %s(lookup: {username: "%s"%s}) {
    %s
@ -44,7 +45,7 @@ class MixcloudIE(MixcloudBaseIE):
            'ext': 'm4a',
            'title': 'Cryptkeeper',
            'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.',
-            'uploader': 'Daniel Holbach',
+            'uploader': 'dholbach',  # was: 'Daniel Holbach',
            'uploader_id': 'dholbach',
            'thumbnail': r're:https?://.*\.jpg',
            'view_count': int,
@ -57,7 +58,7 @@ class MixcloudIE(MixcloudBaseIE):
            'id': 'gillespeterson_caribou-7-inch-vinyl-mix-chat',
            'ext': 'mp3',
            'title': 'Caribou 7 inch Vinyl Mix & Chat',
-            'description': 'md5:2b8aec6adce69f9d41724647c65875e8',
+            'description': r're:Last week Dan Snaith aka Caribou swung by the Brownswood.{136}',
            'uploader': 'Gilles Peterson Worldwide',
            'uploader_id': 'gillespeterson',
            'thumbnail': 're:https?://.*',
@ -65,6 +66,23 @@ class MixcloudIE(MixcloudBaseIE):
            'timestamp': 1422987057,
            'upload_date': '20150203',
        },
+        'params': {
+            'skip_download': '404 not found',
+        },
+    }, {
+        'url': 'https://www.mixcloud.com/gillespeterson/carnival-m%C3%BAsica-popular-brasileira-mix/',
+        'info_dict': {
+            'id': 'gillespeterson_carnival-música-popular-brasileira-mix',
+            'ext': 'm4a',
+            'title': 'Carnival Música Popular Brasileira Mix',
+            'description': r're:Gilles was recently in Brazil to play at Boiler Room.{208}',
+            'timestamp': 1454347174,
+            'upload_date': '20160201',
+            'uploader': 'Gilles Peterson Worldwide',
+            'uploader_id': 'gillespeterson',
+            'thumbnail': 're:https?://.*',
+            'view_count': int,
+        },
    }, {
        'url': 'https://beta.mixcloud.com/RedLightRadio/nosedrip-15-red-light-radio-01-18-2016/',
        'only_matching': True,
@ -76,10 +94,10 @@ class MixcloudIE(MixcloudBaseIE):
        """Encrypt/Decrypt XOR cipher. Both ways are possible because it's XOR."""
        return ''.join([
            compat_chr(compat_ord(ch) ^ compat_ord(k))
-            for ch, k in compat_zip(ciphertext, itertools.cycle(key))])
+            for ch, k in zip(ciphertext, itertools.cycle(key))])

    def _real_extract(self, url):
-        username, slug = re.match(self._VALID_URL, url).groups()
+        username, slug = self._match_valid_url(url).groups()
        username, slug = compat_urllib_parse_unquote(username), compat_urllib_parse_unquote(slug)
        track_id = '%s_%s' % (username, slug)

--- a/youtube_dl/extractor/orf.py
+++ b/youtube_dl/extractor/orf.py
--- a/youtube_dl/extractor/radiofrance.py
+++ b/youtube_dl/extractor/radiofrance.py
@ -4,56 +4,284 @@ from __future__ import unicode_literals
 import re

 from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    get_element_by_attribute,
+    int_or_none,
+    parse_iso8601,
+    strip_or_none,
+    url_or_none
+)


-class RadioFranceIE(InfoExtractor):
-    _VALID_URL = r'^https?://maison\.radiofrance\.fr/radiovisions/(?P<id>[^?#]+)'
-    IE_NAME = 'radiofrance'
+class RadioFranceBaseIE(InfoExtractor):
+    _BASE_URL = r'https://www.radiofrance.fr/'

-    _TEST = {
-        'url': 'http://maison.radiofrance.fr/radiovisions/one-one',
-        'md5': 'bdbb28ace95ed0e04faab32ba3160daf',
+    def extract_api_data(self, api_path, id, html):
+        """Extract the API payload embedded in a page's SvelteKit data script.
+
+        api_path -- endpoint name expected in the script's data-url:
+                    'path' or 'stations'
+        id       -- identifier used in messages and when parsing JSON
+        html     -- markup of the downloaded page
+
+        Returns the 'content' member of the decoded body for 'path', and the
+        whole decoded body for 'stations'.
+        Raises ExtractorError if the data is empty, lacks the expected keys,
+        or api_path is not one of the supported values.
+        """
+        # Non-greedy capture: with DOTALL a greedy `.*` would run on to the
+        # last </script> of the page instead of the one closing this tag
+        pattern = r'<script [^>]*sveltekit:data-url="https://www\.radiofrance\.fr/api/v[\d.]+/%s[^>]*>(?P<json>.*?)</script>' % api_path
+        raw_data = self._search_regex(pattern, html, 'API data', flags=re.DOTALL, group='json')
+
+        if not raw_data:
+            raise ExtractorError('%s: JSON data not found' % id)
+
+        try:
+            # The script holds a JSON envelope whose 'body' is itself JSON text
+            envelope = self._parse_json(raw_data, id)
+            data = self._parse_json(envelope['body'], id)
+
+            if api_path == 'path':
+                return data['content']
+            elif api_path == 'stations':
+                return data
+            else:
+                raise ExtractorError('Coding error')
+        except KeyError:
+            raise ExtractorError('%s: Invalid JSON' % id)
+
+    def get_title(self, api_data, webpage=None):
+        """Return the stripped title from api_data, else one found in webpage.
+
+        When api_data has no usable 'title' and webpage is given, the first
+        <h1> element is tried, then the Open Graph title.
+        """
+        title = strip_or_none(api_data.get('title'))
+        if not title and webpage:
+            # get_element_by_attribute('h1', None, ...) looks for an attribute
+            # named "h1", not for the tag, so search for the <h1> element itself
+            heading = self._html_search_regex(
+                r'(?s)<h1\b[^>]*>(.+?)</h1>', webpage, 'title', default=None)
+            title = strip_or_none(heading) or strip_or_none(self._og_search_title(webpage))
+        return title
+
+    def get_description(self, api_data, webpage=None):
+        """Return the stripped 'standFirst' text, else the page's Open Graph description."""
+        stand_first = strip_or_none(api_data.get('standFirst'))
+        if stand_first or not webpage:
+            return stand_first
+        return strip_or_none(self._og_search_description(webpage))
+
+    def get_thumbnail(self, api_data, webpage=None):
+        """Return the 'src' URL of the API 'visual', else the page's Open Graph thumbnail."""
+        visual = api_data.get('visual')
+        image_url = url_or_none(visual.get('src')) if visual else None
+        if image_url or not webpage:
+            return image_url
+        return self._og_search_thumbnail(webpage)
+
+    def get_timestamp(self, api_data, webpage=None):
+        """Return the API 'publishedDate', else the page's article:published_time parsed as ISO 8601."""
+        published = api_data.get('publishedDate')
+        if published or not webpage:
+            return published
+        meta_value = self._html_search_meta(
+            'article:published_time', webpage, 'publication time')
+        return parse_iso8601(meta_value)
+
+    def get_brand(self, api_data, webpage=None):
+        """Return the stripped API 'brand', else the page's og:site_name (non-fatal lookup)."""
+        station = strip_or_none(api_data.get('brand'))
+        if station or not webpage:
+            return station
+        return self._og_search_property(
+            'site_name', webpage, 'Station name', fatal=False)
+
+    def extract_episode(self, episode_id, api_data):
+        """Return (url, duration) of an episode's first manifestation.
+
+        Returns (None, None) when api_data lists no manifestations. The URL is
+        None when the first manifestation has no valid 'url', so that callers
+        can treat the episode as unavailable instead of failing on a KeyError.
+        """
+        manifestations = api_data.get('manifestations')
+        if not manifestations:
+            return None, None
+
+        # Only the first manifestation is considered
+        first = manifestations[0]
+        url = url_or_none(first.get('url'))
+        duration = int_or_none(first.get('duration'))
+        return url, duration
+
+    def get_playlist_entries(self, playlist_url, playlist_id, api_data, direction):
+        """Collect the episode entries of a playlist page and of adjacent pages.
+
+        playlist_url -- URL of the playlist, re-fetched for the other pages
+        playlist_id  -- identifier of the playlist, used when downloading
+        api_data     -- decoded API data of the current page; its
+                        'expressions' member holds the items and paging info
+        direction    -- 'prev', 'next' or 'both': which adjacent pages to
+                        follow recursively from the current one
+
+        Returns a list of entry dicts, with the entries of previous pages
+        first and those of following pages last. Items without a path, with
+        an unrecognised path or without an audio URL are skipped.
+        """
+        playlist_data = api_data['expressions']
+
+        entries = []
+        # A page without 'items' contributes no entries
+        for item in playlist_data.get('items') or []:
+            episode_path = item.get('path')
+            if episode_path is None:
+                self.report_warning('No path found for episode "%s"' % item.get('title'))
+                continue
+            # Match without asserting, so that an unexpected path is reported
+            # and skipped rather than aborting the whole playlist
+            mobj = RadioFrancePodcastEpisodeIE._match_valid_url(self._BASE_URL + episode_path)
+            if mobj is None:
+                self.report_warning('Could not parse id of episode from path: "%s"' % episode_path)
+                continue
+            episode_id = mobj.group('id')
+            episode_url, duration = self.extract_episode(episode_id, item)
+            if episode_url is None:
+                self.to_screen('Episode "%s" is not available' % episode_path)
+                continue
+            entries.append({
+                'id': episode_id,
+                'url': episode_url,
+                'title': self.get_title(item),
+                'description': self.get_description(item),
+                'timestamp': self.get_timestamp(item),
+                'thumbnail': self.get_thumbnail(item),
+                'duration': duration,
+            })
+
+        page_number = int_or_none(playlist_data.get('pageNumber'))
+        if page_number:
+            # Each recursive call keeps moving in a single direction, so no
+            # page is visited twice
+            if direction in ['both', 'prev'] and playlist_data.get('prev') is not None:
+                _, other_api_data = self.get_data(playlist_url, 'path', playlist_id, page=page_number - 1)
+                entries = self.get_playlist_entries(playlist_url, playlist_id, other_api_data, direction='prev') + entries
+            if direction in ['both', 'next'] and playlist_data.get('next') is not None:
+                _, other_api_data = self.get_data(playlist_url, 'path', playlist_id, page=page_number + 1)
+                entries = entries + self.get_playlist_entries(playlist_url, playlist_id, other_api_data, direction='next')
+
+        return entries
+
+    def get_data(self, url, api_path, id, page=None):
+        """Download url (optionally a given page of it) and return (webpage, api_data).
+
+        A truthy `page` is sent as the 'p' query parameter and shown in the
+        download note; api_data is what extract_api_data() finds for api_path.
+        """
+        if page:
+            query, note = {'p': page}, "Downloading page %i" % page
+        else:
+            query, note = {}, None
+        webpage = self._download_webpage(url, id, query=query, note=note)
+        return webpage, self.extract_api_data(api_path, id, webpage)
+
+
+class RadioFrancePodcastEpisodeIE(RadioFranceBaseIE):
+    _VALID_URL = r'https?://www\.radiofrance\.fr/(?:francemusique|franceinter|franceculture|franceinfo|mouv|fip)/podcasts/.+/.+-(?P<id>\d+)$'
+
+    _TESTS = [{
+        'note': 'Podcast episode with audio from France Info',
+        'url': 'https://www.radiofrance.fr/franceinfo/podcasts/le-brief-eco/le-brief-eco-du-lundi-05-septembre-2022-8310713',
        'info_dict': {
-            'id': 'one-one',
-            'ext': 'ogg',
-            'title': 'One to one',
-            'description': "Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.",
-            'uploader': 'Thomas Hercouët',
-        },
-    }
+            'id': '8310713',
+            'ext': 'mp3',
+            'url': r're:^https?://.*\.mp3$',
+            'title': 'Pour la première fois en vingt ans, l’euro passe sous les 0,99\u00a0dollar',
+            'description': str,
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'timestamp': int,
+            'duration': int,
+            'upload_date': str
+        }
+    }, {
+        'note': 'Podcast episode from France Musique',
+        'url': 'https://www.radiofrance.fr/francemusique/podcasts/allegretto/lever-du-jour-9233228',
+        'only_matching': True
+    }, {
+        'note': 'Podcast episode from FranceInter',
+        'url': 'https://www.radiofrance.fr/franceinter/podcasts/rendez-vous-avec-x/un-mysterieux-echange-digne-de-la-guerre-froide-9343281',
+        'only_matching': True
+    }, {
+        'note': 'Podcast episode from France Culture',
+        'url': 'https://www.radiofrance.fr/franceculture/podcasts/la-science-cqfd/teotihuacan-la-plus-mysterieuse-des-cites-d-or-9224610',
+        'only_matching': True
+    }, {
+        'note': 'Podcast episode from Le Mouv',
+        'url': 'https://www.radiofrance.fr/mouv/podcasts/mouv-dj-la-caution/ncr2a-ne-cherche-rien-d-autre-ailleurs-1197950',
+        'only_matching': True
+    }, {
+        'note': 'Podcast episode from FIP',
+        'url': 'https://www.radiofrance.fr/fip/podcasts/certains-l-aiment-fip/hommage-au-cinema-de-vangelis-4734742',
+        'only_matching': True
+    }]

    def _real_extract(self, url):
-        m = re.match(self._VALID_URL, url)
-        video_id = m.group('id')
-
-        webpage = self._download_webpage(url, video_id)
-        title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, 'title')
-        description = self._html_search_regex(
-            r'<div class="bloc_page_wrapper"><div class="text">(.*?)</div>',
-            webpage, 'description', fatal=False)
-        uploader = self._html_search_regex(
-            r'<div class="credit">&nbsp;&nbsp;&copy;&nbsp;(.*?)</div>',
-            webpage, 'uploader', fatal=False)
-
-        formats_str = self._html_search_regex(
-            r'class="jp-jplayer[^"]*" data-source="([^"]+)">',
-            webpage, 'audio URLs')
-        formats = [
-            {
-                'format_id': fm[0],
-                'url': fm[1],
-                'vcodec': 'none',
-                'preference': i,
-            }
-            for i, fm in
-            enumerate(re.findall(r"([a-z0-9]+)\s*:\s*'([^']+)'", formats_str))
-        ]
-        self._sort_formats(formats)
+        id = self._match_id(url)
+        webpage, api_data = self.get_data(url, 'path', id)
+        url, duration = self.extract_episode(id, api_data)
+        if url is None:
+            msg = 'Podcast file is not available. If the show is too recent, the file may not have been uploaded yet: try again later.'
+            raise ExtractorError(msg, expected=True, video_id=id)

        return {
-            'id': video_id,
-            'title': title,
-            'formats': formats,
-            'description': description,
-            'uploader': uploader,
+            'id': id,
+            'url': url,
+            'title': self.get_title(api_data, webpage),
+            'description': self.get_description(api_data, webpage),
+            'timestamp': self.get_timestamp(api_data, webpage),
+            'thumbnail': self.get_thumbnail(api_data, webpage),
+            'channel_id': self.get_brand(api_data, webpage),
+            'duration': duration,
+        }
+
+
+class RadioFrancePodcastPlaylistIE(RadioFranceBaseIE):
+    _VALID_URL = r'https?://www\.radiofrance\.fr/(?:francemusique|franceinter|franceculture|franceinfo|mouv|fip)/podcasts/(?P<id>[^/]+?)(?:[?#].*)?$'
+
+    _TESTS = [{
+        'note': 'Podcast show with multiple pages of episodes and some of them are missing',
+        'url': 'https://www.radiofrance.fr/franceculture/podcasts/une-semaine-dans-le-monde-10-11?p=2',
+        'info_dict': {
+            'id': 'une-semaine-dans-le-monde-10-11',
+            'title': 'Une semaine dans le monde | 10-11',
+            'description': str,
+            'timestamp': int
+        },
+        'playlist_count': 23,
+    }]
+
+    def _real_extract(self, url):
+        """Return a playlist result with the episodes of all pages of a podcast show."""
+        playlist_id = self._match_id(url)
+        webpage, api_data = self.get_data(url, 'path', playlist_id)
+
+        # Entries are collected across pages, then presented in reverse order
+        collected = self.get_playlist_entries(url, playlist_id, api_data, direction='both')
+        entries = collected[::-1]
+
+        return {
+            'id': playlist_id,
+            '_type': 'playlist',
+            'entries': entries,
+            'title': self.get_title(api_data, webpage),
+            'description': self.get_description(api_data, webpage),
+            'timestamp': self.get_timestamp(api_data, webpage),
+            'thumbnail': self.get_thumbnail(api_data, webpage),
+            'channel_id': self.get_brand(api_data, webpage),
+        }
+
+
+class RadioFranceWebradioIE(RadioFranceBaseIE):
+    _VALID_URL = r'https?://www\.radiofrance\.fr/(?:francemusique|franceinter|franceculture|franceinfo|mouv|fip)/(?P<id>radio-[^/]+)$'
+
+    _TESTS = [{
+        'note': 'Full list of webradios available at https://www.radiofrance.fr/ecouter-musique',
+        'url': 'https://www.radiofrance.fr/fip/radio-metal',
+        'info_dict': {
+            'id': 'radio-metal',
+            'ext': 'aac',
+            'title': str,
+        },
+        'params': {
+            'format': 'aac',
+            'skip_download': True,
+        }
+    }]
+
+    def get_livestream_formats(self, id, api_data):
+        """Build the format list from api_data['media']['sources'].
+
+        Sources without a URL are skipped. 'mp3' and 'aac' sources get a codec,
+        bitrate and preference; 'hls' sources get a manifest URL; any other
+        source is kept with the common fields only.
+        Raises ExtractorError when no source has a URL.
+        """
+        formats = []
+        for source in api_data['media']['sources']:
+            stream_url = source.get('url')
+            if not stream_url:
+                continue
+
+            kind = source.get('format')
+            fmt = {
+                'url': stream_url,
+                'format_id': kind,
+                'asr': 48000,
+                'vcodec': 'none'
+            }
+            if kind in ('mp3', 'aac'):
+                # aac is preferred over mp3, and both over hls
+                fmt['preference'] = 1 if kind == 'mp3' else 2
+                fmt['acodec'] = kind
+                fmt['abr'] = source.get('bitrate')
+            elif kind == 'hls':
+                fmt['preference'] = 0
+                fmt['manifest_url'] = stream_url
+            formats.append(fmt)
+
+        if not formats:
+            raise ExtractorError('No live streaming URL found')
+        return formats
+
+    def _real_extract(self, url):
+        id = self._match_id(url)
+        webpage, api_data = self.get_data(url, 'stations', id)
+
+        return {
+            'id': id,
+            'title': self.get_title(api_data, webpage),
+            'formats': self.get_livestream_formats(id, api_data),
+            'thumbnail': self.get_thumbnail(api_data, webpage),
+            'channel_id': self.get_brand(api_data, webpage),
+            'is_live': True
        }
--- a/youtube_dl/extractor/vidlii.py
+++ b/youtube_dl/extractor/vidlii.py
@ -4,6 +4,7 @@ from __future__ import unicode_literals
 import re

 from .common import InfoExtractor
+
 from ..utils import (
    float_or_none,
    get_element_by_id,
@ -11,6 +12,7 @@ from ..utils import (
    strip_or_none,
    unified_strdate,
    urljoin,
+    str_to_int,
 )


@ -35,6 +37,26 @@ class VidLiiIE(InfoExtractor):
            'categories': ['News & Politics'],
            'tags': ['Vidlii', 'Jan', 'Videogames'],
        }
+    }, {
+        # HD
+        'url': 'https://www.vidlii.com/watch?v=2Ng8Abj2Fkl',
+        'md5': '450e7da379c884788c3a4fa02a3ce1a4',
+        'info_dict': {
+            'id': '2Ng8Abj2Fkl',
+            'ext': 'mp4',
+            'title': 'test',
+            'description': 'md5:cc55a86032a7b6b3cbfd0f6b155b52e9',
+            'thumbnail': 'https://www.vidlii.com/usfi/thmp/2Ng8Abj2Fkl.jpg',
+            'uploader': 'VidLii',
+            'uploader_url': 'https://www.vidlii.com/user/VidLii',
+            'upload_date': '20200927',
+            'duration': 5,
+            'view_count': int,
+            'comment_count': int,
+            'average_rating': float,
+            'categories': ['Film & Animation'],
+            'tags': list,
+        },
    }, {
        'url': 'https://www.vidlii.com/embed?v=tJluaH4BJ3v&a=0',
        'only_matching': True,
@ -46,11 +68,32 @@ class VidLiiIE(InfoExtractor):
        webpage = self._download_webpage(
            'https://www.vidlii.com/watch?v=%s' % video_id, video_id)

-        video_url = self._search_regex(
-            r'src\s*:\s*(["\'])(?P<url>(?:https?://)?(?:(?!\1).)+)\1', webpage,
-            'video url', group='url')
+        formats = []

-        title = self._search_regex(
+        def add_format(format_url, height=None):
+            """Append a format dict for format_url to the formats list.
+
+            When height is not supplied it is read from the digits that
+            immediately precede '.mp4' in the URL, defaulting to 360.
+            """
+            if height is None:
+                # an explicitly passed height used to be overwritten here
+                height = int(self._search_regex(
+                    r'(\d+)\.mp4', format_url, 'height', default=360))
+
+            formats.append({
+                'url': format_url,
+                'format_id': '%dp' % height if height else None,
+                'height': height,
+            })
+
+        sources = re.findall(
+            r'src\s*:\s*(["\'])(?P<url>(?:https?://)?(?:(?!\1).)+)\1',
+            webpage)
+
+        formats = []
+        if len(sources) > 1:
+            add_format(sources[1][1])
+            self._check_formats(formats, video_id)
+        if len(sources) > 0:
+            add_format(sources[0][1])
+
+        self._sort_formats(formats)
+
+        title = self._html_search_regex(
            (r'<h1>([^<]+)</h1>', r'<title>([^<]+) - VidLii<'), webpage,
            'title')

@ -82,9 +125,9 @@ class VidLiiIE(InfoExtractor):
            default=None) or self._search_regex(
            r'duration\s*:\s*(\d+)', webpage, 'duration', fatal=False))

-        view_count = int_or_none(self._search_regex(
-            (r'<strong>(\d+)</strong> views',
-             r'Views\s*:\s*<strong>(\d+)</strong>'),
+        view_count = str_to_int(self._html_search_regex(
+            (r'<strong>([\d,.]+)</strong> views',
+             r'Views\s*:\s*<strong>([\d,.]+)</strong>'),
            webpage, 'view count', fatal=False))

        comment_count = int_or_none(self._search_regex(
@ -109,7 +152,7 @@ class VidLiiIE(InfoExtractor):

        return {
            'id': video_id,
-            'url': video_url,
+            'formats': formats,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
Author	SHA1	Message	Date
Olivier Trichet	5fb09daf08	Merge `72db217289` into `0153b387e5`	2024-06-14 12:01:11 +00:00
Paper	0153b387e5	[VidLii] Add 720p support (#30924 ) * [VidLii] Add HD support (yt-dlp backport-ish) * Also fix a bug with the view count --------- Co-authored-by: dirkf <fieldhouse@gmx.net>	2024-06-11 13:21:39 +01:00
dirkf	a48fe7491d	[ORF] Skip tests with limited availability	2024-06-11 12:52:13 +01:00
dirkf	e20ca543f0	[ORF] Re-factor and update `ORFFM4StoryIE` * fix getting media via DASH instead of inaccessible mp4 * also get in-page YT media	2024-06-11 12:52:13 +01:00
dirkf	e39466051f	[ORF] Support sound.orf.at, updating `ORFRadioIE` * maintain support for xx.orf.at/player/... URLs * add `ORFRadioCollectionIE` to support playlists in ORF Sound * back-port and re-work `ORFPodcastIE` from https://github.com/yt-dlp/yt-dlp/pull/8486, thx Esokrates	2024-06-11 12:52:13 +01:00
dirkf	d95c0d203f	[ORF] Support on.orf.at, replacing `ORFTVthekIE` * add `ORFONIE`, back-porting yt-dlp PR https://github.com/yt-dlp/yt-dlp/pull/9113 and friends: thx HobbyistDev, TuxCoder, seproDev * re-factor to support livestreams via new `ORFONliveIE`	2024-06-11 12:52:13 +01:00
dirkf	3bde6a5752	[test] Improve download test * skip reason can't be unicode in Py2 * remove duplicate assert...Equal functions	2024-06-11 12:52:13 +01:00
dirkf	50f6c5668a	[core] Re-factor with `_fill_common_fields()` as used in yt-dlp	2024-06-11 12:52:13 +01:00
dirkf	b4ff08bd2d	[core] Safer handling of nested playlist data	2024-06-11 12:52:13 +01:00
kmnx	88bd8b9f87	[mixcloud] updated mixcloud API server address (#32557 ) * updated mixcloud API server address * fix tests * etc --------- Co-authored-by: dirkf <fieldhouse@gmx.net>	2024-06-11 12:38:24 +01:00
Olivier Trichet	72db217289	[RadioFrance] Extractor for thematic webradios	2022-12-22 14:22:19 -05:00
Olivier Trichet	fc933e686b	[RadioFrance] Refactoring	2022-12-22 13:01:10 -05:00
Olivier Trichet	ea02c40539	[RadioFrance] Extractor for podcast playlists	2022-12-22 13:00:54 -05:00
Olivier Trichet	7270ecf3d6	[RadioFrance] Extractor for podcast of Radio France stations	2022-12-22 13:00:17 -05:00
Olivier Trichet	dade9111f1	[RadioFrance] Remove old Radio France stations extractors These are not working anymore after their respective websites were merged into www.radiofrance.fr.	2022-12-22 13:00:08 -05:00