Add ArteTVCategoryIE to support category playlists

2026-06-06 20:55:23 +00:00 · 2021-10-18 15:54:26 +01:00
parent 16a3fe2ba6
commit ddc080a562
2 changed files with 48 additions and 0 deletions
@@ -12,6 +12,7 @@ from ..utils import (
    ExtractorError,
    int_or_none,
    qualities,
+    strip_or_none,
    try_get,
    unified_strdate,
    url_or_none,
@@ -252,3 +253,49 @@ class ArteTVPlaylistIE(ArteTVBaseIE):
        title = collection.get('title')
        description = collection.get('shortDescription') or collection.get('teaserText')
        return self.playlist_result(entries, playlist_id, title, description)
+
+
+class ArteTVCategoryIE(ArteTVBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES
+    _TESTS = [{
+        'url': 'https://www.arte.tv/en/videos/politics-and-society/',
+        'info_dict': {
+            'id': 'politics-and-society',
+            'title': 'Politics and society',
+            'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
+        },
+        'playlist_mincount': 13,
+    },
+    ]
+
+    @classmethod
+    def suitable(cls, url):
+        return (
+            not any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE, ))
+            and super(ArteTVCategoryIE, cls).suitable(url))
+
+    def _real_extract(self, url):
+        lang, playlist_id = re.match(self._VALID_URL, url).groups()
+        webpage = self._download_webpage(url, playlist_id)
+
+        items = []
+        for video in re.finditer(
+                r'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang,
+                webpage):
+            video = video.group('url')
+            if video == url:
+                continue
+            if any(ie.suitable(video) for ie in (ArteTVIE, ArteTVPlaylistIE, )):
+                items.append(video)
+
+        if items:
+            title = (self._og_search_title(webpage, default=None)
+                     or self._html_search_regex(r'<title\b[^>]*>([^<]+)</title>', default=None))
+            title = strip_or_none(title.rsplit('|', 1)[0]) or self._generic_title(url)
+
+            result = self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title)
+            if result:
+                description = self._og_search_description(webpage, default=None)
+                if description:
+                    result['description'] = description
+                return result
@@ -71,6 +71,7 @@ from .arte import (
    ArteTVIE,
    ArteTVEmbedIE,
    ArteTVPlaylistIE,
+    ArteTVCategoryIE,
 )
 from .arnes import ArnesIE
 from .asiancrush import (