1
0
mirror of https://github.com/ytdl-org/youtube-dl.git synced 2025-01-25 04:09:50 +00:00

[polskieradio] Add support for downloading whole programmes.

This extends the Polskie Radio (the Polish national radio) extractor to
enable the user to download all the broadcasts of a single programme.
This commit is contained in:
Jakub Adam Wieczorek 2016-08-25 23:04:59 +02:00
parent 155bc674c4
commit 8d3737cda7
2 changed files with 80 additions and 1 deletions

View File

@ -667,7 +667,7 @@ from .pluralsight import (
) )
from .podomatic import PodomaticIE from .podomatic import PodomaticIE
from .pokemon import PokemonIE from .pokemon import PokemonIE
from .polskieradio import PolskieRadioIE from .polskieradio import PolskieRadioIE, PolskieRadioProgrammeIE
from .porn91 import Porn91IE from .porn91 import Porn91IE
from .porncom import PornComIE from .porncom import PornComIE
from .pornhd import PornHdIE from .pornhd import PornHdIE

View File

@ -7,6 +7,7 @@ from .common import InfoExtractor
from ..compat import ( from ..compat import (
compat_str, compat_str,
compat_urllib_parse_unquote, compat_urllib_parse_unquote,
compat_urlparse
) )
from ..utils import ( from ..utils import (
int_or_none, int_or_none,
@ -15,6 +16,84 @@ from ..utils import (
) )
class PolskieRadioProgrammeIE(InfoExtractor):
    """Extract all broadcasts listed on a Polskie Radio programme page.

    The programme page is downloaded, every ``<article>`` teaser on it is
    turned into a URL result handled by ``PolskieRadioIE``, and any further
    numbered pages ("strona N") advertised by the pagination links are
    fetched and scanned the same way. The result is a single playlist.
    """

    _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+(,[^/]+)?/(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://www.polskieradio.pl/7/5102,HISTORIA-ZYWA',
        'info_dict': {
            'id': '5102',
            'title': 'HISTORIA ŻYWA',
        },
        'playlist_mincount': 34,
    }, {
        'url': 'http://www.polskieradio.pl/7/4807',
        'info_dict': {
            'id': '4807',
            'title': 'Vademecum 1050. rocznicy Chrztu Polski'
        },
        'playlist_mincount': 5
    }, {
        'url': 'http://www.polskieradio.pl/7/129,Sygnaly-dnia?ref=source',
        'only_matching': True
    }, {
        'url': 'http://www.polskieradio.pl/37,RedakcjaKatolicka/4143,Kierunek-Krakow',
        'info_dict': {
            'id': '4143',
            'title': 'Kierunek Kraków',
        },
        'playlist_mincount': 61
    }, {
        'url': 'http://www.polskieradio.pl/7,Jedynka/5102,HISTORIA-ZYWA',
        'only_matching': True
    }]

    def _get_entries_from_page_content(self, base_url, content):
        """Return one URL result per article teaser found in ``content``.

        ``base_url`` is the URL against which the teasers' ``href`` values
        are resolved; ``content`` is the HTML of one programme page.
        """
        # Captured groups: article id, href, optional data-layer attribute
        # (ignored), title attribute.
        articles = re.findall(
            r'<article class="ID-(\d+) article">\s+<a href="([^"]+)"( data-layer="[^"]*")? class="[^"]*" title="([^"]+)">',
            content)
        return [
            self.url_result(
                compat_urlparse.urljoin(base_url, article_url),
                ie=PolskieRadioIE.ie_key(),
                video_id=article_id,
                video_title=article_title)
            for article_id, article_url, _, article_title in articles]

    @classmethod
    def suitable(cls, url):
        # _VALID_URL is not anchored at the end, so it also matches the
        # ".../Artykul/<id>" URLs of single broadcasts; let PolskieRadioIE
        # claim those.
        if PolskieRadioIE.suitable(url):
            return False
        return super(PolskieRadioProgrammeIE, cls).suitable(url)

    def _real_extract(self, url):
        """Build a playlist from every page of the programme at ``url``."""
        programme_id = self._match_id(url)
        webpage = self._download_webpage(url, programme_id)

        # The title is optional (fatal=False): a playlist without a title is
        # still useful.
        title = self._html_search_regex(
            r'<a href="[^"]+" id=".*_linkCategory" title="[^"]+">(.+?)</a>',
            webpage, 'title', fatal=False)

        entries = self._get_entries_from_page_content(url, webpage)

        # Pagination links; captured groups: whole href attribute (unused),
        # URL prefix ending in "/Strona/", page number. The href is optional
        # in the pattern, so the prefix group may be empty for some links.
        pages = re.findall(
            r'<a( href="([^"]+/Strona/)\d+")? id="[^"]+" title="strona&#32;(\d+)"',
            webpage)
        page_count = max(int(page_number) for _, _, page_number in pages) if pages else 1

        if page_count > 1:
            # Take the URL prefix from the first link that actually has one.
            # A default is supplied so that a page without any usable link
            # does not escape as a bare StopIteration.
            page_url_root = next(
                (page_href for _, page_href, _ in pages if page_href), None)
            if page_url_root is None:
                self.report_warning(
                    'Unable to determine the URL of further pages; '
                    'only the first page will be extracted')
            else:
                for page_number in range(2, page_count + 1):
                    resolved_page_url = compat_urlparse.urljoin(
                        url, page_url_root + str(page_number))
                    page_content = self._download_webpage(
                        resolved_page_url, programme_id,
                        note="Downloading page number %d" % page_number)
                    entries.extend(
                        self._get_entries_from_page_content(url, page_content))

        return self.playlist_result(entries, programme_id, title, None)
class PolskieRadioIE(InfoExtractor): class PolskieRadioIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+/\d+/Artykul/(?P<id>[0-9]+)' _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+/\d+/Artykul/(?P<id>[0-9]+)'
_TESTS = [{ _TESTS = [{