From a142226695e695846a48c17ab8ae2d7251ebfa4a Mon Sep 17 00:00:00 2001
From: Kay B <>
Date: Wed, 14 Mar 2018 22:35:59 +0100
Subject: [PATCH] [requestforcomments] new extractor for requestforcomments podcast

---
 youtube_dl/extractor/extractors.py         |  1 +
 youtube_dl/extractor/requestforcomments.py | 75 ++++++++++++++++++++++
 2 files changed, 76 insertions(+)
 create mode 100644 youtube_dl/extractor/requestforcomments.py

diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 74ec899f4..b14bfa26b 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -885,6 +885,7 @@ from .rentv import (
     RENTVIE,
     RENTVArticleIE,
 )
+from .requestforcomments import RequestForCommentsIE
 from .restudy import RestudyIE
 from .reuters import ReutersIE
 from .reverbnation import ReverbNationIE
diff --git a/youtube_dl/extractor/requestforcomments.py b/youtube_dl/extractor/requestforcomments.py
new file mode 100644
index 000000000..75ff84cad
--- /dev/null
+++ b/youtube_dl/extractor/requestforcomments.py
@@ -0,0 +1,75 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class RequestForCommentsIE(InfoExtractor):
+    """Extract the audio formats of a requestforcomments.de episode page.
+
+    Formats are built from the page's ``og:audio`` and ``og:audio:type``
+    meta tags, paired in the order they appear in the document.
+    """
+
+    _VALID_URL = r'https?://(?:www\.)?requestforcomments\.de/(?:archives/|\?p=)(?P<id>[^/?#&\s]+)'
+    _TESTS = [{
+        'url': 'https://requestforcomments.de/archives/412',
+        'info_dict': {
+            'id': '412',
+            'ext': 'ogg',
+            'formats': 'mincount:4',
+            'title': 'RFCE014: IPv6',
+            'description': 'md5:e0924fc2a3536107c2055b3c36bef2e9',
+            'site_name': 'Request for Comments',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://requestforcomments.de/?p=412',
+        'info_dict': {
+            'id': '412',
+            'ext': 'ogg',
+            'formats': 'mincount:4',
+            'title': 'RFCE014: IPv6',
+            'description': 'md5:e0924fc2a3536107c2055b3c36bef2e9',
+            'site_name': 'Request for Comments',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    def _og_find_all(self, prop, webpage):
+        """Return the content of every og:PROP meta tag, in page order."""
+        # NOTE: relies on _og_regexes() capturing the content attribute in
+        # one group per quoting style, so every findall() hit is a tuple;
+        # keep whichever group matched instead of assuming group 0.
+        return [
+            next((group for group in groups if group), '')
+            for groups in re.findall(self._og_regexes(prop)[0], webpage)]
+
+    def _real_extract(self, url):
+        """Download the episode page and return its info dict."""
+        content_id = self._match_id(url)
+        webpage = self._download_webpage(url, content_id)
+
+        formats = [{
+            'url': audio_url,
+            'format_id': audio_type,
+        } for audio_url, audio_type in zip(
+            self._og_find_all('audio', webpage),
+            self._og_find_all('audio:type', webpage))]
+
+        return {
+            'id': content_id,
+            'title': self._og_search_title(webpage),
+            'site_name': self._og_search_property('site_name', webpage),
+            'description': self._og_search_description(webpage),
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'formats': formats,
+        }