youtube-dl/youtube_dl/extractor/wat.py

# coding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
    ExtractorError,
    int_or_none,
    try_get,
    unified_strdate,
)


class WatIE(InfoExtractor):
    """Information extractor for wat.tv video pages and ``wat:<id>`` URLs.

    Media metadata is requested from the ``mediainfo.tf1.fr`` "mediainfocombo"
    endpoint; when that response yields no formats, the legacy
    ``www.wat.tv/get/webhtml/`` endpoint is queried as a fallback.
    """

    IE_NAME = 'wat.tv'
    _VALID_URL = r'(?:wat:|https?://(?:www\.)?wat\.tv/video/.*-)(?P<id>[0-9a-z]+)'
    _GEO_BYPASS = False
    _TESTS = [
        {
            'url': 'http://www.wat.tv/video/soupe-figues-l-orange-aux-epices-6z1uz_2hvf7_.html',
            'info_dict': {
                'id': '11713067',
                'ext': 'mp4',
                'title': 'Soupe de figues à l\'orange et aux épices',
                'description': 'Retrouvez l\'émission "Petits plats en équilibre", diffusée le 18 août 2014.',
                'upload_date': '20140819',
                'duration': 120,
            },
            'params': {
                # m3u8 download
                'skip_download': True,
            },
            'expected_warnings': ['HTTP Error 404'],
            'skip': 'This content is no longer available',
        },
        {
            'url': 'http://www.wat.tv/video/gregory-lemarchal-voix-ange-6z1v7_6ygkj_.html',
            'md5': 'b16574df2c3cd1a36ca0098f2a791925',
            'info_dict': {
                'id': '11713075',
                'ext': 'mp4',
                'title': 'Grégory Lemarchal, une voix d\'ange depuis 10 ans (1/3)',
                'upload_date': '20140816',
            },
            'expected_warnings': ["Ce contenu n'est pas disponible pour l'instant."],
            'skip': 'This content is no longer available',
        },
    ]

    def _manifest_formats(self, manifest_urls, video_id):
        """Collect formats from a mapping of manifest kind -> manifest URL.

        Entries with an empty URL are skipped.  The 'hls' kind is handed to
        the m3u8 extractor, 'dash' and 'mpd' to the MPD extractor; any other
        kind is ignored.  Both extractors are called with ``fatal=False``.

        Returns the list of format dicts gathered, in mapping order.
        """
        collected = []
        for kind, manifest_url in manifest_urls.items():
            if not manifest_url:
                continue
            if kind == 'hls':
                collected.extend(self._extract_m3u8_formats(
                    manifest_url, video_id, 'mp4',
                    'm3u8_native', m3u8_id='hls', fatal=False))
            elif kind in ('dash', 'mpd'):
                # Point the das-q1 host at its "-ssl" counterpart before
                # fetching the DASH manifest.
                mpd_url = manifest_url.replace(
                    '://das-q1.tf1.fr/', '://das-q1-ssl.tf1.fr/')
                collected.extend(self._extract_mpd_formats(
                    mpd_url, video_id, mpd_id='dash', fatal=False))
        return collected

    def _real_extract(self, url):
        """Build the info dict (id, title, thumbnail, upload date, duration,
        formats) for the video referenced by *url*.

        Raises ExtractorError (expected) when the media info carries an
        error description, or when no format was found and the delivery
        data is flagged as DRM.
        """
        matched_id = self._match_id(url)
        # An all-digit id longer than six characters is used as is; anything
        # else is treated as a base-36 number and converted to decimal.
        if matched_id.isdigit() and len(matched_id) > 6:
            video_id = matched_id
        else:
            video_id = compat_str(int(matched_id, 36))

        # Historical note: the website's 'contentv4' interface
        # (http://www.wat.tv/interface/contentv4s/<id>) also returns the
        # related videos, which are not needed here, so it is not queried.
        media_info_url = 'https://mediainfo.tf1.fr/mediainfocombo/' + video_id
        video_data = self._download_json(
            media_info_url, video_id, query={'context': 'MYTF1'})
        video_info = video_data['media']

        error_desc = video_info.get('error_desc')
        if error_desc:
            if video_info.get('error_code') == 'GEOBLOCKED':
                self.raise_geo_restricted(error_desc, video_info.get('geoList'))
            raise ExtractorError(error_desc, expected=True)

        title = video_info['title']

        # First attempt: the single manifest advertised in the 'delivery'
        # section of the media info response.
        delivery = video_data.get('delivery') or {}
        formats = self._manifest_formats(
            {delivery.get('format'): delivery.get('url')}, video_id)

        if not formats:
            if delivery.get('drm'):
                raise ExtractorError('This video is DRM protected.', expected=True)
            # Second attempt: the legacy endpoint (non-fatal download).
            fallback_manifests = self._download_json(
                'http://www.wat.tv/get/webhtml/' + video_id, video_id, fatal=False)
            if fallback_manifests:
                formats = self._manifest_formats(fallback_manifests, video_id)

        self._sort_formats(formats)

        thumbnail = video_info.get('preview')
        # The upload date is read from the first mediametrie chapter.
        raw_upload_date = try_get(
            video_data, lambda x: x['mediametrie']['chapters'][0]['estatS4'])
        upload_date = unified_strdate(raw_upload_date)
        duration = int_or_none(video_info.get('duration'))

        return {
            'id': video_id,
            'title': title,
            'thumbnail': thumbnail,
            'upload_date': upload_date,
            'duration': duration,
            'formats': formats,
        }
WatIE: support videos divided in multiple parts (closes #222 and #659) The id for the videos is now the full id, no the one in the webpage url. Also extract more information: description, view_count and upload_date 2013-06-29 16:22:03 +00:00			`# coding: utf-8`
[wat] Modernize 2014-03-29 14:15:16 +00:00			`from __future__ import unicode_literals`
WatIE: support videos divided in multiple parts (closes #222 and #659) The id for the videos is now the full id, no the one in the webpage url. Also extract more information: description, view_count and upload_date 2013-06-29 16:22:03 +00:00
Add WatIE 2013-06-28 20:01:47 +00:00			`from .common import InfoExtractor`
[wat] extract all formats 2016-04-22 08:36:14 +00:00			`from ..compat import compat_str`
[wat] Capture and output error message 2014-09-23 12:58:35 +00:00			`from ..utils import (`
[tf1] improve extraction(closes #27980)(closes #28040) 2021-02-23 11:39:46 +00:00			`ExtractorError,`
[wat] improve extraction(#10281) add alternative method to extract http formats works even if the video is geo-restricted or removed from public access(most of the cases) 2016-08-10 13:17:22 +00:00			`int_or_none,`
[tf1] improve extraction(closes #27980)(closes #28040) 2021-02-23 11:39:46 +00:00			`try_get,`
			`unified_strdate,`
[wat] Capture and output error message 2014-09-23 12:58:35 +00:00			`)`
Add WatIE 2013-06-28 20:01:47 +00:00

			`class WatIE(InfoExtractor):`
[wat] extract all formats 2016-04-22 08:36:14 +00:00			`_VALID_URL = r'(?:wat:\|https?://(?:www\.)?wat\.tv/video/.*-)(?P<id>[0-9a-z]+)'`
Add WatIE 2013-06-28 20:01:47 +00:00			`IE_NAME = 'wat.tv'`
[wat] Use server time and pass country argument (Closes #3579) 2014-08-25 13:21:33 +00:00			`_TESTS = [`
			`{`
			`'url': 'http://www.wat.tv/video/soupe-figues-l-orange-aux-epices-6z1uz_2hvf7_.html',`
			`'info_dict': {`
			`'id': '11713067',`
			`'ext': 'mp4',`
			`'title': 'Soupe de figues à l\'orange et aux épices',`
			`'description': 'Retrouvez l\'émission "Petits plats en équilibre", diffusée le 18 août 2014.',`
			`'upload_date': '20140819',`
			`'duration': 120,`
			`},`
[wat] try all supported adaptive urls 2018-06-17 14:56:52 +00:00			`'params': {`
			`# m3u8 download`
			`'skip_download': True,`
			`},`
			`'expected_warnings': ['HTTP Error 404'],`
[tf1] improve extraction(closes #27980)(closes #28040) 2021-02-23 11:39:46 +00:00			`'skip': 'This content is no longer available',`
[wat] Use server time and pass country argument (Closes #3579) 2014-08-25 13:21:33 +00:00			`},`
			`{`
			`'url': 'http://www.wat.tv/video/gregory-lemarchal-voix-ange-6z1v7_6ygkj_.html',`
[wat] try all supported adaptive urls 2018-06-17 14:56:52 +00:00			`'md5': 'b16574df2c3cd1a36ca0098f2a791925',`
[wat] Use server time and pass country argument (Closes #3579) 2014-08-25 13:21:33 +00:00			`'info_dict': {`
			`'id': '11713075',`
			`'ext': 'mp4',`
			`'title': 'Grégory Lemarchal, une voix d\'ange depuis 10 ans (1/3)',`
			`'upload_date': '20140816',`
			`},`
[wat] improve extraction(#10281) add alternative method to extract http formats works even if the video is geo-restricted or removed from public access(most of the cases) 2016-08-10 13:17:22 +00:00			`'expected_warnings': ["Ce contenu n'est pas disponible pour l'instant."],`
[tf1] improve extraction(closes #27980)(closes #28040) 2021-02-23 11:39:46 +00:00			`'skip': 'This content is no longer available',`
Disable way and tf1 tests, the whole videos are served sometimes, so the md5 sum doesn't match. 2013-07-30 09:19:07 +00:00			`},`
[wat] Use server time and pass country argument (Closes #3579) 2014-08-25 13:21:33 +00:00			`]`
[tf1] improve extraction(closes #27980)(closes #28040) 2021-02-23 11:39:46 +00:00			`_GEO_BYPASS = False`
[wat] Modernize 2014-03-29 14:15:16 +00:00
Add WatIE 2013-06-28 20:01:47 +00:00			`def _real_extract(self, url):`
[wat] extract all formats 2016-04-22 08:36:14 +00:00			`video_id = self._match_id(url)`
			`video_id = video_id if video_id.isdigit() and len(video_id) > 6 else compat_str(int(video_id, 36))`
WatIE: support videos divided in multiple parts (closes #222 and #659) The id for the videos is now the full id, no the one in the webpage url. Also extract more information: description, view_count and upload_date 2013-06-29 16:22:03 +00:00
[wat] extract all formats 2016-04-22 08:36:14 +00:00			`# 'contentv4' is used in the website, but it also returns the related`
			`# videos, we don't need them`
[tf1] improve extraction(closes #27980)(closes #28040) 2021-02-23 11:39:46 +00:00			`# video_data = self._download_json(`
			`# 'http://www.wat.tv/interface/contentv4s/' + video_id, video_id)`
[wat] improve extraction(#10281) add alternative method to extract http formats works even if the video is geo-restricted or removed from public access(most of the cases) 2016-08-10 13:17:22 +00:00			`video_data = self._download_json(`
[tf1] improve extraction(closes #27980)(closes #28040) 2021-02-23 11:39:46 +00:00			`'https://mediainfo.tf1.fr/mediainfocombo/' + video_id,`
			`video_id, query={'context': 'MYTF1'})`
[wat] improve extraction(#10281) add alternative method to extract http formats works even if the video is geo-restricted or removed from public access(most of the cases) 2016-08-10 13:17:22 +00:00			`video_info = video_data['media']`
[wat] Add support for SD and HD videos (Closes #3558) 2014-08-23 19:22:10 +00:00
[wat] Capture and output error message 2014-09-23 12:58:35 +00:00			`error_desc = video_info.get('error_desc')`
			`if error_desc:`
[tf1] improve extraction(closes #27980)(closes #28040) 2021-02-23 11:39:46 +00:00			`if video_info.get('error_code') == 'GEOBLOCKED':`
			`self.raise_geo_restricted(error_desc, video_info.get('geoList'))`
[wat] detect DRM protected videos(closes #27958) 2021-02-23 12:50:18 +00:00			`raise ExtractorError(error_desc, expected=True)`
Add WatIE 2013-06-28 20:01:47 +00:00
[tf1] improve extraction(closes #27980)(closes #28040) 2021-02-23 11:39:46 +00:00			`title = video_info['title']`
WatIE: support videos divided in multiple parts (closes #222 and #659) The id for the videos is now the full id, no the one in the webpage url. Also extract more information: description, view_count and upload_date 2013-06-29 16:22:03 +00:00
[tf1] improve extraction(closes #27980)(closes #28040) 2021-02-23 11:39:46 +00:00			`formats = []`
[wat] extract all formats 2016-04-22 08:36:14 +00:00
[tf1] improve extraction(closes #27980)(closes #28040) 2021-02-23 11:39:46 +00:00			`def extract_formats(manifest_urls):`
			`for f, f_url in manifest_urls.items():`
			`if not f_url:`
			`continue`
			`if f in ('dash', 'mpd'):`
			`formats.extend(self._extract_mpd_formats(`
			`f_url.replace('://das-q1.tf1.fr/', '://das-q1-ssl.tf1.fr/'),`
			`video_id, mpd_id='dash', fatal=False))`
			`elif f == 'hls':`
			`formats.extend(self._extract_m3u8_formats(`
			`f_url, video_id, 'mp4',`
			`'m3u8_native', m3u8_id='hls', fatal=False))`
[wat] extract all formats 2016-04-22 08:36:14 +00:00
[tf1] improve extraction(closes #27980)(closes #28040) 2021-02-23 11:39:46 +00:00			`delivery = video_data.get('delivery') or {}`
			`extract_formats({delivery.get('format'): delivery.get('url')})`
			`if not formats:`
[wat] detect DRM protected videos(closes #27958) 2021-02-23 12:50:18 +00:00			`if delivery.get('drm'):`
			`raise ExtractorError('This video is DRM protected.', expected=True)`
[tf1] improve extraction(closes #27980)(closes #28040) 2021-02-23 11:39:46 +00:00			`manifest_urls = self._download_json(`
			`'http://www.wat.tv/get/webhtml/' + video_id, video_id, fatal=False)`
			`if manifest_urls:`
			`extract_formats(manifest_urls)`
[wat] extract dash formats 2016-09-06 19:44:45 +00:00
[wat] fix format extraction(closes #27901) 2021-01-21 16:20:32 +00:00			`self._sort_formats(formats)`
[wat] improve extraction(#10281) add alternative method to extract http formats works even if the video is geo-restricted or removed from public access(most of the cases) 2016-08-10 13:17:22 +00:00
[wat] Modernize 2014-03-29 14:15:16 +00:00			`return {`
[wat] extract all formats 2016-04-22 08:36:14 +00:00			`'id': video_id,`
[wat] improve extraction(#10281) add alternative method to extract http formats works even if the video is geo-restricted or removed from public access(most of the cases) 2016-08-10 13:17:22 +00:00			`'title': title,`
[tf1] improve extraction(closes #27980)(closes #28040) 2021-02-23 11:39:46 +00:00			`'thumbnail': video_info.get('preview'),`
			`'upload_date': unified_strdate(try_get(`
			`video_data, lambda x: x['mediametrie']['chapters'][0]['estatS4'])),`
			`'duration': int_or_none(video_info.get('duration')),`
[wat] Add support for SD and HD videos (Closes #3558) 2014-08-23 19:22:10 +00:00			`'formats': formats,`
[wat] Modernize 2014-03-29 14:15:16 +00:00			`}`