youtube-dl/youtube_dl/extractor/iqm2.py

# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..compat import compat_urlparse
from .generic import GenericIE

# IQM2 aka Accela is a municipal meeting management platform that
# (among other things) stores livestreamed video from municipal
# meetings.  After a hefty (several-hour) processing time, that video
# is available in easily downloadable form from their web portal, but
# prior to that, the video can only be watched in realtime through
# JWPlayer. This extractor is designed to download the realtime video
# prior to download links being available. For more info on Accela, see:
#   http://www.iqm2.com/About/Accela.aspx
#   http://www.accela.com/

# This processing delay makes it challenging to write a test case,
# because once processing has finished the extractor will find the
# processed, easily downloadable version instead. So there may be
# interesting bugs during the window before the processed video is
# available (which is really the only time this extractor is
# especially important).

# This is also a relatively braindead extractor. It parses a given page like
#   http://cambridgema.iqm2.com/Citizens/SplitView.aspx?Mode=Video&MeetingID=1679
# to determine the location of an inner div defined by a URL of the form
#   http://cambridgema.iqm2.com/Citizens/VideoScreen.aspx?MediaID=1563&Frame=SplitView

# and then simply hands that URL to GenericIE (the generic extractor),
# which handles it in the code path commented "Broaden the findall a
# little bit: JWPlayer JS loader" (line 2372 of generic.py as of
# 6 Oct 2016).

# It appears that the metadata associated with the video (like its
# title) does not appear anywhere in the 2 HTML pages that get
# downloaded through this extractor. So it would need to download
# additional HTTP resources in order to get "real" metadata.

# This also appears to be the only example to date of an extractor
# that calls out to the generic extractor, so it may be useful as an
# example. Or perhaps it means that there is a better way to do this,
# and it should be rewritten so as not to rely on the generic
# extractor? (xxx)

# Contributed by John Hawkinson <jhawk@mit.edu>, 6 Oct 2016.

class IQM2IE(InfoExtractor):
    """Extractor for meeting videos served by the IQM2 (Accela) portal.

    The meeting page is downloaded only to locate the ``src`` attribute of
    the ``VideoPanelInner`` div. That inner player URL is then passed to
    ``GenericIE``, which performs the actual media extraction.
    """

    # We commonly see both iqm2.com and IQM2.com, hence the (?i) flag.
    # The dot before "aspx" is escaped so that it only matches a literal dot.
    _VALID_URL = r'(?i)https?://(?:\w+\.)?iqm2\.com/Citizens/\w+\.aspx\?.*MeetingID=(?P<id>[0-9]+)'
    _TESTS = [{
        'url': 'http://cambridgema.iqm2.com/Citizens/SplitView.aspx?Mode=Video&MeetingID=1679#',
        'md5': '478ea30eee1966f7be0d8dd623122148',
        'info_dict': {
            'id': '1563_720',
            'ext': 'mp4',
            'title': 'Cambridge, MA (2)',
            'uploader': 'cambridgema.iqm2.com',
        },
    }, {
        'url': 'https://CambridgeMA.IQM2.com/Citizens/VideoMain.aspx?MeetingID=1679',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Resolve a meeting page URL to the result of the generic extractor.

        ``url`` is expected to match ``_VALID_URL``; its ``MeetingID`` is
        used as the id while downloading the meeting page. Returns whatever
        ``GenericIE._real_extract`` returns for the inner player URL.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        webpage = self._download_webpage(url, video_id)

        # The player location is carried as an attribute of a div, e.g.:
        # <div id="VideoPanel" class="LeftTopContent">
        #   <div id="VideoPanelInner" ... src="/Citizens/VideoScreen.aspx?MediaID=1563&Frame=SplitView">
        # The match is non-greedy so that the first src="..." following the
        # div id is captured, even if other src attributes appear later on
        # the same line.
        inner_url_rel = self._html_search_regex(
            r'<div id="VideoPanelInner".*?src="([^"]+)"',
            webpage, 'url')

        # The src may be relative (as in the example above), so resolve it
        # against the URL of the page it was found on.
        inner_url = compat_urlparse.urljoin(url, inner_url_rel)

        # NOTE(review): this calls the generic extractor's _real_extract
        # directly rather than going through the downloader; kept as-is to
        # preserve the shape of the returned result.
        return GenericIE(self._downloader)._real_extract(inner_url)
[IQM2] Add new extractor first cut 2016-10-06 01:09:53 -04:00			`# coding: utf-8`
			`from __future__ import unicode_literals`

			`import re`

			`from .common import InfoExtractor`
Handle relative URLs with urlparse.urljoin() 2016-10-06 01:38:25 -04:00			`from ..compat import compat_urlparse`
[IQM2] Add new extractor first cut 2016-10-06 01:09:53 -04:00			`from .generic import GenericIE`

			`# IQM2 aka Accela is a municipal meeting management platform that`
			`# (among other things) stores livestreamed video from municipal`
			`# meetings. After a hefty (several-hour) processing time, that video`
copyedit comments 2016-10-06 01:51:06 -04:00			`# is available in easily downloadable form from their web portal, but`
[IQM2] Add new extractor first cut 2016-10-06 01:09:53 -04:00			`# prior to that, the video can only be watched in realtime through`
			`# JWPlayer. This extractor is designed to download the realtime video`
copyedit comments 2016-10-06 01:51:06 -04:00			`# prior to download links being available. For more info on Accela, see:`
[IQM2] Add new extractor first cut 2016-10-06 01:09:53 -04:00			`# http://www.iqm2.com/About/Accela.aspx`
			`# http://www.accela.com/`

copyedit comments 2016-10-06 01:51:06 -04:00			`# This processing makes it challenging to produce a test case for,`
			`# because the extractor will want to find the processed and easily`
			`# downloadable version. So there may be interesting bugs during the`
			`# race condition time before the processed video is available (which`
			`# is really the only time this extractor is especially important).`
[IQM2] Add new extractor first cut 2016-10-06 01:09:53 -04:00
			`# This is also a relatively braindead extractor. It parses a given page like`
copyedit comments 2016-10-06 01:51:06 -04:00			`# http://cambridgema.iqm2.com/Citizens/SplitView.aspx?Mode=Video&MeetingID=1679`
[IQM2] Add new extractor first cut 2016-10-06 01:09:53 -04:00			`# to determine the location of an inner div defined by a URL of the form`
			`# http://cambridgema.iqm2.com/Citizens/VideoScreen.aspx?MediaID=1563&Frame=SplitView`

copyedit comments 2016-10-06 01:51:06 -04:00			`# and then simply hands that URL to the GenericIE generic extractor,`
			`# which matches it under the "Broaden the findall a little bit:`
			`# JWPlayer JS loader" (line 2372 as of 6 Oct 2016).`
[IQM2] Add new extractor first cut 2016-10-06 01:09:53 -04:00
copyedit comments 2016-10-06 01:51:06 -04:00			`# It appears that the metadata associated with the video (like its`
Case-insensitive URL match 2016-10-06 01:29:58 -04:00			`# title) does not appear anywhere in the 2 HTML pages that get`
			`# downloaded through this extractor. So it would need to download`
copyedit comments 2016-10-06 01:51:06 -04:00			`# additional HTTP resources in order to get "real" metadata.`
Case-insensitive URL match 2016-10-06 01:29:58 -04:00
[IQM2] Add new extractor first cut 2016-10-06 01:09:53 -04:00			`# This also appears to be the only example to date of an extractor`
copyedit comments 2016-10-06 01:51:06 -04:00			`# that calls-out to the generic extractor, so it may be`
[IQM2] Add new extractor first cut 2016-10-06 01:09:53 -04:00			`# useful as an example. Or perhaps it means that there's a better way`
			`# to do this and it should be rewritten differently, esp. to not`
copyedit comments 2016-10-06 01:51:06 -04:00			`# leverage the generic? (xxx)`
[IQM2] Add new extractor first cut 2016-10-06 01:09:53 -04:00
			`# Contributed by John Hawkinson <jhawk@mit.edu>, 6 Oct 2016.`

			`class IQM2IE(InfoExtractor):`
Case-insensitive URL match 2016-10-06 01:29:58 -04:00
Use (?i) for case-insensitivity in URLs 2016-10-08 19:21:33 -04:00			`# We commonly see both iqm2.com and IQM2.com.`
			`_VALID_URL = r'(?i)https?://(?:\w+\.)?iqm2\.com/Citizens/\w+.aspx\?.*MeetingID=(?P<id>[0-9]+)'`
Move test cases from comment to _TESTS 2016-10-08 19:17:32 -04:00			`_TESTS = [ {`
[IQM2] Add new extractor first cut 2016-10-06 01:09:53 -04:00			`'url': 'http://cambridgema.iqm2.com/Citizens/SplitView.aspx?Mode=Video&MeetingID=1679#',`
			`'md5': '478ea30eee1966f7be0d8dd623122148',`
			`'info_dict': {`
			`'id': '1563_720',`
			`'ext': 'mp4',`
			`'title': 'Cambridge, MA (2)',`
			`'uploader': 'cambridgema.iqm2.com',`
Move test cases from comment to _TESTS 2016-10-08 19:17:32 -04:00			`}}, {`
			`'url': 'https://CambridgeMA.IQM2.com/Citizens/VideoMain.aspx?MeetingID=1679',`
			`'only_matching': True,`
			`}]`
[IQM2] Add new extractor first cut 2016-10-06 01:09:53 -04:00
			`def _real_extract(self, url):`
Use (?i) for case-insensitivity in URLs 2016-10-08 19:21:33 -04:00			`mobj = re.match(self._VALID_URL, url)`
[IQM2] Add new extractor first cut 2016-10-06 01:09:53 -04:00			`video_id = mobj.group('id')`

			`webpage = self._download_webpage(url, video_id)`

Comment verbatim example of <div id=VideoPanelInner/> 2016-10-06 01:42:30 -04:00			`# print "Original URL is", url`

			`# <div id="VideoPanel" class="LeftTopContent">`
Tighten up regex comment 2016-10-08 19:23:59 -04:00			`# <div id="VideoPanelInner" ... src="/Citizens/VideoScreen.aspx?MediaID=1563&Frame=SplitView">`
Handle relative URLs with urlparse.urljoin() 2016-10-06 01:38:25 -04:00			`inner_url_rel = self._html_search_regex(`
			`r'<div id="VideoPanelInner".*src="([^"]+)"',`
			`webpage, 'url');`
Comment verbatim example of <div id=VideoPanelInner/> 2016-10-06 01:42:30 -04:00			`# print "inner_URL is", inner_url_rel`
Handle relative URLs with urlparse.urljoin() 2016-10-06 01:38:25 -04:00
			`inner_url = compat_urlparse.urljoin(url, inner_url_rel)`
Comment verbatim example of <div id=VideoPanelInner/> 2016-10-06 01:42:30 -04:00			`# print "Joined URL is", inner_url`
Handle relative URLs with urlparse.urljoin() 2016-10-06 01:38:25 -04:00
			`return GenericIE(self._downloader)._real_extract(inner_url)`