2016-10-06 05:09:53 +00:00
|
|
|
# coding: utf-8
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
|
|
|
import re
|
|
|
|
|
|
|
|
from .common import InfoExtractor
|
2016-10-06 05:38:25 +00:00
|
|
|
from ..compat import compat_urlparse
|
2016-10-06 05:09:53 +00:00
|
|
|
from .generic import GenericIE
|
|
|
|
|
|
|
|
# IQM2 aka Accela is a municipal meeting management platform that
|
|
|
|
# (among other things) stores livestreamed video from municipal
|
|
|
|
# meetings. After a hefty (several-hour) processing time, that video
|
2016-10-06 05:51:06 +00:00
|
|
|
# is available in easily downloadable form from their web portal, but
|
2016-10-06 05:09:53 +00:00
|
|
|
# prior to that, the video can only be watched in realtime through
|
|
|
|
# JWPlayer. This extractor is designed to download the realtime video
|
2016-10-06 05:51:06 +00:00
|
|
|
# prior to download links being available. For more info on Accela, see:
|
2016-10-06 05:09:53 +00:00
|
|
|
# http://www.iqm2.com/About/Accela.aspx
|
|
|
|
# http://www.accela.com/
|
|
|
|
|
2016-10-06 05:51:06 +00:00
|
|
|
# This processing makes it challenging to produce a test case,
|
|
|
|
# because the extractor will want to find the processed and easily
|
|
|
|
# downloadable version. So there may be interesting bugs during the
|
|
|
|
# race condition time before the processed video is available (which
|
|
|
|
# is really the only time this extractor is especially important).
|
2016-10-06 05:09:53 +00:00
|
|
|
|
|
|
|
# This is also a relatively braindead extractor. It parses a given page like
|
2016-10-06 05:51:06 +00:00
|
|
|
# http://cambridgema.iqm2.com/Citizens/SplitView.aspx?Mode=Video&MeetingID=1679
|
2016-10-06 05:09:53 +00:00
|
|
|
# to determine the location of an inner div defined by a URL of the form
|
|
|
|
# http://cambridgema.iqm2.com/Citizens/VideoScreen.aspx?MediaID=1563&Frame=SplitView
|
|
|
|
|
2016-10-06 05:51:06 +00:00
|
|
|
# and then simply hands that URL to the GenericIE generic extractor,
|
|
|
|
# which matches it under the "Broaden the findall a little bit:
|
|
|
|
# JWPlayer JS loader" (line 2372 as of 6 Oct 2016).
|
2016-10-06 05:09:53 +00:00
|
|
|
|
2016-10-06 05:51:06 +00:00
|
|
|
# It appears that the metadata associated with the video (like its
|
2016-10-06 05:29:58 +00:00
|
|
|
# title) does not appear anywhere in the 2 HTML pages that get
|
|
|
|
# downloaded through this extractor. So it would need to download
|
2016-10-06 05:51:06 +00:00
|
|
|
# additional HTTP resources in order to get "real" metadata.
|
2016-10-06 05:29:58 +00:00
|
|
|
|
2016-10-06 05:09:53 +00:00
|
|
|
# This also appears to be the only example to date of an extractor
|
2016-10-06 05:51:06 +00:00
|
|
|
# that calls-out to the generic extractor, so it may be
|
2016-10-06 05:09:53 +00:00
|
|
|
# useful as an example. Or perhaps it means that there's a better way
|
|
|
|
# to do this and it should be rewritten differently, esp. to not
|
2016-10-06 05:51:06 +00:00
|
|
|
# leverage the generic? (xxx)
|
2016-10-06 05:09:53 +00:00
|
|
|
|
|
|
|
# Contributed by John Hawkinson <jhawk@mit.edu>, 6 Oct 2016.
|
|
|
|
|
|
|
|
class IQM2IE(InfoExtractor):
    """Extractor for IQM2 (Accela) municipal meeting video pages.

    The meeting page is downloaded, the relative ``src`` attribute of its
    ``VideoPanelInner`` div is located and resolved against the page URL,
    and the resulting URL is handed to ``GenericIE`` for the actual
    extraction.
    """

    # URLs for this site are commonly seen with both "iqm2.com" and
    # "IQM2.com" as the hostname. It is reasonable for the path portion of a
    # URL to be case-sensitive, but the hostname ought not to be, so this
    # class matches _VALID_URL case-insensitively, both in suitable() and
    # in _real_extract().
    # xxx: is it really right that InfoExtractor.suitable() compiles
    # _VALID_URL case-sensitively? Strict adherence might trip up other
    # extractors on funny-cased URLs too.
    _VALID_URL = r'https?://(?:\w+\.)?iqm2\.com/Citizens/\w+\.aspx\?.*MeetingID=(?P<id>[0-9]+)'
    _TESTS = [{
        'url': 'http://cambridgema.iqm2.com/Citizens/SplitView.aspx?Mode=Video&MeetingID=1679#',
        'md5': '478ea30eee1966f7be0d8dd623122148',
        'info_dict': {
            'id': '1563_720',
            'ext': 'mp4',
            'title': 'Cambridge, MA (2)',
            'uploader': 'cambridgema.iqm2.com',
        },
    }, {
        'url': 'https://CambridgeMA.IQM2.com/Citizens/VideoMain.aspx?MeetingID=1679',
        'only_matching': True,
    }]

    @classmethod
    def _iqm2_valid_url_re(cls):
        """Return the case-insensitive compiled form of ``_VALID_URL``.

        The compiled pattern is cached on the class as ``_VALID_URL_RE``.
        """
        # This does not use has/getattr intentionally - we want to know
        # whether we have cached the regexp for *this* class, whereas
        # getattr would also match the superclass.
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL, flags=re.IGNORECASE)
        return cls._VALID_URL_RE

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return cls._iqm2_valid_url_re().match(url) is not None

    def _real_extract(self, url):
        """Resolve the embedded player URL and delegate to ``GenericIE``.

        Returns whatever ``GenericIE._real_extract`` returns for the inner
        player URL.
        """
        # Reuse the cached case-insensitive pattern so that this match can
        # never disagree with suitable().
        mobj = self._iqm2_valid_url_re().match(url)
        video_id = mobj.group('id')

        webpage = self._download_webpage(url, video_id)

        # We want to extract an inner URL like this:
        #   <div id="VideoPanel" class="LeftTopContent">
        #     <div id="VideoPanelInner" onselectstart="javascript:return false;"
        #          style="overflow:hidden"
        #          src="/Citizens/VideoScreen.aspx?MediaID=1563&Frame=SplitView"
        #          quality="hd">
        # [^>]*? keeps the match inside the div's own opening tag (including
        # when its attributes are spread over several lines) and stops at
        # the first src attribute rather than the last one on the line.
        inner_url_rel = self._html_search_regex(
            r'<div id="VideoPanelInner"[^>]*?\ssrc="([^"]+)"',
            webpage, 'url')

        # The src attribute is relative to the page that was downloaded.
        inner_url = compat_urlparse.urljoin(url, inner_url_rel)

        return GenericIE(self._downloader)._real_extract(inner_url)
|