From b67b4411089f44fe68923a6aca56f2a0b1875c76 Mon Sep 17 00:00:00 2001 From: SpiderRiderGit Date: Fri, 25 Dec 2020 12:59:26 -0500 Subject: [PATCH 01/11] [TasVideos] Add new extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/tasvideos.py | 32 ++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) create mode 100644 youtube_dl/extractor/tasvideos.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index bf34ae6b7..e31e62874 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1128,6 +1128,7 @@ from .tagesschau import ( TagesschauIE, ) from .tass import TassIE +from .tasvideos import TasVideosIE from .tbs import TBSIE from .tdslifeway import TDSLifewayIE from .teachable import ( diff --git a/youtube_dl/extractor/tasvideos.py b/youtube_dl/extractor/tasvideos.py new file mode 100644 index 000000000..4ef199e12 --- /dev/null +++ b/youtube_dl/extractor/tasvideos.py @@ -0,0 +1,32 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class TasVideosIE(InfoExtractor): + _VALID_URL = r'http://tasvideos.org/(?P<id>\d+M)\.html' + _TEST = { + 'url': 'http://tasvideos.org/4352M.html', + 'md5': '92b08f544beb6ee905030609c7251cd1', + 'info_dict': { + 'id': '4352M', + 'ext': 'mkv', + 'title': 'C64 L\'Abbaye des Morts', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + video_url = "http://www." 
+ self._search_regex( + r'<a [^>]+(?P<URL>archive\.org\/download[^<]+(?:mkv|mp4))[^<]+<\/a>', + webpage, 'video url') + title = self._search_regex( + r'<span title="Movie[^"]+">(?P<TITLE>[^<]+)<\/span>', webpage, + 'title') + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + } From 20cb323d5d3546d44f7854030c86e6fe930fef20 Mon Sep 17 00:00:00 2001 From: SpiderRider067 <36859584+SpiderRider067@users.noreply.github.com> Date: Fri, 25 Dec 2020 16:18:43 -0500 Subject: [PATCH 02/11] Update tasvideos.py --- youtube_dl/extractor/tasvideos.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tasvideos.py b/youtube_dl/extractor/tasvideos.py index 4ef199e12..ee6e7e48d 100644 --- a/youtube_dl/extractor/tasvideos.py +++ b/youtube_dl/extractor/tasvideos.py @@ -19,7 +19,7 @@ class TasVideosIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) video_url = "http://www." + self._search_regex( - r'<a [^>]+(?P<URL>archive\.org\/download[^<]+(?:mkv|mp4))[^<]+<\/a>', + r'<a [^>]+(?P<URL>archive\.org\/download[^<]+(?:mkv|mp4|avi))[^<]+<\/a>', webpage, 'video url') title = self._search_regex( r'<span title="Movie[^"]+">(?P<TITLE>[^<]+)<\/span>', webpage, From 7f457f913fb2869171595aa82b977e35276b9564 Mon Sep 17 00:00:00 2001 From: SpiderRider067 <36859584+SpiderRider067@users.noreply.github.com> Date: Fri, 25 Dec 2020 16:41:11 -0500 Subject: [PATCH 03/11] Update tasvideos.py --- youtube_dl/extractor/tasvideos.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tasvideos.py b/youtube_dl/extractor/tasvideos.py index ee6e7e48d..ba61e07cb 100644 --- a/youtube_dl/extractor/tasvideos.py +++ b/youtube_dl/extractor/tasvideos.py @@ -19,7 +19,7 @@ class TasVideosIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) video_url = "http://www." 
+ self._search_regex( - r'<a [^>]+(?P<URL>archive\.org\/download[^<]+(?:mkv|mp4|avi))[^<]+<\/a>', + r'<a [^>]+(?P<URL>archive\.org\/download[^<]+\.(?:mkv|mp4|avi))[^<]+<\/a>', webpage, 'video url') title = self._search_regex( r'<span title="Movie[^"]+">(?P<TITLE>[^<]+)<\/span>', webpage, From e152c0107ea0c6930ffa942165af981ebe6894f9 Mon Sep 17 00:00:00 2001 From: SpiderRider067 <36859584+SpiderRider067@users.noreply.github.com> Date: Sat, 26 Dec 2020 12:29:19 -0500 Subject: [PATCH 04/11] Update youtube_dl/extractor/tasvideos.py Change class name to reflect website's letter capitalization better. Co-authored-by: fossdd <fossdd@tutanota.com> --- youtube_dl/extractor/tasvideos.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tasvideos.py b/youtube_dl/extractor/tasvideos.py index ba61e07cb..e8a408be5 100644 --- a/youtube_dl/extractor/tasvideos.py +++ b/youtube_dl/extractor/tasvideos.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals from .common import InfoExtractor -class TasVideosIE(InfoExtractor): +class TASVideosIE(InfoExtractor): _VALID_URL = r'http://tasvideos.org/(?P<id>\d+M)\.html' _TEST = { 'url': 'http://tasvideos.org/4352M.html', From de3c09e6ac3ced8e1dbb3340757b053d439a1d67 Mon Sep 17 00:00:00 2001 From: SpiderRider067 <36859584+SpiderRider067@users.noreply.github.com> Date: Sat, 26 Dec 2020 12:30:33 -0500 Subject: [PATCH 05/11] Update extractors.py --- youtube_dl/extractor/extractors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e31e62874..aa554420f 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1128,7 +1128,7 @@ from .tagesschau import ( TagesschauIE, ) from .tass import TassIE -from .tasvideos import TasVideosIE +from .tasvideos import TASVideosIE from .tbs import TBSIE from .tdslifeway import TDSLifewayIE from .teachable import ( From 
f884efcc54b062a9527d1fcc1a9e5b11b223bd96 Mon Sep 17 00:00:00 2001 From: SpiderRider067 <36859584+SpiderRider067@users.noreply.github.com> Date: Sun, 27 Dec 2020 20:10:07 -0500 Subject: [PATCH 06/11] return 'formats' instead of url --- youtube_dl/extractor/tasvideos.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/tasvideos.py b/youtube_dl/extractor/tasvideos.py index e8a408be5..19b1a8e10 100644 --- a/youtube_dl/extractor/tasvideos.py +++ b/youtube_dl/extractor/tasvideos.py @@ -1,6 +1,7 @@ from __future__ import unicode_literals from .common import InfoExtractor +import re class TASVideosIE(InfoExtractor): @@ -18,15 +19,22 @@ class TASVideosIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - video_url = "http://www." + self._search_regex( + video_urls = re.findall( r'<a [^>]+(?P<URL>archive\.org\/download[^<]+\.(?:mkv|mp4|avi))[^<]+<\/a>', - webpage, 'video url') + webpage) title = self._search_regex( r'<span title="Movie[^"]+">(?P<TITLE>[^<]+)<\/span>', webpage, 'title') + formats = [] + + for url in video_urls: + format_entry = {'url': "http://www." + url} + formats.append(format_entry) + + self._sort_formats(formats) return { 'id': video_id, 'title': title, - 'url': video_url, + 'formats': formats, } From 1dcca5c300bb224936b6f207302d1f715bed932d Mon Sep 17 00:00:00 2001 From: SpiderRider067 <36859584+SpiderRider067@users.noreply.github.com> Date: Tue, 29 Dec 2020 21:28:35 -0500 Subject: [PATCH 07/11] had incorrect md5, updated. 
--- youtube_dl/extractor/tasvideos.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tasvideos.py b/youtube_dl/extractor/tasvideos.py index 19b1a8e10..8f12cf70b 100644 --- a/youtube_dl/extractor/tasvideos.py +++ b/youtube_dl/extractor/tasvideos.py @@ -8,7 +8,7 @@ class TASVideosIE(InfoExtractor): _VALID_URL = r'http://tasvideos.org/(?P<id>\d+M)\.html' _TEST = { 'url': 'http://tasvideos.org/4352M.html', - 'md5': '92b08f544beb6ee905030609c7251cd1', + 'md5': '8dced25a575e853cec5533a887a8dcfc', 'info_dict': { 'id': '4352M', 'ext': 'mkv', From ee1b02ab83bfaea7f343c89c6607f975357bd5da Mon Sep 17 00:00:00 2001 From: SpiderRider067 <36859584+SpiderRider067@users.noreply.github.com> Date: Wed, 30 Dec 2020 20:21:37 -0500 Subject: [PATCH 08/11] Added playlist extraction functionality --- youtube_dl/extractor/tasvideos.py | 57 ++++++++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/tasvideos.py b/youtube_dl/extractor/tasvideos.py index 8f12cf70b..e8c82f9c6 100644 --- a/youtube_dl/extractor/tasvideos.py +++ b/youtube_dl/extractor/tasvideos.py @@ -1,8 +1,10 @@ +# coding: utf-8 from __future__ import unicode_literals -from .common import InfoExtractor import re +from .common import InfoExtractor + class TASVideosIE(InfoExtractor): _VALID_URL = r'http://tasvideos.org/(?P<id>\d+M)\.html' @@ -11,7 +13,7 @@ class TASVideosIE(InfoExtractor): 'md5': '8dced25a575e853cec5533a887a8dcfc', 'info_dict': { 'id': '4352M', - 'ext': 'mkv', + 'ext': 'mp4', 'title': 'C64 L\'Abbaye des Morts', } } @@ -25,12 +27,11 @@ class TASVideosIE(InfoExtractor): title = self._search_regex( r'<span title="Movie[^"]+">(?P<TITLE>[^<]+)<\/span>', webpage, 'title') + formats = [] - for url in video_urls: - format_entry = {'url': "http://www." + url} + format_entry = {'url': 'http://www.' 
+ url} formats.append(format_entry) - self._sort_formats(formats) return { @@ -38,3 +39,49 @@ class TASVideosIE(InfoExtractor): 'title': title, 'formats': formats, } + + +class TASVideosPlaylistIE(InfoExtractor): + _VALID_URL = r'http://tasvideos.org/(?P<id>Movies-[^\.]*?)\.html' + _TEST = { + 'url': 'http://tasvideos.org/Movies-Stars.html', + 'info_dict': { + 'id': 'Movies-Stars', + 'title': 'TASVideos movies: Tier Stars', + }, + 'playlist_count': 114, + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + playlist_title = self._search_regex( + r'<title>(?P<title>[^<]*)', webpage, 'title') + video_entries = re.findall( + r'((?s)', webpage) + entries = [] + for entry in video_entries: + video_urls = re.findall( + r']+(?Parchive\.org\/download[^<]+\.(?:mkv|mp4|avi))[^<]+<\/a>', + entry) + title = self._search_regex( + r'(?P[^<]+)<\/span>', entry, + 'title') + video_id = self._search_regex( + r'id="movie_(?P<id>\d+)', entry, 'video id') + 'M' + + formats = [] + for url in video_urls: + format_entry = {'url': "http://www." 
+ url} + formats.append(format_entry) + + self._sort_formats(formats) + + formats = { + 'id': video_id, + 'title': title, + 'formats': formats, + } + entries.append(formats) + + return self.playlist_result(entries, playlist_id, playlist_title) From 6ce7baecaac973c319c44313e8c4f3599a75ed45 Mon Sep 17 00:00:00 2001 From: SpiderRider067 <36859584+SpiderRider067@users.noreply.github.com> Date: Wed, 30 Dec 2020 20:28:55 -0500 Subject: [PATCH 09/11] Update extractors.py --- youtube_dl/extractor/extractors.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index aa554420f..64d45584b 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1128,7 +1128,10 @@ from .tagesschau import ( TagesschauIE, ) from .tass import TassIE -from .tasvideos import TASVideosIE +from .tasvideos import ( + TASVideosIE, + TASVideosPlaylistIE, +) from .tbs import TBSIE from .tdslifeway import TDSLifewayIE from .teachable import ( From 14834bf977a08383174fe1f6b73c8610f971ff5c Mon Sep 17 00:00:00 2001 From: SpiderRider067 <36859584+SpiderRider067@users.noreply.github.com> Date: Wed, 30 Dec 2020 21:59:27 -0500 Subject: [PATCH 10/11] Use a helper method to replace a messy regex --- youtube_dl/extractor/tasvideos.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/tasvideos.py b/youtube_dl/extractor/tasvideos.py index e8c82f9c6..4848ae5dd 100644 --- a/youtube_dl/extractor/tasvideos.py +++ b/youtube_dl/extractor/tasvideos.py @@ -5,6 +5,8 @@ import re from .common import InfoExtractor +from ..utils import get_elements_by_class + class TASVideosIE(InfoExtractor): _VALID_URL = r'http://tasvideos.org/(?P<id>\d+M)\.html' @@ -57,8 +59,8 @@ class TASVideosPlaylistIE(InfoExtractor): webpage = self._download_webpage(url, playlist_id) playlist_title = self._search_regex( r'<title>(?P<title>[^<]*)', webpage, 'title') - video_entries = 
re.findall( - r'((?s)
', webpage) + video_entries = get_elements_by_class('item', webpage) + entries = [] for entry in video_entries: video_urls = re.findall( From 9addbadd330b2dd86792aaedd9633ca2f5657a69 Mon Sep 17 00:00:00 2001 From: SpiderRider067 <36859584+SpiderRider067@users.noreply.github.com> Date: Thu, 31 Dec 2020 23:02:26 -0500 Subject: [PATCH 11/11] added speedrun author and timer optional (fatal=False) because the website's layout forced me into an unreliable regex. --- youtube_dl/extractor/tasvideos.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/tasvideos.py b/youtube_dl/extractor/tasvideos.py index 4848ae5dd..f628d7c05 100644 --- a/youtube_dl/extractor/tasvideos.py +++ b/youtube_dl/extractor/tasvideos.py @@ -27,8 +27,13 @@ class TASVideosIE(InfoExtractor): r']+(?Parchive\.org\/download[^<]+\.(?:mkv|mp4|avi))[^<]+<\/a>', webpage) title = self._search_regex( - r'(?P[^<]+)<\/span>', webpage, - 'title') + r'<span title="Movie[^"]+">(?P<TITLE>[^<]+)<\/span>', + webpage, 'title') + time_and_author = self._html_search_regex( + r'<th.*<\/span>(?P<time_and_author>.*)<\/th>', webpage, + 'title: speedrun timer and credit', fatal=False) + if time_and_author is not None: + title = title + time_and_author formats = [] for url in video_urls: @@ -60,15 +65,20 @@ class TASVideosPlaylistIE(InfoExtractor): playlist_title = self._search_regex( r'<title>(?P<title>[^<]*)', webpage, 'title') video_entries = get_elements_by_class('item', webpage) - + entries = [] for entry in video_entries: video_urls = re.findall( r']+(?Parchive\.org\/download[^<]+\.(?:mkv|mp4|avi))[^<]+<\/a>', entry) title = self._search_regex( - r'(?P[^<]+)<\/span>', entry, - 'title') + r'<span title="Movie[^"]+">(?P<title>[^<]+)<\/span>', + entry, 'title') + time_and_author = self._html_search_regex( + r'<th.*<\/span>(?P<time_and_author>.*)<\/th>', entry, + 'time_and_author', fatal=False) + if time_and_author is not None: + title = title + time_and_author 
video_id = self._search_regex( r'id="movie_(?P<id>\d+)', entry, 'video id') + 'M'