From aa016336a8023dfd2955c648519a083de3137705 Mon Sep 17 00:00:00 2001
From: Petar Kukolj <petarkukolj3@yahoo.com>
Date: Wed, 20 Sep 2017 03:02:02 +0200
Subject: [PATCH 1/2] [LibriVox] Add new extractor

---
 youtube_dl/extractor/extractors.py |  1 +
 youtube_dl/extractor/librivox.py   | 48 ++++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+)
 create mode 100644 youtube_dl/extractor/librivox.py
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index ab95c8575..eeaa4b8aa 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -527,6 +527,7 @@ from .leeco import (
     LetvCloudIE,
 )
 from .libraryofcongress import LibraryOfCongressIE
+from .librivox import LibriVoxIE
 from .libsyn import LibsynIE
 from .lifenews import (
     LifeNewsIE,
diff --git a/youtube_dl/extractor/librivox.py b/youtube_dl/extractor/librivox.py
new file mode 100644
index 000000000..ea5bffc1e
--- /dev/null
+++ b/youtube_dl/extractor/librivox.py
@@ -0,0 +1,48 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    orderedSet
+)
+
+
+class LibriVoxIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?librivox\.org/(?P<id>(?P<title>(?:[^\-]*\-)+[^\-]*)\-by\-(?P<author>(-.*\-)*[^/]*))/?'
+    _TESTS = [{
+        'url': 'https://librivox.org/the-art-of-war-by-sun-tzu/',
+        'info_dict': {
+            'id': 'the-art-of-war-by-sun-tzu',
+            'title': 'The Art Of War by Sun Tzu'
+        },
+        'playlist_mincount': 7
+    }, {
+        'url': 'https://librivox.org/alexander-the-great-by-jacob-abbott/',
+        'info_dict': {
+            'id': 'alexander-the-great-by-jacob-abbott',
+            'title': 'Alexander The Great by Jacob Abbott'
+        },
+        'playlist_mincount': 12
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+
+        video_id = mobj.group('id')
+        book_title = mobj.group('title').replace('-', ' ').strip().title()
+        author = mobj.group('author').replace('-', ' ').strip().title()
+
+        info = {
+            'id': video_id,
+            '_type': 'playlist',
+            'title': book_title + ' by ' + author
+        }
+
+        webpage = self._download_webpage(url, video_id)
+
+        links = orderedSet(re.findall(r'<a href="(https?://(?:www\.)?archive\.org/download/[^/]*/([^\.]*(?<!(?:64kb)))\.mp3)".*>(.*)</a>', webpage))
+        info['entries'] = [self.url_result(link[0], video_id=link[1], video_title=link[2]) for link in links]
+
+        return info

From d9f4cc8b3e8b660566d89c0d5625aa811ba02eda Mon Sep 17 00:00:00 2001
From: Petar Kukolj <petarkukolj3@yahoo.com>
Date: Sun, 1 Oct 2017 16:58:51 +0200
Subject: [PATCH 2/2] [LibriVox] Added description extraction, added some
 fallbacks and some regex improvements

---
 youtube_dl/extractor/librivox.py | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/youtube_dl/extractor/librivox.py b/youtube_dl/extractor/librivox.py
index ea5bffc1e..025bd66aa 100644
--- a/youtube_dl/extractor/librivox.py
+++ b/youtube_dl/extractor/librivox.py
@@ -4,9 +4,7 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..utils import (
-    orderedSet
-)
+from ..utils import orderedSet
 
 
 class LibriVoxIE(InfoExtractor):
@@ -15,14 +13,16 @@ class LibriVoxIE(InfoExtractor):
         'url': 'https://librivox.org/the-art-of-war-by-sun-tzu/',
         'info_dict': {
             'id': 'the-art-of-war-by-sun-tzu',
-            'title': 'The Art Of War by Sun Tzu'
+            'title': 'The Art Of War by Sun Tzu',
+            'description': '"The Art of War is a Chinese military treatise written during the 6th century BC by Sun Tzu. Composed of 13 chapters, each of which is devoted to one aspect of warfare, it has long been praised as the definitive work on military strategies and tactics of its time. The Art of War is one of the oldest and most famous studies of strategy and has had a huge influence on both military planning and beyond. The Art of War has also been applied, with much success, to business and managerial strategies." (summary from Wikipedia)'
         },
         'playlist_mincount': 7
     }, {
         'url': 'https://librivox.org/alexander-the-great-by-jacob-abbott/',
         'info_dict': {
             'id': 'alexander-the-great-by-jacob-abbott',
-            'title': 'Alexander The Great by Jacob Abbott'
+            'title': 'Alexander The Great by Jacob Abbott',
+            'description': 'Alexander the Great was one of the most successful military commanders in history, and was undefeated in battle. By the time of his death, he had conquered most of the world known to the ancient Greeks.\nAlexander the Great is one of many biographies aimed at young people written by Jacob Abbott and his brother. The biographies are written in such a way that makes them appealing and easily accessible to everyone. - Written by Wikipedia and Lizzie Driver'
         },
         'playlist_mincount': 12
     }]
@@ -31,18 +31,24 @@ class LibriVoxIE(InfoExtractor):
         mobj = re.match(self._VALID_URL, url)
 
         video_id = mobj.group('id')
-        book_title = mobj.group('title').replace('-', ' ').strip().title()
+
+        webpage = self._download_webpage(url, video_id)
+        # Prefer the page's <h1>; apply `or` before any str method so a missing <h1> (None) falls back to the URL slug
+        book_title = (self._html_search_regex(
+            r'<h1>(?P<title>.+)</h1>', webpage, 'title', group='title', fatal=False) or mobj.group('title').replace('-', ' ').strip()).lower().title()
         author = mobj.group('author').replace('-', ' ').strip().title()
+        description = self._html_search_regex(
+            r'<div class=(["\'])description\1[^>]*>(<p[^>]*>)?(?P<desc>.+)(</p>)?</div>',
+            webpage, 'description', group='desc', fatal=False) or self._og_search_description(webpage)
 
         info = {
             'id': video_id,
             '_type': 'playlist',
-            'title': book_title + ' by ' + author
+            'title': book_title + ' by ' + author,
+            'description': description
         }
 
-        webpage = self._download_webpage(url, video_id)
-
-        links = orderedSet(re.findall(r'<a href="(https?://(?:www\.)?archive\.org/download/[^/]*/([^\.]*(?<!(?:64kb)))\.mp3)".*>(.*)</a>', webpage))
-        info['entries'] = [self.url_result(link[0], video_id=link[1], video_title=link[2]) for link in links]
+        links = orderedSet(re.findall(r'<a href=(["\'])(https?://(?:www\.)?archive\.org/download/[^/]*/([^\.]*(?<!(?:64kb)))\.mp3)\1[^>]*>(.*?)</a>', webpage))
+        info['entries'] = [self.url_result(link[1], video_id=link[2], video_title=link[3]) for link in links]
 
         return info