diff -Nru youtube-dl-2016.02.22/AUTHORS youtube-dl-2019.05.20/AUTHORS --- youtube-dl-2016.02.22/AUTHORS 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/AUTHORS 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,248 @@ +Ricardo Garcia Gonzalez +Danny Colligan +Benjamin Johnson +Vasyl' Vavrychuk +Witold Baryluk +Paweł Paprota +Gergely Imreh +Rogério Brito +Philipp Hagemeister +Sören Schulze +Kevin Ngo +Ori Avtalion +shizeeg +Filippo Valsorda +Christian Albrecht +Dave Vasilevsky +Jaime Marquínez Ferrándiz +Jeff Crouse +Osama Khalid +Michael Walter +M. Yasoob Ullah Khalid +Julien Fraichard +Johny Mo Swag +Axel Noack +Albert Kim +Pierre Rudloff +Huarong Huo +Ismael Mejía +Steffan Donal +Andras Elso +Jelle van der Waa +Marcin Cieślak +Anton Larionov +Takuya Tsuchida +Sergey M. +Michael Orlitzky +Chris Gahan +Saimadhav Heblikar +Mike Col +Oleg Prutz +pulpe +Andreas Schmitz +Michael Kaiser +Niklas Laxström +David Triendl +Anthony Weems +David Wagner +Juan C. Olivares +Mattias Harrysson +phaer +Sainyam Kapoor +Nicolas Évrard +Jason Normore +Hoje Lee +Adam Thalhammer +Georg Jähnig +Ralf Haring +Koki Takahashi +Ariset Llerena +Adam Malcontenti-Wilson +Tobias Bell +Naglis Jonaitis +Charles Chen +Hassaan Ali +Dobrosław Żybort +David Fabijan +Sebastian Haas +Alexander Kirk +Erik Johnson +Keith Beckman +Ole Ernst +Aaron McDaniel (mcd1992) +Magnus Kolstad +Hari Padmanaban +Carlos Ramos +5moufl +lenaten +Dennis Scheiba +Damon Timm +winwon +Xavier Beynon +Gabriel Schubiner +xantares +Jan Matějka +Mauroy Sébastien +William Sewell +Dao Hoang Son +Oskar Jauch +Matthew Rayfield +t0mm0 +Tithen-Firion +Zack Fernandes +cryptonaut +Adrian Kretz +Mathias Rav +Petr Kutalek +Will Glynn +Max Reimann +Cédric Luthi +Thijs Vermeir +Joel Leclerc +Christopher Krooss +Ondřej Caletka +Dinesh S +Johan K. Jensen +Yen Chi Hsuan +Enam Mijbah Noor +David Luhmer +Shaya Goldberg +Paul Hartmann +Frans de Jonge +Robin de Rooij +Ryan Schmidt +Leslie P. Polzer +Duncan Keall +Alexander Mamay +Devin J. Pohly +Eduardo Ferro Aldama +Jeff Buchbinder +Amish Bhadeshia +Joram Schrijver +Will W. +Mohammad Teimori Pabandi +Roman Le Négrate +Matthias Küch +Julian Richen +Ping O. +Mister Hat +Peter Ding +jackyzy823 +George Brighton +Remita Amine +Aurélio A. 
Heckert +Bernhard Minks +sceext +Zach Bruggeman +Tjark Saul +slangangular +Behrouz Abbasi +ngld +nyuszika7h +Shaun Walbridge +Lee Jenkins +Anssi Hannula +Lukáš Lalinský +Qijiang Fan +Rémy Léone +Marco Ferragina +reiv +Muratcan Simsek +Evan Lu +flatgreen +Brian Foley +Vignesh Venkat +Tom Gijselinck +Founder Fang +Andrew Alexeyew +Saso Bezlaj +Erwin de Haan +Jens Wille +Robin Houtevelts +Patrick Griffis +Aidan Rowe +mutantmonkey +Ben Congdon +Kacper Michajłow +José Joaquín Atria +Viťas Strádal +Kagami Hiiragi +Philip Huppert +blahgeek +Kevin Deldycke +inondle +Tomáš Čech +Déstin Reed +Roman Tsiupa +Artur Krysiak +Jakub Adam Wieczorek +Aleksandar Topuzović +Nehal Patel +Rob van Bekkum +Petr Zvoníček +Pratyush Singh +Aleksander Nitecki +Sebastian Blunt +Matěj Cepl +Xie Yanbo +Philip Xu +John Hawkinson +Rich Leeper +Zhong Jianxin +Thor77 +Mattias Wadman +Arjan Verwer +Costy Petrisor +Logan B +Alex Seiler +Vijay Singh +Paul Hartmann +Stephen Chen +Fabian Stahl +Bagira +Odd Stråbø +Philip Herzog +Thomas Christlieb +Marek Rusinowski +Tobias Gruetzmacher +Olivier Bilodeau +Lars Vierbergen +Juanjo Benages +Xiao Di Guan +Thomas Winant +Daniel Twardowski +Jeremie Jarosh +Gerard Rovira +Marvin Ewald +Frédéric Bournival +Timendum +gritstub +Adam Voss +Mike Fährmann +Jan Kundrát +Giuseppe Fabiano +Örn Guðjónsson +Parmjit Virk +Genki Sky +Ľuboš Katrinec +Corey Nicholson +Ashutosh Chaudhary +John Dong +Tatsuyuki Ishi +Daniel Weber +Kay Bouché +Yang Hongbo +Lei Wang +Petr Novák +Leonardo Taccari +Martin Weinelt +Surya Oktafendri +TingPing +Alexandre Macabies +Bastian de Groot +Niklas Haas +András Veres-Szentkirályi +Enes Solak +Nathan Rossi +Thomas van der Berg +Luca Cherubin diff -Nru youtube-dl-2016.02.22/ChangeLog youtube-dl-2019.05.20/ChangeLog --- youtube-dl-2016.02.22/ChangeLog 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/ChangeLog 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,4379 @@ +version 2019.05.20 + +Core ++ [extractor/common] Move workaround for applying first Set-Cookie header + into a separate _apply_first_set_cookie_header method + +Extractors +* [safari] Fix authentication (#21090) +* [vk] Use _apply_first_set_cookie_header +* [vrt] Fix extraction (#20527) ++ [canvas] Add support for vrtnieuws and sporza site ids and extract + AES HLS formats ++ [vrv] Extract captions (#19238) +* [tele5] Improve video id extraction +* [tele5] Relax URL regular expression (#21020, #21063) +* [svtplay] Update API URL (#21075) ++ [yahoo:gyao] Add X-User-Agent header to dam proxy requests (#21071) + + +version 2019.05.11 + +Core +* [utils] Transliterate "þ" as "th" (#20897) + +Extractors ++ [cloudflarestream] Add support for videodelivery.net (#21049) ++ [byutv] Add support for DVR videos (#20574, #20676) ++ [gfycat] Add support for URLs with tags (#20696, #20731) ++ [openload] Add support for verystream.com (#20701, #20967) +* [youtube] Use sp field value for signature field name (#18841, #18927, + #21028) ++ [yahoo:gyao] Extend URL regular expression (#21008) +* [youtube] Fix channel id extraction (#20982, #21003) ++ [sky] Add support for news.sky.com (#13055) ++ [youtube:entrylistbase] Retry on 5xx HTTP errors (#20965) ++ [francetvinfo] Extend video id extraction (#20619, #20740) +* [4tube] Update token hosts (#20918) +* [hotstar] Move to API v2 (#20931) +* [fox] Fix API error handling under python 2 (#20925) ++ [redbulltv] Extend URL regular expression (#20922) + + +version 2019.04.30 + +Extractors +* [openload] Use real Chrome versions (#20902) +- [youtube] Remove info el for 
get_video_info request +* [youtube] Improve extraction robustness +- [dramafever] Remove extractor (#20868) +* [adn] Fix subtitle extraction (#12724) ++ [ccc] Extract creator (#20355) ++ [ccc:playlist] Add support for media.ccc.de playlists (#14601, #20355) ++ [sverigesradio] Add support for sverigesradio.se (#18635) ++ [cinemax] Add support for cinemax.com +* [sixplay] Try extracting non-DRM protected manifests (#20849) ++ [youtube] Extract Youtube Music Auto-generated metadata (#20599, #20742) +- [wrzuta] Remove extractor (#20684, #20801) +* [twitch] Prefer source format (#20850) ++ [twitcasting] Add support for private videos (#20843) +* [reddit] Validate thumbnail URL (#20030) +* [yandexmusic] Fix track URL extraction (#20820) + + +version 2019.04.24 + +Extractors +* [youtube] Fix extraction (#20758, #20759, #20761, #20762, #20764, #20766, + #20767, #20769, #20771, #20768, #20770) +* [toutv] Fix extraction and extract series info (#20757) ++ [vrv] Add support for movie listings (#19229) ++ [youtube] Print error when no data is available (#20737) ++ [soundcloud] Add support for new rendition and improve extraction (#20699) ++ [ooyala] Add support for geo verification proxy ++ [nrl] Add support for nrl.com (#15991) ++ [vimeo] Extract live archive source format (#19144) ++ [vimeo] Add support for live streams and improve info extraction (#19144) ++ [ntvcojp] Add support for cu.ntv.co.jp ++ [nhk] Extract RTMPT format ++ [nhk] Add support for audio URLs ++ [udemy] Add another course id extraction pattern (#20491) ++ [openload] Add support for oload.services (#20691) ++ [openload] Add support for openloed.co (#20691, #20693) +* [bravotv] Fix extraction (#19213) + + +version 2019.04.17 + +Extractors +* [openload] Randomize User-Agent (#20688) ++ [openload] Add support for oladblock domains (#20471) +* [adn] Fix subtitle extraction (#12724) ++ [aol] Add support for localized websites ++ [yahoo] Add support for GYAO episode URLs ++ [yahoo] Add support for streaming.yahoo.co.jp (#5811, #7098) ++ [yahoo] Add support for gyao.yahoo.co.jp +* [aenetworks] Fix history topic extraction and extract more formats ++ [cbs] Extract smpte and vtt subtitles ++ [streamango] Add support for streamcherry.com (#20592) ++ [yourporn] Add support for sxyprn.com (#20646) +* [mgtv] Fix extraction (#20650) +* [linkedin:learning] Use urljoin for form action URL (#20431) ++ [gdc] Add support for kaltura embeds (#20575) +* [dispeak] Improve mp4 bitrate extraction +* [kaltura] Sanitize embed URLs +* [jwplatform] Do not match manifest URLs (#20596) +* [aol] Restrict URL regular expression and improve format extraction ++ [tiktok] Add support for new URL schema (#20573) ++ [stv:player] Add support for player.stv.tv (#20586) + + +version 2019.04.07 + +Core ++ [downloader/external] Pass rtmp_conn to ffmpeg + +Extractors ++ [ruutu] Add support for audio podcasts (#20473, #20545) ++ [xvideos] Extract all thumbnails (#20432) ++ [platzi] Add support for platzi.com (#20562) +* [dvtv] Fix extraction (#18514, #19174) ++ [vrv] Add basic support for individual movie links (#19229) ++ [bfi:player] Add support for player.bfi.org.uk (#19235) +* [hbo] Fix extraction and extract subtitles (#14629, #13709) +* [youtube] Extract srv[1-3] subtitle formats (#20566) +* [adultswim] Fix extraction (#18025) +* [teamcoco] Fix extraction and add support for subdomains (#17099, #20339) +* [adn] Fix subtitle compatibility with ffmpeg +* [adn] Fix extraction and add support for positioning styles (#20549) +* [vk] Use unique video id (#17848) +*
[newstube] Fix extraction +* [rtl2] Actualize extraction ++ [adobeconnect] Add support for adobeconnect.com (#20283) ++ [gaia] Add support for authentication (#14605) ++ [mediasite] Add support for dashed ids and named catalogs (#20531) + + +version 2019.04.01 + +Core +* [utils] Improve int_or_none and float_or_none (#20403) +* Check for valid --min-sleep-interval when --max-sleep-interval is specified + (#20435) + +Extractors ++ [weibo] Extend URL regular expression (#20496) ++ [xhamster] Add support for xhamster.one (#20508) ++ [mediasite] Add support for catalogs (#20507) ++ [teamtreehouse] Add support for teamtreehouse.com (#9836) ++ [ina] Add support for audio URLs +* [ina] Improve extraction +* [cwtv] Fix episode number extraction (#20461) +* [npo] Improve DRM detection ++ [pornhub] Add support for DASH formats (#20403) +* [svtplay] Update API endpoint (#20430) + + +version 2019.03.18 + +Core +* [extractor/common] Improve HTML5 entries extraction ++ [utils] Introduce parse_bitrate +* [update] Hide update URLs behind redirect +* [extractor/common] Fix url meta field for unfragmented DASH formats (#20346) + +Extractors ++ [yandexvideo] Add extractor +* [openload] Improve embed detection ++ [corus] Add support for bigbrothercanada.ca (#20357) ++ [orf:radio] Extract series (#20012) ++ [cbc:watch] Add support for gem.cbc.ca (#20251, #20359) +- [anysex] Remove extractor (#19279) ++ [ciscolive] Add support for new URL schema (#20320, #20351) ++ [youtube] Add support for invidiou.sh (#20309) +- [anitube] Remove extractor (#20334) +- [ruleporn] Remove extractor (#15344, #20324) +* [npr] Fix extraction (#10793, #13440) +* [biqle] Fix extraction (#11471, #15313) +* [viddler] Modernize +* [moevideo] Fix extraction +* [primesharetv] Remove extractor +* [hypem] Modernize and extract more metadata (#15320) +* [veoh] Fix extraction +* [escapist] Modernize +- [videomega] Remove extractor (#10108) ++ [beeg] Add support for beeg.porn (#20306) +* [vimeo:review] Improve config url extraction and extract original format + (#20305) +* [fox] Detect geo restriction and authentication errors (#20208) + + +version 2019.03.09 + +Core +* [extractor/common] Use compat_etree_Element ++ [compat] Introduce compat_etree_Element +* [extractor/common] Fallback url to base URL for DASH formats +* [extractor/common] Do not fail on invalid data while parsing F4M manifest + in non fatal mode +* [extractor/common] Return MPD manifest as format's url meta field (#20242) +* [utils] Strip #HttpOnly_ prefix from cookies files (#20219) + +Extractors +* [francetv:site] Relax video id regular expression (#20268) +* [toutv] Detect invalid login error +* [toutv] Fix authentication (#20261) ++ [urplay] Extract timestamp (#20235) ++ [openload] Add support for oload.space (#20246) +* [facebook] Improve uploader extraction (#20250) +* [bbc] Use compat_etree_Element +* [crunchyroll] Use compat_etree_Element +* [npo] Improve ISM extraction +* [rai] Improve extraction (#20253) +* [paramountnetwork] Fix mgid extraction (#20241) +* [libsyn] Improve extraction (#20229) ++ [youtube] Add more invidious instances to URL regular expression (#20228) +* [spankbang] Fix extraction (#20023) +* [espn] Extend URL regular expression (#20013) +* [sixplay] Handle videos with empty assets (#20016) ++ [vimeo] Add support for Vimeo Pro portfolio protected videos (#20070) + + +version 2019.03.01 + +Core ++ [downloader/external] Add support for rate limit and retries for wget +* [downloader/external] Fix infinite retries for curl (#19303) + +Extractors +* 
[npo] Fix extraction (#20084) +* [francetv:site] Extend video id regex (#20029, #20071) ++ [periscope] Extract width and height (#20015) +* [servus] Fix extraction (#19297) +* [bbccouk] Make subtitles non fatal (#19651) +* [metacafe] Fix family filter bypass (#19287) + + +version 2019.02.18 + +Extractors +* [tvp:website] Fix and improve extraction ++ [tvp] Detect unavailable videos +* [tvp] Fix description extraction and make thumbnail optional ++ [linuxacademy] Add support for linuxacademy.com (#12207) +* [bilibili] Update keys (#19233) +* [udemy] Extend URL regular expressions (#14330, #15883) +* [udemy] Update User-Agent and detect captcha (#14713, #15839, #18126) +* [noovo] Fix extraction (#19230) +* [rai] Relax URL regular expression (#19232) ++ [vshare] Pass Referer to download request (#19205, #19221) ++ [openload] Add support for oload.live (#19222) +* [imgur] Use video id as title fallback (#18590) ++ [twitch] Add new source format detection approach (#19193) +* [tvplayhome] Fix video id extraction (#19190) +* [tvplayhome] Fix episode metadata extraction (#19190) +* [rutube:embed] Fix extraction (#19163) ++ [rutube:embed] Add support for private videos (#19163) ++ [soundcloud] Extract more metadata ++ [trunews] Add support for trunews.com (#19153) ++ [linkedin:learning] Extract chapter_number and chapter_id (#19162) + + +version 2019.02.08 + +Core +* [utils] Improve JSON-LD regular expression (#18058) +* [YoutubeDL] Fallback to ie_key of matching extractor while making + download archive id when no explicit ie_key is provided (#19022) + +Extractors ++ [malltv] Add support for mall.tv (#18058, #17856) ++ [spankbang:playlist] Add support for playlists (#19145) +* [spankbang] Extend URL regular expression +* [trutv] Fix extraction (#17336) +* [toutv] Fix authentication (#16398, #18700) +* [pornhub] Fix tags and categories extraction (#13720, #19135) +* [pornhd] Fix formats extraction ++ [pornhd] Extract like count (#19123, #19125) +* [radiocanada] Switch to the new media requests (#19115) ++ [teachable] Add support for courses.workitdaily.com (#18871) +- [vporn] Remove extractor (#16276) ++ [soundcloud:pagedplaylist] Add ie and title to entries (#19022, #19086) ++ [drtuber] Extract duration (#19078) +* [soundcloud] Fix paged playlists extraction, add support for albums and update client id +* [soundcloud] Update client id +* [drtv] Improve preference (#19079) ++ [openload] Add support for openload.pw and oload.pw (#18930) ++ [openload] Add support for oload.info (#19073) +* [crackle] Authorize media detail request (#16931) + + +version 2019.01.30.1 + +Core +* [postprocessor/ffmpeg] Fix avconv processing broken in #19025 (#19067) + + +version 2019.01.30 + +Core +* [postprocessor/ffmpeg] Do not copy Apple TV chapter tracks while embedding + subtitles (#19024, #19042) +* [postprocessor/ffmpeg] Disable "Last message repeated" messages (#19025) + +Extractors +* [yourporn] Fix extraction and extract duration (#18815, #18852, #19061) +* [drtv] Improve extraction (#19039) + + Add support for EncryptedUri videos + + Extract more metadata + * Fix subtitles extraction ++ [fox] Add support for locked videos using cookies (#19060) +* [fox] Fix extraction for free videos (#19060) ++ [zattoo] Add support for tv.salt.ch (#19059) + + +version 2019.01.27 + +Core ++ [extractor/common] Extract season in _json_ld +* [postprocessor/ffmpeg] Fallback to ffmpeg/avconv for audio codec detection + (#681) + +Extractors +* [vice] Fix extraction for locked videos (#16248) ++ [wakanim] Detect DRM protected videos ++
[wakanim] Add support for wakanim.tv (#14374) +* [usatoday] Fix extraction for videos with custom brightcove partner id + (#18990) +* [drtv] Fix extraction (#18989) +* [nhk] Extend URL regular expression (#18968) +* [go] Fix Adobe Pass requests for Disney Now (#18901) ++ [openload] Add support for oload.club (#18969) + + +version 2019.01.24 + +Core +* [YoutubeDL] Fix negation for string operators in format selection (#18961) + + +version 2019.01.23 + +Core +* [utils] Fix urljoin for paths with non-http(s) schemes +* [extractor/common] Improve jwplayer relative URL handling (#18892) ++ [YoutubeDL] Add negation support for string comparisons in format selection + expressions (#18600, #18805) +* [extractor/common] Improve HLS video-only format detection (#18923) + +Extractors +* [crunchyroll] Extend URL regular expression (#18955) +* [pornhub] Bypass scrape detection (#4822, #5930, #7074, #10175, #12722, + #17197, #18338, #18842, #18899) ++ [vrv] Add support for authentication (#14307) +* [videomore:season] Fix extraction +* [videomore] Improve extraction (#18908) ++ [tnaflix] Pass Referer in metadata request (#18925) +* [radiocanada] Relax DRM check (#18608, #18609) +* [vimeo] Fix video password verification for videos protected by + Referer HTTP header ++ [hketv] Add support for hkedcity.net (#18696) ++ [streamango] Add support for fruithosts.net (#18710) ++ [instagram] Add support for tags (#18757) ++ [odnoklassniki] Detect paid videos (#18876) +* [ted] Correct acodec for HTTP formats (#18923) +* [cartoonnetwork] Fix extraction (#15664, #17224) +* [vimeo] Fix extraction for password protected player URLs (#18889) + + +version 2019.01.17 + +Extractors +* [youtube] Extend JS player signature function name regular expressions + (#18890, #18891, #18893) + + +version 2019.01.16 + +Core ++ [test/helper] Add support for maxcount and count collection len checkers +* [downloader/hls] Fix uplynk ad skipping (#18824) +* [postprocessor/ffmpeg] Improve ffmpeg version parsing (#18813) + +Extractors +* [youtube] Skip unsupported adaptive stream type (#18804) ++ [youtube] Extract DASH formats from player response (#18804) +* [funimation] Fix extraction (#14089) +* [skylinewebcams] Fix extraction (#18853) ++ [curiositystream] Add support for non app URLs ++ [bitchute] Check formats (#18833) +* [wistia] Extend URL regular expression (#18823) ++ [playplustv] Add support for playplus.com (#18789) + + +version 2019.01.10 + +Core +* [extractor/common] Use episode name as title in _json_ld ++ [extractor/common] Add support for movies in _json_ld +* [postprocessor/ffmpeg] Embed subtitles with non-standard language codes + (#18765) ++ [utils] Add language codes replaced in 1989 revision of ISO 639 + to ISO639Utils (#18765) + +Extractors +* [youtube] Extract live HLS URL from player response (#18799) ++ [outsidetv] Add support for outsidetv.com (#18774) +* [jwplatform] Use JW Platform Delivery API V2 and add support for more URLs ++ [fox] Add support for National Geographic (#17985, #15333, #14698) ++ [playplustv] Add support for playplus.tv (#18789) +* [globo] Set GLBID cookie manually (#17346) ++ [gaia] Add support for gaia.com (#14605) +* [youporn] Fix title and description extraction (#18748) ++ [hungama] Add support for hungama.com (#17402, #18771) +* [dtube] Fix extraction (#18741) +* [tvnow] Fix and rework extractors and prepare for a switch to the new API + (#17245, #18499) +* [carambatv:page] Fix extraction (#18739) + + +version 2019.01.02 + +Extractors +* [discovery] Use geo verification headers (#17838) ++
[packtpub] Add support for subscription.packtpub.com (#18718) +* [yourporn] Fix extraction (#18583) ++ [acast:channel] Add support for play.acast.com (#18587) ++ [extractors] Add missing age limits (#18621) ++ [rmcdecouverte] Add support for live stream +* [rmcdecouverte] Bypass geo restriction +* [rmcdecouverte] Update URL regular expression (#18595, #18697) +* [manyvids] Fix extraction (#18604, #18614) +* [bitchute] Fix extraction (#18567) + + +version 2018.12.31 + +Extractors ++ [bbc] Add support for another embed pattern (#18643) ++ [npo:live] Add support for npostart.nl (#18644) +* [beeg] Fix extraction (#18610, #18626) +* [youtube] Unescape HTML for series (#18641) ++ [youtube] Extract more format metadata +* [youtube] Detect DRM protected videos (#1774) +* [youtube] Relax HTML5 player regular expressions (#18465, #18466) +* [youtube] Extend HTML5 player regular expression (#17516) ++ [liveleak] Add support for another embed type and restore original + format extraction ++ [crackle] Extract ISM and HTTP formats ++ [twitter] Pass Referer with card request (#18579) +* [mediasite] Extend URL regular expression (#18558) ++ [lecturio] Add support for lecturio.de (#18562) ++ [discovery] Add support for Scripps Networks watch domains (#17947) + + +version 2018.12.17 + +Extractors +* [ard:beta] Improve geo restricted videos extraction +* [ard:beta] Fix subtitles extraction +* [ard:beta] Improve extraction robustness +* [ard:beta] Relax URL regular expression (#18441) +* [acast] Add support for embed.acast.com and play.acast.com (#18483) +* [iprima] Relax URL regular expression (#18515, #18540) +* [vrv] Fix initial state extraction (#18553) +* [youtube] Fix mark watched (#18546) ++ [safari] Add support for learning.oreilly.com (#18510) +* [youtube] Fix multifeed extraction (#18531) +* [lecturio] Improve subtitles extraction (#18488) +* [uol] Fix format URL extraction (#18480) ++ [ard:mediathek] Add support for classic.ardmediathek.de (#18473) + + +version 2018.12.09 + +Core +* [YoutubeDL] Keep session cookies in cookie file between runs +* [YoutubeDL] Recognize session cookies with expired set to 0 (#12929) + +Extractors ++ [teachable] Add support for teachable platform sites (#5451, #18150, #18272) ++ [aenetworks] Add support for historyvault.com (#18460) +* [imgur] Improve gallery and album detection and extraction (#9133, #16577, + #17223, #18404) +* [iprima] Relax URL regular expression (#18453) +* [hotstar] Fix video data extraction (#18386) +* [ard:mediathek] Fix title and description extraction (#18349, #18371) +* [xvideos] Switch to HTTPS (#18422, #18427) ++ [lecturio] Add support for lecturio.com (#18405) ++ [nrktv:series] Add support for extra materials +* [nrktv:season,series] Fix extraction (#17159, #17258) +* [nrktv] Relax URL regular expression (#18304, #18387) +* [yourporn] Fix extraction (#18424, #18425) +* [tbs] Fix info extraction (#18403) ++ [gamespot] Add support for review URLs + + +version 2018.12.03 + +Core +* [utils] Fix random_birthday to generate existing dates only (#18284) + +Extractors ++ [tiktok] Add support for tiktok.com (#18108, #18135) +* [pornhub] Use actual URL host for requests (#18359) +* [lynda] Fix authentication (#18158, #18217) +* [gfycat] Update API endpoint (#18333, #18343) ++ [hotstar] Add support for alternative app state layout (#18320) +* [azmedien] Fix extraction (#18334, #18336) ++ [vimeo] Add support for VHX (Vimeo OTT) (#14835) +* [joj] Fix extraction (#18280, #18281) ++ [wistia] Add support for fast.wistia.com (#18287) + + +version
2018.11.23 + +Core ++ [setup.py] Add more relevant classifiers + +Extractors +* [mixcloud] Fallback to hardcoded decryption key (#18016) +* [nbc:news] Fix article extraction (#16194) +* [foxsports] Fix extraction (#17543) +* [loc] Relax regular expression and improve formats extraction ++ [ciscolive] Add support for ciscolive.cisco.com (#17984) +* [nzz] Relax kaltura regex (#18228) +* [sixplay] Fix formats extraction +* [bitchute] Improve title extraction +* [kaltura] Limit requested MediaEntry fields ++ [americastestkitchen] Add support for zype embeds (#18225) ++ [pornhub] Add pornhub.net alias +* [nova:embed] Fix extraction (#18222) + + +version 2018.11.18 + +Extractors ++ [wwe] Extract subtitles ++ [wwe] Add support for playlists (#14781) ++ [wwe] Add support for wwe.com (#14781, #17450) +* [vk] Detect geo restriction (#17767) +* [openload] Use original host during extraction (#18211) +* [atvat] Fix extraction (#18041) ++ [rte] Add support for new API endpoint (#18206) +* [tnaflixnetwork:embed] Fix extraction (#18205) +* [picarto] Use API and add token support (#16518) ++ [zype] Add support for player.zype.com (#18143) +* [vivo] Fix extraction (#18139) +* [ruutu] Update API endpoint (#18138) + + +version 2018.11.07 + +Extractors ++ [youtube] Add another JS signature function name regex (#18091, #18093, + #18094) +* [facebook] Fix tahoe request (#17171) +* [cliphunter] Fix extraction (#18083) ++ [youtube:playlist] Add support for invidio.us (#18077) +* [zattoo] Arrange API hosts for derived extractors (#18035) ++ [youtube] Add fallback metadata extraction from videoDetails (#18052) + + +version 2018.11.03 + +Core +* [extractor/common] Ensure response handle is not prematurely closed before + it can be read if it matches expected_status (#17195, #17846, #17447) + +Extractors +* [laola1tv:embed] Set correct stream access URL scheme (#16341) ++ [ehftv] Add support for ehftv.com (#15408) +* [azmedien] Adapt to major site redesign (#17745, #17746) ++ [twitcasting] Add support for twitcasting.tv (#17981) +* [orf:tvthek] Fix extraction (#17737, #17956, #18024) ++ [openload] Add support for oload.fun (#18045) +* [njpwworld] Fix authentication (#17427) ++ [linkedin:learning] Add support for linkedin.com/learning (#13545) +* [theplatform] Improve error detection (#13222) +* [cnbc] Simplify extraction (#14280, #17110) ++ [cnbc] Add support for new URL schema (#14193) +* [aparat] Improve extraction and extract more metadata (#17445, #18008) +* [aparat] Fix extraction + + +version 2018.10.29 + +Core ++ [extractor/common] Add validation for JSON-LD URLs + +Extractors ++ [sportbox] Add support for matchtv.ru +* [sportbox] Fix extraction (#17978) +* [screencast] Fix extraction (#14590, #14617, #17990) ++ [openload] Add support for oload.icu ++ [ivi] Add support for ivi.tv +* [crunchyroll] Improve extraction failsafeness (#17991) +* [dailymail] Fix formats extraction (#17976) +* [viewster] Reduce format requests +* [cwtv] Handle API errors (#17905) ++ [rutube] Use geo verification headers (#17897) ++ [brightcove:legacy] Add fallbacks to brightcove:new (#13912) +- [tv3] Remove extractor (#10461, #15339) +* [ted] Fix extraction for HTTP and RTMP formats (#5941, #17572, #17894) ++ [openload] Add support for oload.cc (#17823) ++ [patreon] Extract post_file URL (#17792) +* [patreon] Fix extraction (#14502, #10471) + + +version 2018.10.05 + +Extractors +* [pluralsight] Improve authentication (#17762) +* [dailymotion] Fix extraction (#17699) +* [crunchyroll] Switch to HTTPS for RpcApi (#17749) ++
[philharmoniedeparis] Add support for pad.philharmoniedeparis.fr (#17705) +* [philharmoniedeparis] Fix extraction (#17705) ++ [jamendo] Add support for licensing.jamendo.com (#17724) ++ [openload] Add support for oload.cloud (#17710) +* [pluralsight] Fix subtitles extraction (#17726, #17728) ++ [vimeo] Add another config regular expression (#17690) +* [spike] Fix Paramount Network extraction (#17677) +* [hotstar] Fix extraction (#14694, #14931, #17637) + + +version 2018.09.26 + +Extractors +* [pluralsight] Fix subtitles extraction (#17671) +* [mediaset] Improve embed support (#17668) ++ [youtube] Add support for invidio.us (#17613) ++ [zattoo] Add support for more zattoo platform sites +* [zattoo] Fix extraction (#17175, #17542) + + +version 2018.09.18 + +Core ++ [extractor/common] Introduce channel meta fields + +Extractors +* [adobepass] Don't pollute default headers dict +* [udemy] Don't pollute default headers dict +* [twitch] Don't pollute default headers dict +* [youtube] Don't pollute default query dict (#17593) +* [crunchyroll] Prefer hardsubless formats and formats in locale language +* [vrv] Make format ids deterministic +* [vimeo] Fix ondemand playlist extraction (#14591) ++ [pornhub] Extract upload date (#17574) ++ [porntube] Extract channel meta fields ++ [vimeo] Extract channel meta fields ++ [youtube] Extract channel meta fields (#9676, #12939) +* [porntube] Fix extraction (#17541) +* [asiancrush] Fix extraction (#15630) ++ [twitch:clips] Extend URL regular expression (#17559) ++ [vzaar] Add support for HLS +* [tube8] Fix metadata extraction (#17520) +* [eporner] Extract JSON-LD (#17519) + + +version 2018.09.10 + +Core ++ [utils] Properly recognize AV1 codec (#17506) + +Extractors ++ [iprima] Add support for prima.iprima.cz (#17514) ++ [tele5] Add support for tele5.de (#7805, #7922, #17331, #17414) +* [nbc] Fix extraction of percent encoded URLs (#17374) + + +version 2018.09.08 + +Extractors +* [youtube] Fix extraction (#17457, #17464) ++ [pornhub:uservideos] Add support for new URLs (#17388) +* [iprima] Confirm adult check (#17437) +* [slideslive] Make check for video service name case-insensitive (#17429) +* [radiojavan] Fix extraction (#17151) +* [generic] Skip unsuccessful jwplayer extraction (#16735) + + +version 2018.09.01 + +Core +* [utils] Skip remote IP addresses non matching to source address' IP version + when creating a connection (#13422, #17362) + +Extractors ++ [ard] Add support for one.ard.de (#17397) +* [niconico] Fix extraction on python3 (#17393, #17407) +* [ard] Extract f4m formats +* [crunchyroll] Parse vilos media data (#17343) ++ [ard] Add support for Beta ARD Mediathek ++ [bandcamp] Extract more metadata (#13197) +* [internazionale] Fix extraction of non-available-abroad videos (#17386) + + +version 2018.08.28 + +Extractors ++ [youtube:playlist] Add support for music album playlists (OLAK5uy_ prefix) + (#17361) +* [bitchute] Fix extraction by passing custom User-Agent (#17360) +* [webofstories:playlist] Fix extraction (#16914) ++ [tvplayhome] Add support for new tvplay URLs (#17344) ++ [generic] Allow relative src for videojs embeds (#17324) ++ [xfileshare] Add support for vidto.se (#17317) ++ [vidzi] Add support for vidzi.nu (#17316) ++ [nova:embed] Add support for media.cms.nova.cz (#17282) + + +version 2018.08.22 + +Core +* [utils] Use pure browser header for User-Agent (#17236) + +Extractors ++ [kinopoisk] Add support for kinopoisk.ru (#17283) ++ [yourporn] Add support for yourporn.sexy (#17298) ++ [go] Add support for disneynow.go.com
(#16299, #17264) ++ [6play] Add support for play.rtl.hr (#17249) +* [anvato] Fallback to generic API key for access-key-to-API-key lookup + (#16788, #17254) +* [lci] Fix extraction (#17274) +* [bbccouk] Extend id URL regular expression (#17270) +* [cwtv] Fix extraction (#17256) +* [nova] Fix extraction (#17241) ++ [generic] Add support for expressen embeds +* [raywenderlich] Adapt to site redesign (#17225) ++ [redbulltv] Add support for redbull.com tv URLs (#17218) ++ [bitchute] Add support for bitchute.com (#14052) ++ [clyp] Add support for token protected media (#17184) +* [imdb] Fix extension extraction (#17167) + + +version 2018.08.04 + +Extractors +* [funk:channel] Improve byChannelAlias extraction (#17142) +* [twitch] Fix authentication (#17024, #17126) +* [twitch:vod] Improve URL regular expression (#17135) +* [watchbox] Fix extraction (#17107) +* [pbs] Fix extraction (#17109) +* [theplatform] Relax URL regular expression (#16181, #17097) ++ [viqeo] Add support for viqeo.tv (#17066) + + +version 2018.07.29 + +Extractors +* [crunchyroll:playlist] Restrict URL regular expression (#17069, #17076) ++ [pornhub] Add support for subtitles (#16924, #17088) +* [ceskatelevize] Use https for API call (#16997, #16999) +* [dailymotion:playlist] Fix extraction (#16894) +* [ted] Improve extraction +* [ted] Fix extraction for videos without nativeDownloads (#16756, #17085) +* [telecinco] Fix extraction (#17080) +* [mitele] Reduce number of requests +* [rai] Return non HTTP relinker URL intact (#17055) +* [vk] Fix extraction for inline only videos (#16923) +* [streamcloud] Fix extraction (#17054) +* [facebook] Fix tahoe player extraction with authentication (#16655) ++ [puhutv] Add support for puhutv.com (#12712, #16010, #16269) + + +version 2018.07.21 + +Core ++ [utils] Introduce url_or_none +* [utils] Allow JSONP without function name (#17028) ++ [extractor/common] Extract DASH and MSS formats from SMIL manifests + +Extractors ++ [bbc] Add support for BBC Radio Play pages (#17022) +* [iwara] Fix download URLs (#17026) +* [vrtnu] Relax title extraction and extract JSON-LD (#17018) ++ [viu] Pass Referer and Origin headers and area id (#16992) ++ [vimeo] Add another config regular expression (#17013) ++ [facebook] Extract view count (#16942) +* [dailymotion] Improve description extraction (#16984) +* [slutload] Fix and improve extraction (#17001) +* [mediaset] Fix extraction (#16977) ++ [theplatform] Add support for theplatform TLD customization (#16977) +* [imgur] Relax URL regular expression (#16987) +* [pornhub] Improve extraction and extract all formats (#12166, #15891, #16262, + #16959) + + +version 2018.07.10 + +Core +* [utils] Share JSON-LD regular expression +* [downloader/dash] Improve error handling (#16927) + +Extractors ++ [nrktv] Add support for new season and serie URL schema ++ [nrktv] Add support for new episode URL schema (#16909) ++ [frontendmasters] Add support for frontendmasters.com (#3661, #16328) +* [funk] Fix extraction (#16918) +* [watchbox] Fix extraction (#16904) +* [dplayit] Sort formats +* [dplayit] Fix extraction (#16901) +* [youtube] Improve login error handling (#13822) + + +version 2018.07.04 + +Core +* [extractor/common] Properly escape % in MPD templates (#16867) +* [extractor/common] Use source URL as Referer for HTML5 entries (#16849) +* Prefer ffmpeg over avconv by default (#8622) + +Extractors +* [pluralsight] Switch to graphql (#16889, #16895, #16896, #16899) +* [lynda] Simplify login and improve error capturing (#16891) ++ [go90] Add support for embed URLs (#16873)
+* [go90] Detect geo restriction error and pass geo verification headers + (#16874) +* [vlive] Fix live streams extraction (#16871) +* [npo] Fix typo (#16872) ++ [mediaset] Add support for new videos and extract all formats (#16568) +* [dctptv] Restore extraction based on REST API (#16850) +* [svt] Improve extraction and add support for pages (#16802) +* [porncom] Fix extraction (#16808) + + +version 2018.06.25 + +Extractors +* [joj] Relax URL regular expression (#16771) +* [brightcove] Workaround sonyliv DRM protected videos (#16807) +* [motherless] Fix extraction (#16786) +* [itv] Make SOAP request non fatal and extract metadata from webpage (#16780) +- [foxnews:insider] Remove extractor (#15810) ++ [foxnews] Add support for iframe embeds (#15810, #16711) + + +version 2018.06.19 + +Core ++ [extractor/common] Introduce expected_status in _download_* methods + for convenient accept of HTTP requests failed with non 2xx status codes ++ [compat] Introduce compat_integer_types + +Extractors +* [peertube] Improve generic support (#16733) ++ [6play] Use geo verification headers +* [rtbf] Fix extraction for python 3.2 +* [vgtv] Improve HLS formats extraction ++ [vgtv] Add support for www.aftonbladet.se/tv URLs +* [bbccouk] Use expected_status +* [markiza] Expect 500 HTTP status code +* [tvnow] Try all clear manifest URLs (#15361) + + +version 2018.06.18 + +Core +* [downloader/rtmp] Fix downloading in verbose mode (#16736) + +Extractors ++ [markiza] Add support for markiza.sk (#16750) +* [wat] Try all supported adaptive URLs ++ [6play] Add support for rtlplay.be and extract hd usp formats ++ [rtbf] Add support for audio and live streams (#9638, #11923) ++ [rtbf] Extract HLS, DASH and all HTTP formats ++ [rtbf] Extract subtitles ++ [rtbf] Fixup specific HTTP URLs (#16101) ++ [expressen] Add support for expressen.se +* [vidzi] Fix extraction (#16678) +* [pbs] Improve extraction (#16623, #16684) +* [bilibili] Restrict cid regular expression (#16638, #16734) + + +version 2018.06.14 + +Core +* [downloader/http] Fix retry on error when streaming to stdout (#16699) + +Extractors ++ [discoverynetworks] Add support for disco-api videos (#16724) ++ [dailymotion] Add support for password protected videos (#9789) ++ [abc:iview] Add support for livestreams (#12354) +* [abc:iview] Fix extraction (#16704) ++ [crackle] Add support for sonycrackle.com (#16698) ++ [tvnet] Add support for tvnet.gov.vn (#15462) +* [nrk] Update API hosts and try all previously known ones (#16690) +* [wimp] Fix Youtube embeds extraction + + +version 2018.06.11 + +Extractors +* [npo] Extend URL regular expression and add support for npostart.nl (#16682) ++ [inc] Add support for another embed schema (#16666) +* [tv4] Fix format extraction (#16650) ++ [nexx] Add support for free cdn (#16538) ++ [pbs] Add another cove id pattern (#15373) ++ [rbmaradio] Add support for 192k format (#16631) + + +version 2018.06.04 + +Extractors ++ [camtube] Add support for camtube.co ++ [twitter:card] Extract guest token (#16609) ++ [chaturbate] Use geo verification headers ++ [bbc] Add support for bbcthree (#16612) +* [youtube] Move metadata extraction after video availability check ++ [youtube] Extract track and artist ++ [safari] Add support for new URL schema (#16614) +* [adn] Fix extraction + + +version 2018.06.02 + +Core +* [utils] Improve determine_ext + +Extractors ++ [facebook] Add support for tahoe player videos (#15441, #16554) +* [cbc] Improve extraction (#16583, #16593) +* [openload] Improve ext extraction (#16595) ++ [twitter:card] Add support 
for another endpoint (#16586) ++ [openload] Add support for oload.win and oload.download (#16592) +* [audimedia] Fix extraction (#15309) ++ [francetv] Add support for sport.francetvinfo.fr (#15645) +* [mlb] Improve extraction (#16587) +- [nhl] Remove old extractors +* [rbmaradio] Check formats availability (#16585) + + +version 2018.05.30 + +Core +* [downloader/rtmp] Generalize download messages and report time elapsed + on finish +* [downloader/rtmp] Gracefully handle live streams interrupted by user + +Extractors +* [teamcoco] Fix extraction for full episodes (#16573) +* [spiegel] Fix info extraction (#16538) ++ [apa] Add support for apa.at (#15041, #15672) ++ [bellmedia] Add support for bnnbloomberg.ca (#16560) ++ [9c9media] Extract MPD formats and subtitles +* [cammodels] Use geo verification headers ++ [ufctv] Add support for authentication (#16542) ++ [cammodels] Add support for cammodels.com (#14499) +* [utils] Fix style id extraction for namespaced id attribute in dfxp2srt + (#16551) +* [soundcloud] Detect format extension (#16549) +* [cbc] Fix playlist title extraction (#16502) ++ [tumblr] Detect and report sensitive media (#13829) ++ [tumblr] Add support for authentication (#15133) + + +version 2018.05.26 + +Core +* [utils] Improve parse_age_limit + +Extractors +* [audiomack] Stringify video id (#15310) +* [izlesene] Fix extraction (#16233, #16271, #16407) ++ [indavideo] Add support for generic embeds (#11989) +* [indavideo] Fix extraction (#11221) +* [indavideo] Sign download URLs (#16174) ++ [peertube] Add support for PeerTube based sites (#16301, #16329) +* [imgur] Fix extraction (#16537) ++ [hidive] Add support for authentication (#16534) ++ [nbc] Add support for stream.nbcsports.com (#13911) ++ [viewlift] Add support for hoichoi.tv (#16536) +* [go90] Extract age limit and detect DRM protection (#10127) +* [viewlift] Fix extraction for snagfilms.com (#15766) +* [globo] Improve extraction (#4189) + * Add support for authentication + * Simplify URL signing + * Extract DASH and MSS formats +* [leeco] Fix extraction (#16464) +* [teamcoco] Add fallback for format extraction (#16484) +* [teamcoco] Improve URL regular expression (#16484) +* [imdb] Improve extraction (#4085, #14557) + + +version 2018.05.18 + +Extractors +* [vimeo:likes] Relax URL regular expression and fix single page likes + extraction (#16475) +* [pluralsight] Fix clip id extraction (#16460) ++ [mychannels] Add support for mychannels.com (#15334) +- [moniker] Remove extractor (#15336) +* [pbs] Fix embed data extraction (#16474) ++ [mtv] Add support for paramountnetwork.com and bellator.com (#15418) +* [youtube] Fix hd720 format position +* [dailymotion] Remove fragment part from m3u8 URLs (#8915) +* [3sat] Improve extraction (#15350) + * Extract all formats + * Extract more format metadata + * Improve format sorting + * Use hls native downloader + * Detect and bypass geo-restriction ++ [dtube] Add support for d.tube (#15201) +* [options] Fix typo (#16450) +* [youtube] Improve format filesize extraction (#16453) +* [youtube] Make uploader extraction non fatal (#16444) +* [youtube] Fix extraction for embed restricted live streams (#16433) +* [nbc] Improve info extraction (#16440) +* [twitch:clips] Fix extraction (#16429) +* [redditr] Relax URL regular expression (#16426, #16427) +* [mixcloud] Bypass throttling for HTTP formats (#12579, #16424) ++ [nick] Add support for nickjr.de (#13230) +* [teamcoco] Fix extraction (#16374) + + +version 2018.05.09 + +Core +* [YoutubeDL] Ensure ext exists for automatic captions +*
Introduce --geo-bypass-ip-block + +Extractors ++ [udemy] Extract asset captions ++ [udemy] Extract stream URLs (#16372) ++ [businessinsider] Add support for businessinsider.com (#16387, #16388, #16389) ++ [cloudflarestream] Add support for cloudflarestream.com (#16375) +* [watchbox] Fix extraction (#16356) +* [discovery] Extract Affiliate/Anonymous Auth Token from cookies (#14954) ++ [itv:btcc] Add support for itv.com/btcc (#16139) +* [tunein] Use live title for live streams (#16347) +* [itv] Improve extraction (#16253) + + +version 2018.05.01 + +Core +* [downloader/fragment] Restart download if .ytdl file is corrupt (#16312) ++ [extractor/common] Extract interaction statistic ++ [utils] Add merge_dicts ++ [extractor/common] Add _download_json_handle + +Extractors +* [kaltura] Improve iframe embeds detection (#16337) ++ [udemy] Extract outputs renditions (#16289, #16291, #16320, #16321, #16334, + #16335) ++ [zattoo] Add support for zattoo.com and mobiltv.quickline.com (#14668, #14676) +* [yandexmusic] Convert release_year to int +* [udemy] Override _download_webpage_handle instead of _download_webpage +* [xiami] Override _download_webpage_handle instead of _download_webpage +* [yandexmusic] Override _download_webpage_handle instead of _download_webpage +* [youtube] Correctly disable polymer on all requests (#16323, #16326) +* [generic] Prefer enclosures over links in RSS feeds (#16189) ++ [redditr] Add support for old.reddit.com URLs (#16274) +* [nrktv] Update API host (#16324) ++ [imdb] Extract all formats (#16249) ++ [vimeo] Extract JSON-LD (#16295) +* [funk:channel] Improve extraction (#16285) + + +version 2018.04.25 + +Core +* [utils] Fix match_str for boolean meta fields ++ [Makefile] Add support for pandoc 2 and disable smart extension (#16251) +* [YoutubeDL] Fix typo in media extension compatibility checker (#16215) + +Extractors ++ [openload] Recognize IPv6 stream URLs (#16136, #16137, #16205, #16246, + #16250) ++ [twitch] Extract is_live according to status (#16259) +* [pornflip] Relax URL regular expression (#16258) +- [etonline] Remove extractor (#16256) +* [breakcom] Fix extraction (#16254) ++ [youtube] Add ability to authenticate with cookies +* [youtube:feed] Implement lazy playlist extraction (#10184) ++ [svt] Add support for TV channel live streams (#15279, #15809) +* [ccma] Fix video extraction (#15931) +* [rentv] Fix extraction (#15227) ++ [nick] Add support for nickjr.nl (#16230) +* [extremetube] Fix metadata extraction ++ [keezmovies] Add support for generic embeds (#16134, #16154) +* [nexx] Extract new azure URLs (#16223) +* [cbssports] Fix extraction (#16217) +* [kaltura] Improve embeds detection (#16201) +* [instagram:user] Fix extraction (#16119) +* [cbs] Skip DRM asset types (#16104) + + +version 2018.04.16 + +Extractors +* [smotri:broadcast] Fix extraction (#16180) ++ [picarto] Add support for picarto.tv (#6205, #12514, #15276, #15551) +* [vine:user] Fix extraction (#15514, #16190) +* [pornhub] Relax URL regular expression (#16165) +* [cbc:watch] Re-acquire device token when expired (#16160) ++ [fxnetworks] Add support for https theplatform URLs (#16125, #16157) ++ [instagram:user] Add request signing (#16119) ++ [twitch] Add support for mobile URLs (#16146) + + +version 2018.04.09 + +Core +* [YoutubeDL] Do not save/restore console title while simulate (#16103) +* [extractor/common] Relax JSON-LD context check (#16006) + +Extractors ++ [generic] Add support for tube8 embeds ++ [generic] Add support for share-videos.se embeds (#16089, #16115) +* [odnoklassniki] 
Extend URL regular expression (#16081) +* [steam] Bypass mature content check (#16113) ++ [acast] Extract more metadata +* [acast] Fix extraction (#16118) +* [instagram:user] Fix extraction (#16119) +* [drtuber] Fix title extraction (#16107, #16108) +* [liveleak] Extend URL regular expression (#16117) ++ [openload] Add support for oload.xyz +* [openload] Relax stream URL regular expression +* [openload] Fix extraction (#16099) ++ [svtplay:series] Add support for season URLs ++ [svtplay:series] Add support for series (#11130, #16059) + + +version 2018.04.03 + +Extractors ++ [tvnow] Add support for shows (#15837) +* [dramafever] Fix authentication (#16067) +* [afreecatv] Use partial view only when necessary (#14450) ++ [afreecatv] Add support for authentication (#14450) ++ [nationalgeographic] Add support for new URL schema (#16001, #16054) +* [xvideos] Fix thumbnail extraction (#15978, #15979) +* [medialaan] Fix vod id (#16038) ++ [openload] Add support for oload.site (#16039) +* [naver] Fix extraction (#16029) +* [dramafever] Partially switch to API v5 (#16026) +* [abc:iview] Unescape title and series meta fields (#15994) +* [videa] Extend URL regular expression (#16003) + + +version 2018.03.26.1 + +Core ++ [downloader/external] Add elapsed time to progress hook (#10876) +* [downloader/external,fragment] Fix download finalization when writing file + to stdout (#10809, #10876, #15799) + +Extractors +* [vrv] Fix extraction on python2 (#15928) +* [afreecatv] Update referrer (#15947) ++ [24video] Add support for 24video.sexy (#15973) +* [crackle] Bypass geo restriction +* [crackle] Fix extraction (#15969) ++ [lenta] Add support for lenta.ru (#15953) ++ [instagram:user] Add pagination (#15934) +* [youku] Update ccode (#15939) +* [libsyn] Adapt to new page structure + + +version 2018.03.20 + +Core +* [extractor/common] Improve thumbnail extraction for HTML5 entries +* Generalize XML manifest processing code and improve XSPF parsing ++ [extractor/common] Add _download_xml_handle ++ [extractor/common] Add support for relative URIs in _parse_xspf (#15794) + +Extractors ++ [7plus] Extract series metadata (#15862, #15906) +* [9now] Bypass geo restriction (#15920) +* [cbs] Skip unavailable assets (#13490, #13506, #15776) ++ [canalc2] Add support for HTML5 videos (#15916, #15919) ++ [ceskatelevize] Add support for iframe embeds (#15918) ++ [prosiebensat1] Add support for galileo.tv (#15894) ++ [generic] Add support for xfileshare embeds (#15879) +* [bilibili] Switch to v2 playurl API +* [bilibili] Fix and improve extraction (#15048, #15430, #15622, #15863) +* [heise] Improve extraction (#15496, #15784, #15026) +* [instagram] Fix user videos extraction (#15858) + + +version 2018.03.14 + +Extractors +* [soundcloud] Update client id (#15866) ++ [tennistv] Add support for tennistv.com ++ [line] Add support for tv.line.me (#9427) +* [xnxx] Fix extraction (#15817) +* [njpwworld] Fix authentication (#15815) + + +version 2018.03.10 + +Core +* [downloader/hls] Skip uplynk ad fragments (#15748) + +Extractors +* [pornhub] Don't override session cookies (#15697) ++ [raywenderlich] Add support for videos.raywenderlich.com (#15251) +* [funk] Fix extraction and rework extractors (#15792) +* [nexx] Restore reverse engineered approach ++ [heise] Add support for kaltura embeds (#14961, #15728) ++ [tvnow] Extract series metadata (#15774) +* [ruutu] Continue formats extraction on NOT-USED URLs (#15775) +* [vrtnu] Use redirect URL for building video JSON URL (#15767, #15769) +* [vimeo] Modernize login code and improve 
error messaging +* [archiveorg] Fix extraction (#15770, #15772) ++ [hidive] Add support for hidive.com (#15494) +* [afreecatv] Detect deleted videos +* [afreecatv] Fix extraction (#15755) +* [vice] Fix extraction and rework extractors (#11101, #13019, #13622, #13778) ++ [vidzi] Add support for vidzi.si (#15751) +* [npo] Fix typo + + +version 2018.03.03 + +Core ++ [utils] Add parse_resolution +Revert respect --prefer-insecure while updating + +Extractors ++ [yapfiles] Add support for yapfiles.ru (#15726, #11085) +* [spankbang] Fix formats extraction (#15727) +* [adn] Fix extraction (#15716) ++ [toggle] Extract DASH and ISM formats (#15721) ++ [nickelodeon] Add support for nickelodeon.com.tr (#15706) +* [npo] Validate and filter format URLs (#15709) + + +version 2018.02.26 + +Extractors +* [udemy] Use custom User-Agent (#15571) + + +version 2018.02.25 + +Core +* [postprocessor/embedthumbnail] Skip embedding when there aren't any + thumbnails (#12573) +* [extractor/common] Improve jwplayer subtitles extraction (#15695) + +Extractors ++ [vidlii] Add support for vidlii.com (#14472, #14512, #14779) ++ [streamango] Capture and output error messages +* [streamango] Fix extraction (#14160, #14256) ++ [telequebec] Add support for emissions (#14649, #14655) ++ [telequebec:live] Add support for live streams (#15688) ++ [mailru:music] Add support for mail.ru/music (#15618) +* [aenetworks] Switch to akamai HLS formats (#15612) +* [ytsearch] Fix flat title extraction (#11260, #15681) + + +version 2018.02.22 + +Core ++ [utils] Fixup some common URL typos in sanitize_url (#15649) +* Respect --prefer-insecure while updating (#15497) + +Extractors +* [vidio] Fix HLS URL extraction (#15675) ++ [nexx] Add support for arc.nexx.cloud URLs +* [nexx] Switch to arc API (#15652) +* [redtube] Fix duration extraction (#15659) ++ [sonyliv] Respect referrer (#15648) ++ [brightcove:new] Use referrer for formats' HTTP headers ++ [cbc] Add support for olympics.cbc.ca (#15535) ++ [fusion] Add support for fusion.tv (#15628) +* [npo] Improve quality metadata extraction +* [npo] Relax URL regular expression (#14987, #14994) ++ [npo] Capture and output error message ++ [pornhub] Add support for channels (#15613) +* [youtube] Handle shared URLs with generic extractor (#14303) + + +version 2018.02.11 + +Core ++ [YoutubeDL] Add support for filesize_approx in format selector (#15550) + +Extractors ++ [francetv] Add support for live streams (#13689) ++ [francetv] Add support for zouzous.fr and ludo.fr (#10454, #13087, #13103, + #15012) +* [francetv] Separate main extractor and rework others to delegate to it +* [francetv] Improve manifest URL signing (#15536) ++ [francetv] Sign m3u8 manifest URLs (#15565) ++ [veoh] Add support for embed URLs (#15561) +* [afreecatv] Fix extraction (#15556) +* [periscope] Use accessVideoPublic endpoint (#15554) +* [discovery] Fix auth request (#15542) ++ [6play] Extract subtitles (#15541) +* [newgrounds] Fix metadata extraction (#15531) ++ [nbc] Add support for stream.nbcolympics.com (#10295) +* [dvtv] Fix live streams extraction (#15442) + + +version 2018.02.08 + +Extractors ++ [myvi] Extend URL regular expression ++ [myvi:embed] Add support for myvi.tv embeds (#15521) ++ [prosiebensat1] Extend URL regular expression (#15520) +* [pokemon] Relax URL regular expression and extend title extraction (#15518) ++ [gameinformer] Use geo verification headers +* [la7] Fix extraction (#15501, #15502) +* [gameinformer] Fix brightcove id extraction (#15416) ++ [afreecatv] Pass referrer to video info request 
(#15507) ++ [telebruxelles] Add support for live streams +* [telebruxelles] Relax URL regular expression +* [telebruxelles] Fix extraction (#15504) +* [extractor/common] Respect secure schemes in _extract_wowza_formats + + +version 2018.02.04 + +Core +* [downloader/http] Randomize HTTP chunk size ++ [downloader/http] Add ability to pass downloader options via info dict +* [downloader/http] Fix 302 infinite loops by not reusing requests ++ Document http_chunk_size + +Extractors ++ [brightcove] Pass embed page URL as referrer (#15486) ++ [youtube] Enforce using chunked HTTP downloading for DASH formats + + +version 2018.02.03 + +Core ++ Introduce --http-chunk-size for chunk-based HTTP downloading ++ Add support for IronPython +* [downloader/ism] Fix Python 3.2 support + +Extractors +* [redbulltv] Fix extraction (#15481) +* [redtube] Fix metadata extraction (#15472) +* [pladform] Respect platform id and extract HLS formats (#15468) +- [rtlnl] Remove progressive formats (#15459) +* [6play] Do not modify asset URLs with a token (#15248) +* [nationalgeographic] Relax URL regular expression +* [dplay] Relax URL regular expression (#15458) +* [cbsinteractive] Fix data extraction (#15451) ++ [amcnetworks] Add support for sundancetv.com (#9260) + + +version 2018.01.27 + +Core +* [extractor/common] Improve _json_ld for articles +* Switch codebase to use compat_b64decode ++ [compat] Add compat_b64decode + +Extractors ++ [seznamzpravy] Add support for seznam.cz and seznamzpravy.cz (#14102, #14616) +* [dplay] Bypass geo restriction ++ [dplay] Add support for disco-api videos (#15396) +* [youtube] Extract precise error messages (#15284) +* [teachertube] Capture and output error message +* [teachertube] Fix and relax thumbnail extraction (#15403) ++ [prosiebensat1] Add another clip id regular expression (#15378) +* [tbs] Update tokenizer url (#15395) +* [mixcloud] Use compat_b64decode (#15394) +- [thesixtyone] Remove extractor (#15341) + + +version 2018.01.21 + +Core +* [extractor/common] Improve jwplayer DASH formats extraction (#9242, #15187) +* [utils] Improve scientific notation handling in js_to_json (#14789) + +Extractors ++ [southparkdk] Add support for southparkstudios.nu ++ [southpark] Add support for collections (#14803) +* [franceinter] Fix upload date extraction (#14996) ++ [rtvs] Add support for rtvs.sk (#9242, #15187) +* [restudy] Fix extraction and extend URL regular expression (#15347) +* [youtube:live] Improve live detection (#15365) ++ [springboardplatform] Add support for springboardplatform.com +* [prosiebensat1] Add another clip id regular expression (#15290) +- [ringtv] Remove extractor (#15345) + + +version 2018.01.18 + +Extractors +* [soundcloud] Update client id (#15306) +- [kamcord] Remove extractor (#15322) ++ [spiegel] Add support for nexx videos (#15285) +* [twitch] Fix authentication and error capture (#14090, #15264) +* [vk] Detect more errors due to copyright complaints (#15259) + + +version 2018.01.14 + +Extractors +* [youtube] Fix live streams extraction (#15202) +* [wdr] Bypass geo restriction +* [wdr] Rework extractors (#14598) ++ [wdr] Add support for wdrmaus.de/elefantenseite (#14598) ++ [gamestar] Add support for gamepro.de (#3384) +* [viafree] Skip rtmp formats (#15232) ++ [pandoratv] Add support for mobile URLs (#12441) ++ [pandoratv] Add support for new URL format (#15131) ++ [ximalaya] Add support for ximalaya.com (#14687) ++ [digg] Add support for digg.com (#15214) +* [limelight] Tolerate empty pc formats (#15150, #15151, #15207) +* [ndr:embed:base] Make separate
formats extraction non fatal (#15203) ++ [weibo] Add extractor (#15079) ++ [ok] Add support for live streams +* [canalplus] Fix extraction (#15072) +* [bilibili] Fix extraction (#15188) + + +version 2018.01.07 + +Core +* [utils] Fix youtube-dl under PyPy3 on Windows +* [YoutubeDL] Output python implementation in debug header + +Extractors ++ [jwplatform] Add support for multiple embeds (#15192) +* [mitele] Fix extraction (#15186) ++ [motherless] Add support for groups (#15124) +* [lynda] Relax URL regular expression (#15185) +* [soundcloud] Fallback to avatar picture for thumbnail (#12878) +* [youku] Fix list extraction (#15135) +* [openload] Fix extraction (#15166) +* [lynda] Skip invalid subtitles (#15159) +* [twitch] Pass video id to url_result when extracting playlist (#15139) +* [rtve.es:alacarta] Fix extraction of some new URLs +* [acast] Fix extraction (#15147) + + +version 2017.12.31 + +Core ++ [extractor/common] Add container meta field for formats extracted + in _parse_mpd_formats (#13616) ++ [downloader/hls] Use HTTP headers for key request +* [common] Use AACL as the default fourcc when AudioTag is 255 +* [extractor/common] Fix extraction of DASH formats with the same + representation id (#15111) + +Extractors ++ [slutload] Add support for mobile URLs (#14806) +* [abc:iview] Bypass geo restriction +* [abc:iview] Fix extraction (#14711, #14782, #14838, #14917, #14963, #14985, + #15035, #15057, #15061, #15071, #15095, #15106) +* [openload] Fix extraction (#15118) +- [sandia] Remove extractor +- [collegerama] Remove extractor ++ [mediasite] Add support for sites based on Mediasite Video Platform (#5428, + #11185, #14343) ++ [ufctv] Add support for ufc.tv (#14520) +* [pluralsight] Fix missing first line of subtitles (#11118) +* [openload] Fallback on f-page extraction (#14665, #14879) +* [vimeo] Improve password protected videos extraction (#15114) +* [aws] Fix canonical/signed headers generation on python 2 (#15102) + + +version 2017.12.28 + +Extractors ++ [internazionale] Add support for internazionale.it (#14973) +* [playtvak] Relax video regular expression and make description optional + (#15037) ++ [filmweb] Add support for filmweb.no (#8773, #10368) ++ [23video] Add support for 23video.com ++ [espn] Add support for fivethirtyeight.com (#6864) ++ [umg:de] Add support for universal-music.de (#11582, #11584) ++ [espn] Add support for espnfc and extract more formats (#8053) +* [youku] Update ccode (#14880) ++ [openload] Add support for oload.stream (#15070) +* [youku] Fix list extraction (#15065) + + +version 2017.12.23 + +Core +* [extractor/common] Move X-Forwarded-For setup code into _request_webpage ++ [YoutubeDL] Add support for playlist_uploader and playlist_uploader_id in + output template (#11427, #15018) ++ [extractor/common] Introduce uploader, uploader_id and uploader_url + meta fields for playlists (#11427, #15018) +* [downloader/fragment] Encode filename of fragment being removed (#15020) ++ [utils] Add another date format pattern (#14999) + +Extractors ++ [kaltura] Add another embed pattern for entry_id ++ [7plus] Add support for 7plus.com.au (#15043) +* [animeondemand] Relax login error regular expression ++ [shahid] Add support for show pages (#7401) ++ [youtube] Extract uploader, uploader_id and uploader_url for playlists + (#11427, #15018) +* [afreecatv] Improve format extraction (#15019) ++ [cspan] Add support for audio only pages and catch page errors (#14995) ++ [mailru] Add support for embed URLs (#14904) +* [crunchyroll] Future-proof XML element checks 
(#15013) +* [cbslocal] Fix timestamp extraction (#14999, #15000) +* [discoverygo] Correct TTML subtitle extension +* [vk] Make view count optional (#14979) +* [disney] Skip Apple FairPlay formats (#14982) +* [voot] Fix format extraction (#14758) + + +version 2017.12.14 + +Core +* [postprocessor/xattr] Clarify NO_SPACE message (#14970) +* [downloader/http] Return actual download result from real_download (#14971) + +Extractors ++ [itv] Extract more subtitles and duration +* [itv] Improve extraction (#14944) ++ [byutv] Add support for geo restricted videos +* [byutv] Fix extraction (#14966, #14967) ++ [bbccouk] Fix extraction for 320k HLS streams ++ [toutv] Add support for special video URLs (#14179) +* [discovery] Fix free videos extraction (#14157, #14954) +* [tvnow] Fix extraction (#7831) ++ [nickelodeon:br] Add support for nickelodeon Brazil websites (#14893) +* [nick] Improve extraction (#14876) +* [tbs] Fix extraction (#13658) + + +version 2017.12.10 + +Core ++ [utils] Add sami mimetype to mimetype2ext + +Extractors +* [culturebox] Improve video id extraction (#14947) +* [twitter] Improve extraction (#14197) ++ [udemy] Extract more HLS formats +* [udemy] Improve course id extraction (#14938) ++ [stretchinternet] Add support for portal.stretchinternet.com (#14576) +* [ellentube] Fix extraction (#14407, #14570) ++ [raiplay:playlist] Add support for playlists (#14563) +* [sonyliv] Bypass geo restriction +* [sonyliv] Extract higher quality formats (#14922) +* [fox] Extract subtitles ++ [fox] Add support for Adobe Pass authentication (#14205, #14489) +- [dailymotion:cloud] Remove extractor (#6794) +* [xhamster] Fix thumbnail extraction (#14780) ++ [xhamster] Add support for mobile URLs (#14780) +* [generic] Don't pass video id as mpd id while extracting DASH (#14902) +* [ard] Skip invalid stream URLs (#14906) +* [porncom] Fix metadata extraction (#14911) +* [pluralsight] Detect agreement request (#14913) +* [toutv] Fix login (#14614) + + +version 2017.12.02 + +Core ++ [downloader/fragment] Commit part file after each fragment ++ [extractor/common] Add durations for DASH fragments with bare SegmentURLs ++ [extractor/common] Add support for DASH manifests with SegmentLists with + bare SegmentURLs (#14844) ++ [utils] Add hvc1 codec code to parse_codecs + +Extractors +* [xhamster] Fix extraction (#14884) +* [youku] Update ccode (#14872) +* [mnet] Fix format extraction (#14883) ++ [xiami] Add Referer header to API request +* [mtv] Correct scc extension in extracted subtitles (#13730) +* [vvvvid] Fix extraction for kenc videos (#13406) ++ [br] Add support for BR Mediathek videos (#14560, #14788) ++ [daisuki] Add support for motto.daisuki.com (#14681) +* [odnoklassniki] Fix API metadata request (#14862) +* [itv] Fix HLS formats extraction ++ [pbs] Add another media id regular expression + + +version 2017.11.26 + +Core +* [extractor/common] Use final URL when dumping request (#14769) + +Extractors +* [fczenit] Fix extraction +- [firstpost] Remove extractor +* [freespeech] Fix extraction +* [nexx] Extract more formats ++ [openload] Add support for openload.link (#14763) +* [empflix] Relax URL regular expression +* [empflix] Fix extraction +* [tnaflix] Don't modify download URLs (#14811) +- [gamersyde] Remove extractor +* [francetv:generationwhat] Fix extraction ++ [massengeschmacktv] Add support for Massengeschmack TV +* [fox9] Fix extraction +* [faz] Fix extraction and add support for Perform Group embeds (#14714) ++ [performgroup] Add support for performgroup.com ++ [jwplatform] Add support for
iframes (#14828) +* [culturebox] Fix extraction (#14827) +* [youku] Fix extraction; update ccode (#14815) +* [livestream] Make SMIL extraction non fatal (#14792) ++ [drtuber] Add support for mobile URLs (#14772) ++ [spankbang] Add support for mobile URLs (#14771) +* [instagram] Fix description, timestamp and counters extraction (#14755) + + +version 2017.11.15 + +Core +* [common] Skip Apple FairPlay m3u8 manifests (#14741) +* [YoutubeDL] Fix playlist range optimization for --playlist-items (#14740) + +Extractors +* [vshare] Capture and output error message +* [vshare] Fix extraction (#14473) +* [crunchyroll] Extract old RTMP formats +* [tva] Fix extraction (#14736) +* [gamespot] Lower preference of HTTP formats (#14652) +* [instagram:user] Fix extraction (#14699) +* [ccma] Fix typo (#14730) +- Remove sensitive data from logging in messages ++ [gamespot] Add support for article URLs (#14652) +* [gamespot] Skip Brightcove Once HTTP formats (#14652) +* [cartoonnetwork] Update tokenizer_src (#14666) ++ [wsj] Recognize another URL pattern (#14704) +* [pandatv] Update API URL and sign format URLs (#14693) +* [crunchyroll] Use old login method (#11572) + + +version 2017.11.06 + +Core ++ [extractor/common] Add protocol for f4m formats +* [f4m] Prefer baseURL for relative URLs (#14660) +* [extractor/common] Respect URL query in _extract_wowza_formats (#14645) + +Extractors ++ [hotstar:playlist] Add support for playlists (#12465) +* [hotstar] Bypass geo restriction (#14672) +- [22tracks] Remove extractor (#11024, #14628) ++ [skysport] Add support for ooyala videos protected with embed_token (#14641) +* [gamespot] Extract formats referenced with new data fields (#14652) +* [spankbang] Detect unavailable videos (#14644) + + +version 2017.10.29 + +Core +* [extractor/common] Prefix format id for audio only HLS formats ++ [utils] Add support for zero years and months in parse_duration + +Extractors +* [egghead] Fix extraction (#14388) ++ [fxnetworks] Extract series metadata (#14603) ++ [younow] Add support for younow.com (#9255, #9432, #12436) +* [dctptv] Fix extraction (#14599) +* [youtube] Restrict embed regular expression (#14600) +* [vimeo] Restrict iframe embed regular expression (#14600) +* [soundgasm] Improve extraction (#14588) +- [myvideo] Remove extractor (#8557) ++ [nbc] Add support for classic-tv videos (#14575) ++ [vrtnu] Add support for cookies authentication and simplify (#11873) ++ [canvas] Add support for vrt.be/vrtnu (#11873) +* [twitch:clips] Fix title extraction (#14566) ++ [ndtv] Add support for sub-sites (#14534) +* [dramafever] Fix login error message extraction ++ [nick] Add support for more nickelodeon sites (no, dk, se, ch, fr, es, pt, + ro, hu) (#14553) + + +version 2017.10.20 + +Core +* [downloader/fragment] Report warning instead of error on inconsistent + download state +* [downloader/hls] Fix total fragments count when ad fragments exist + +Extractors +* [parliamentliveuk] Fix extraction (#14524) +* [soundcloud] Update client id (#14546) ++ [servus] Add support for servus.com (#14362) ++ [unity] Add support for unity3d.com (#14528) +* [youtube] Replace youtube redirect URLs in description (#14517) +* [pbs] Restrict direct video URL regular expression (#14519) +* [drtv] Respect preference for direct HTTP formats (#14509) ++ [eporner] Add support for embed URLs (#14507) +* [arte] Capture and output error message +* [niconico] Improve uploader metadata extraction robustness (#14135) + +
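The [downloader/hls] entries in version 2017.10.20 above and version 2017.10.15.1 below both deal with ad segments spliced into HLS playlists. A simplified sketch of the detection idea, assuming the Anvato-style marker; the helper name and exact parsing are illustrative, not youtube-dl's downloader/hls.py:

    def split_fragment_counts(manifest):
        # Count media vs. ad segments in an m3u8 media playlist, treating an
        # Anvato 'type=ad' comment tag as flagging the next segment URL.
        media_frags = ad_frags = 0
        ad_frag_next = False
        for line in manifest.splitlines():
            line = line.strip()
            if not line:
                continue
            if line.startswith('#'):
                if line.startswith('#ANVATO-SEGMENT-INFO') and 'type=ad' in line:
                    ad_frag_next = True
                continue
            if ad_frag_next:
                ad_frags += 1  # reported in the count, skipped for download
                ad_frag_next = False
            else:
                media_frags += 1
        return media_frags, ad_frags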
+version 2017.10.15.1 + +Core +* [downloader/hls] Ignore anvato ad fragments (#14496) +* [downloader/fragment] Output ad fragment count + +Extractors +* [scrippsnetworks:watch] Bypass geo restriction ++ [anvato] Add ability to bypass geo restriction +* [redditr] Fix extraction for URLs with query (#14495) + + +version 2017.10.15 + +Core ++ [common] Add support for jwplayer youtube embeds + +Extractors +* [scrippsnetworks:watch] Fix extraction (#14389) +* [anvato] Process master m3u8 manifests +* [youtube] Fix relative URLs in description +* [spike] Bypass geo restriction ++ [howstuffworks] Add support for more domains +* [infoq] Fix http format downloading ++ [rtlnl] Add support for another type of embeds ++ [onionstudios] Add support for bulbs-video embeds +* [udn] Fix extraction +* [shahid] Fix extraction (#14448) +* [kaltura] Ignore Widevine encrypted video (.wvm) (#14471) +* [vh1] Fix extraction (#9613) + + +version 2017.10.12 + +Core +* [YoutubeDL] Improve _default_format_spec (#14461) + +Extractors +* [steam] Fix extraction (#14067) ++ [funk] Add support for funk.net (#14464) ++ [nexx] Add support for shortcuts and relax domain id extraction ++ [voxmedia] Add support for recode.net (#14173) ++ [once] Add support for vmap URLs ++ [generic] Add support for channel9 embeds (#14469) +* [tva] Fix extraction (#14328) ++ [tubitv] Add support for new URL format (#14460) +- [afreecatv:global] Remove extractor +- [youtube:shared] Remove extractor (#14420) ++ [slideslive] Add support for slideslive.com (#2680) ++ [facebook] Support thumbnails (#14416) +* [vvvvid] Fix episode number extraction (#14456) +* [hrti:playlist] Relax URL regular expression +* [wdr] Relax media link regular expression (#14447) +* [hrti] Relax URL regular expression (#14443) +* [fox] Delegate extraction to uplynk:preplay (#14147) ++ [youtube] Add support for hooktube.com (#14437) + + +version 2017.10.07 + +Core +* [YoutubeDL] Ignore duplicates in --playlist-items +* [YoutubeDL] Fix out of range --playlist-items for iterable playlists and + reduce code duplication (#14425) ++ [utils] Use cache in OnDemandPagedList by default +* [postprocessor/ffmpeg] Convert to opus using libopus (#14381) + +Extractors +* [reddit] Sort formats (#14430) +* [lnkgo] Relax URL regular expression (#14423) +* [pornflip] Extend URL regular expression (#14405, #14406) ++ [xtube] Add support for embed URLs (#14417) ++ [xvideos] Add support for embed URLs and improve extraction (#14409) +* [beeg] Fix extraction (#14403) +* [tvn24] Relax URL regular expression (#14395) +* [nbc] Fix extraction (#13651, #13715, #14137, #14198, #14312, #14314, #14378, + #14392, #14414, #14419, #14431) ++ [ketnet] Add support for videos without direct sources (#14377) +* [canvas] Generalize mediazone.vrt.be extractor and rework canvas and een ++ [afreecatv] Add support for adult videos (#14376) + + +version 2017.10.01 + +Core +* [YoutubeDL] Document youtube_include_dash_manifest + +Extractors ++ [tvp] Add support for new URL schema (#14368) ++ [generic] Add support for single format Video.js embeds (#14371) +* [yahoo] Bypass geo restriction for brightcove (#14210) +* [yahoo] Use extracted brightcove account id (#14210) +* [rtve:alacarta] Fix extraction (#14290) ++ [yahoo] Add support for custom brightcove embeds (#14210) ++ [generic] Add support for Video.js embeds ++ [gfycat] Add support for /gifs/detail URLs (#14322) +* [generic] Fix infinite recursion for twitter:player URLs (#14339) +* [xhamsterembed] Fix extraction (#14308) + + +version 2017.09.24 + +Core ++ [options] Accept lrc as a subtitle conversion target
format (#14292) +* [utils] Fix handling raw TTML subtitles (#14191) + +Extractors +* [24video] Fix timestamp extraction and make non fatal (#14295) ++ [24video] Add support for 24video.adult (#14295) ++ [kakao] Add support for tv.kakao.com (#12298, #14007) ++ [twitter] Add support for URLs without user id (#14270) ++ [americastestkitchen] Add support for americastestkitchen.com (#10764, + #13996) +* [generic] Fix support for multiple HTML5 videos on one page (#14080) +* [mixcloud] Fix extraction (#14088, #14132) ++ [lynda] Add support for educourse.ga (#14286) +* [beeg] Fix extraction (#14275) +* [nbcsports:vplayer] Correct theplatform URL (#13873) +* [twitter] Fix duration extraction (#14141) +* [tvplay] Bypass geo restriction ++ [heise] Add support for YouTube embeds (#14109) ++ [popcorntv] Add support for popcorntv.it (#5914, #14211) +* [viki] Update app data (#14181) +* [morningstar] Relax URL regular expression (#14222) +* [openload] Fix extraction (#14225, #14257) +* [noovo] Fix extraction (#14214) +* [dailymotion:playlist] Relax URL regular expression (#14219) ++ [twitch] Add support for go.twitch.tv URLs (#14215) +* [vgtv] Relax URL regular expression (#14223) + + +version 2017.09.15 + +Core +* [downloader/fragment] Restart inconsistent incomplete fragment downloads + (#13731) +* [YoutubeDL] Download raw subtitles files (#12909, #14191) + +Extractors +* [condenast] Fix extraction (#14196, #14207) ++ [orf] Add support for f4m stories +* [tv4] Relax URL regular expression (#14206) +* [animeondemand] Bypass geo restriction ++ [animeondemand] Add support for flash videos (#9944) + + +version 2017.09.11 + +Extractors +* [rutube:playlist] Fix suitable (#14166) + + +version 2017.09.10 + +Core ++ [utils] Introduce bool_or_none +* [YoutubeDL] Ensure dir existence for each requested format (#14116) + +Extractors +* [fox] Fix extraction (#14147) +* [rutube] Use bool_or_none +* [rutube] Rework and generalize playlist extractors (#13565) ++ [rutube:playlist] Add support for playlists (#13534, #13565) ++ [radiocanada] Add fallback for title extraction (#14145) +* [vk] Use dedicated YouTube embeds extraction routine +* [vice] Use dedicated YouTube embeds extraction routine +* [cracked] Use dedicated YouTube embeds extraction routine +* [chilloutzone] Use dedicated YouTube embeds extraction routine +* [abcnews] Use dedicated YouTube embeds extraction routine +* [youtube] Separate methods for embeds extraction +* [redtube] Fix formats extraction (#14122) +* [arte] Relax unavailability check (#14112) ++ [manyvids] Add support for preview videos from manyvids.com (#14053, #14059) +* [vidme:user] Relax URL regular expression (#14054) +* [bpb] Fix extraction (#14043, #14086) +* [soundcloud] Fix download URL with private tracks (#14093) +* [aliexpress:live] Add support for live.aliexpress.com (#13698, #13707) +* [viidea] Capture and output lecture error message (#14099) +* [radiocanada] Skip unsupported platforms (#14100) + + +version 2017.09.02 + +Extractors +* [youtube] Force old layout for each webpage (#14068, #14072, #14074, #14076, + #14077, #14079, #14082, #14083, #14094, #14095, #14096) +* [youtube] Fix upload date extraction (#14065) ++ [charlierose] Add support for episodes (#14062) ++ [bbccouk] Add support for w-prefixed ids (#14056) +* [googledrive] Extend URL regular expression (#9785) ++ [googledrive] Add support for source format (#14046) +* [pornhd] Fix extraction (#14005) + + +version 2017.08.27.1 + +Extractors + +* [youtube] Fix extraction with --youtube-skip-dash-manifest enabled 
(#14037) + + +version 2017.08.27 + +Core ++ [extractor/common] Extract height and format id for HTML5 videos (#14034) +* [downloader/http] Rework HTTP downloader (#506, #809, #2849, #4240, #6023, + #8625, #9483) + * Simplify code and split into separate routines to facilitate maintenance + * Make retry mechanism work on errors during actual download not only + during connection establishment phase + * Retry on ECONNRESET and ETIMEDOUT during reading data from network + * Retry on content too short + * Show error description on retry + +Extractors +* [generic] Lower preference for extraction from LD-JSON +* [rai] Fix audio formats extraction (#14024) +* [youtube] Fix controversy videos extraction (#14027, #14029) +* [mixcloud] Fix extraction (#14015, #14020) + +
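The HTTP downloader rework in version 2017.08.27 above extends retries beyond the connection phase to the read phase and to truncated payloads. A minimal sketch of that retry shape, assuming a caller-supplied open_stream callable (illustrative only, not the rewritten downloader/http.py):

    import socket
    import time

    def download_with_retries(open_stream, expected_len=None, retries=10):
        # open_stream: callable returning a file-like object for the payload.
        for attempt in range(1, retries + 1):
            try:
                data = open_stream().read()
                # Treat short content as retryable, like a read error
                if expected_len is not None and len(data) < expected_len:
                    raise socket.error('Content too short: %d/%d bytes'
                                       % (len(data), expected_len))
                return data
            except socket.error as err:
                # ECONNRESET/ETIMEDOUT raised mid-read land here too; the
                # error description is shown on every retry
                print('Got error: %s. Retrying (attempt %d of %d)...'
                      % (err, attempt, retries))
                time.sleep(1)
        raise IOError('Giving up after %d retries' % retries)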
+version 2017.08.23 + +Core ++ [extractor/common] Introduce _parse_xml +* [extractor/common] Make HLS and DASH extraction in _parse_html5_media_entries + non fatal (#13970) +* [utils] Fix unescapeHTML for malformed string like "&a"" (#13935) + +Extractors +* [cbc:watch] Bypass geo restriction (#13993) +* [toutv] Relax DRM check (#13994) ++ [googledrive] Add support for subtitles (#13619, #13638) +* [pornhub] Relax uploader regular expression (#13906, #13975) +* [bandcamp:album] Extract track titles (#13962) ++ [bbccouk] Add support for events URLs (#13893) ++ [liveleak] Support multi-video pages (#6542) ++ [liveleak] Support another liveleak embedding pattern (#13336) +* [cda] Fix extraction (#13935) ++ [laola1tv] Add support for tv.ittf.com (#13965) +* [mixcloud] Fix extraction (#13958, #13974, #13980, #14003) + + +version 2017.08.18 + +Core +* [YoutubeDL] Sanitize byte string format URLs (#13951) ++ [extractor/common] Add support for float durations in _parse_mpd_formats + (#13919) + +Extractors +* [arte] Detect unavailable videos (#13945) +* [generic] Convert redirect URLs to unicode strings (#13951) +* [udemy] Fix paid course detection (#13943) +* [pluralsight] Use RPC API for course extraction (#13937) ++ [clippit] Add support for clippituser.tv ++ [qqmusic] Support new URL schemes (#13805) +* [periscope] Renew HLS extraction (#13917) +* [mixcloud] Extract decrypt key + + +version 2017.08.13 + +Core +* [YoutubeDL] Make sure format id is not empty +* [extractor/common] Make _family_friendly_search optional +* [extractor/common] Respect source's type attribute for HTML5 media (#13892) + +Extractors +* [pornhub:playlistbase] Skip videos from drop-down menu (#12819, #13902) ++ [fourtube] Add support for pornerbros.com (#6022) ++ [fourtube] Add support for porntube.com (#7859, #13901) ++ [fourtube] Add support for fux.com +* [limelight] Improve embeds detection (#13895) ++ [reddit] Add support for v.redd.it and reddit.com (#13847) +* [aparat] Extract all formats (#13887) +* [mixcloud] Fix play info decryption (#13885) ++ [generic] Add support for vzaar embeds (#13876) + + +version 2017.08.09 + +Core +* [utils] Skip missing params in cli_bool_option (#13865) + +Extractors +* [xxxymovies] Fix title extraction (#13868) ++ [nick] Add support for nick.com.pl (#13860) +* [mixcloud] Fix play info decryption (#13867) +* [20min] Fix embeds extraction (#13852) +* [dplayit] Fix extraction (#13851) ++ [niconico] Support videos with multiple formats (#13522) ++ [niconico] Support HTML5-only videos (#13806) + + +version 2017.08.06 + +Core +* Use relative paths for DASH fragments (#12990) + +Extractors +* [pluralsight] Fix format selection +- [mpora] Remove extractor (#13826) ++ [voot] Add support for voot.com (#10255, #11644, #11814, #12350, #13218) +* [vlive:channel] Limit number of videos per page to 100 (#13830) +* [podomatic] Extend URL regular expression (#13827) +* [cinchcast] Extend URL regular expression +* [yandexdisk] Relax URL regular expression (#13824) +* [vidme] Extract DASH and HLS formats +- [teamfour] Remove extractor (#13782) +* [pornhd] Fix extraction (#13783) +* [udemy] Fix subtitles extraction (#13812) +* [mlb] Extend URL regular expression (#13740, #13773) ++ [pbs] Add support for new URL schema (#13801) +* [nrktv] Update API host (#13796) + + +version 2017.07.30.1 + +Core +* [downloader/hls] Use redirect URL as manifest base (#13755) +* [options] Correctly hide login info from debug outputs (#13696) + +Extractors ++ [watchbox] Add support for watchbox.de (#13739) +- [clipfish] Remove extractor ++ [youjizz] Fix extraction (#13744) ++ [generic] Add support for another ooyala embed pattern (#13727) ++ [ard] Add support for lives (#13771) +* [soundcloud] Update client id ++ [soundcloud:trackstation] Add support for track stations (#13733) +* [svtplay] Use geo verification proxy for API request +* [svtplay] Update API URL (#13767) ++ [yandexdisk] Add support for yadi.sk (#13755) ++ [megaphone] Add support for megaphone.fm +* [amcnetworks] Make rating optional (#12453) +* [cloudy] Fix extraction (#13737) ++ [nickru] Add support for nickelodeon.ru +* [mtv] Improve thumbnail extraction +* [nick] Automate geo-restriction bypass (#13711) +* [niconico] Improve error reporting (#13696) + + +version 2017.07.23 + +Core +* [YoutubeDL] Improve default format specification (#13704) +* [YoutubeDL] Do not override id, extractor and extractor_key for + url_transparent entities +* [extractor/common] Fix playlist_from_matches + +Extractors +* [itv] Fix production id extraction (#13671, #13703) +* [vidio] Make duration non fatal and fix typo +* [mtv] Skip missing video parts (#13690) +* [sportbox:embed] Fix extraction ++ [npo] Add support for npo3.nl URLs (#13695) +* [dramafever] Remove video id from title (#13699) ++ [egghead:lesson] Add support for lessons (#6635) +* [funnyordie] Extract more metadata (#13677) +* [youku:show] Fix playlist extraction (#13248) ++ [dispeak] Recognize sevt subdomain (#13276) +* [adn] Improve error reporting (#13663) +* [crunchyroll] Relax series and season regular expression (#13659) ++ [spiegel:article] Add support for nexx iframe embeds (#13029) ++ [nexx:embed] Add support for iframe embeds +* [nexx] Improve JS embed extraction ++ [pearvideo] Add support for pearvideo.com (#13031) + + +version 2017.07.15 + +Core +* [YoutubeDL] Don't expand environment variables in meta fields (#13637) + +Extractors +* [spiegeltv] Delegate extraction to nexx extractor (#13159) ++ [nexx] Add support for nexx.cloud (#10807, #13465) +* [generic] Fix rutube embeds extraction (#13641) +* [karrierevideos] Fix title extraction (#13641) +* [youtube] Don't capture YouTube Red ad for creator meta field (#13621) +* [slideshare] Fix extraction (#13617) ++ [5tv] Add another video URL pattern (#13354, #13606) +* [drtv] Make HLS and HDS extraction non fatal +* [ted] Fix subtitles extraction (#13628, #13629) +* [vine] Make sure the title won't be empty ++ [twitter] Support HLS streams in vmap URLs ++ [periscope] Support pscp.tv URLs in embedded frames +* [twitter] Extract mp4 URLs via mobile API (#12726) +* [niconico] Fix authentication error handling (#12486) +* [giantbomb] Extract m3u8 formats (#13626) ++ [vlive:playlist] Add support for playlists (#13613) + + +version 2017.07.09 + +Core ++ [extractor/common] Add support for AMP tags in
_parse_html5_media_entries ++ [utils] Support attributes with no values in get_elements_by_attribute + +Extractors ++ [dailymail] Add support for embeds ++ [joj] Add support for joj.sk (#13268) +* [abc.net.au:iview] Extract more formats (#13492, #13489) +* [egghead:course] Fix extraction (#6635, #13370) ++ [cjsw] Add support for cjsw.com (#13525) ++ [eagleplatform] Add support for referrer protected videos (#13557) ++ [eagleplatform] Add support for another embed pattern (#13557) +* [veoh] Extend URL regular expression (#13601) +* [npo:live] Fix live stream id extraction (#13568, #13605) +* [googledrive] Fix height extraction (#13603) ++ [dailymotion] Add support for new layout (#13580) +- [yam] Remove extractor +* [xhamster] Extract all formats and fix duration extraction (#13593) ++ [xhamster] Add support for new URL schema (#13593) +* [espn] Extend URL regular expression (#13244, #13549) +* [kaltura] Fix typo in subtitles extraction (#13569) +* [vier] Adapt extraction to redesign (#13575) + + +version 2017.07.02 + +Core +* [extractor/common] Improve _json_ld + +Extractors ++ [thisoldhouse] Add more fallbacks for video id +* [thisoldhouse] Fix video id extraction (#13540, #13541) +* [xfileshare] Extend format regular expression (#13536) +* [ted] Fix extraction (#13535) ++ [tastytrade] Add support for tastytrade.com (#13521) +* [dplayit] Relax video id regular expression (#13524) ++ [generic] Extract more generic metadata (#13527) ++ [bbccouk] Capture and output error message (#13501, #13518) +* [cbsnews] Relax video info regular expression (#13284, #13503) ++ [facebook] Add support for plugin video embeds and multiple embeds (#13493) +* [soundcloud] Switch to https for API requests (#13502) +* [pandatv] Switch to https for API and download URLs ++ [pandatv] Add support for https URLs (#13491) ++ [niconico] Support sp subdomain (#13494) + + +version 2017.06.25 + +Core ++ [adobepass] Add support for DIRECTV NOW (mso ATTOTT) (#13472) +* [YoutubeDL] Skip malformed formats for better extraction robustness + +Extractors ++ [wsj] Add support for barrons.com (#13470) ++ [ign] Add another video id pattern (#13328) ++ [raiplay:live] Add support for live streams (#13414) ++ [redbulltv] Add support for live videos and segments (#13486) ++ [onetpl] Add support for videos embedded via pulsembed (#13482) +* [ooyala] Make more robust +* [ooyala] Skip empty format URLs (#13471, #13476) +* [hgtv.com:show] Fix typo + + +version 2017.06.23 + +Core +* [adobepass] Fix extraction on older python 2.6 + +Extractors +* [youtube] Adapt to new automatic captions rendition (#13467) +* [hgtv.com:show] Relax video config regular expression (#13279, #13461) +* [drtuber] Fix formats extraction (#12058) +* [youporn] Fix upload date extraction +* [youporn] Improve formats extraction +* [youporn] Fix title extraction (#13456) +* [googledrive] Fix formats sorting (#13443) +* [watchindianporn] Fix extraction (#13411, #13415) ++ [vimeo] Add fallback mp4 extension for original format ++ [ruv] Add support for ruv.is (#13396) +* [viu] Fix extraction on older python 2.6 +* [pandora.tv] Fix upload_date extraction (#12846) ++ [asiancrush] Add support for asiancrush.com (#13420) + + +version 2017.06.18 + +Core +* [downloader/common] Use utils.shell_quote for debug command line +* [utils] Use compat_shlex_quote in shell_quote +* [postprocessor/execafterdownload] Encode command line (#13407) +* [compat] Fix compat_shlex_quote on Windows (#5889, #10254) +* [postprocessor/metadatafromtitle] Fix missing optional meta fields processing + 
in --metadata-from-title (#13408) +* [extractor/common] Fix json dumping with --geo-bypass ++ [extractor/common] Improve jwplayer subtitles extraction ++ [extractor/common] Improve jwplayer formats extraction (#13379) + +Extractors +* [polskieradio] Fix extraction (#13392) ++ [xfileshare] Add support for fastvideo.me (#13385) +* [bilibili] Fix extraction of videos with double quotes in titles (#13387) +* [4tube] Fix extraction (#13381, #13382) ++ [disney] Add support for disneychannel.de (#13383) +* [npo] Improve URL regular expression (#13376) ++ [corus] Add support for showcase.ca ++ [corus] Add support for history.ca (#13359) + + +version 2017.06.12 + +Core +* [utils] Handle compat_HTMLParseError in extract_attributes (#13349) ++ [compat] Introduce compat_HTMLParseError +* [utils] Improve unified_timestamp +* [extractor/generic] Ensure format id is unicode string +* [extractor/common] Return unicode string from _match_id ++ [YoutubeDL] Sanitize more fields (#13313) + +Extractors ++ [xfileshare] Add support for rapidvideo.tv (#13348) +* [xfileshare] Modernize and pass Referer ++ [rutv] Add support for testplayer.vgtrk.com (#13347) ++ [newgrounds] Extract more metadata (#13232) ++ [newgrounds:playlist] Add support for playlists (#10611) +* [newgrounds] Improve formats and uploader extraction (#13346) +* [msn] Fix formats extraction +* [turbo] Ensure format id is string +* [sexu] Ensure height is int +* [jove] Ensure comment count is int +* [golem] Ensure format id is string +* [gfycat] Ensure filesize is int +* [foxgay] Ensure height is int +* [flickr] Ensure format id is string +* [sohu] Fix numeric fields +* [safari] Improve authentication detection (#13319) +* [liveleak] Ensure height is int (#13313) +* [streamango] Make title optional (#13292) +* [rtlnl] Improve URL regular expression (#13295) +* [tvplayer] Fix extraction (#13291) + + +version 2017.06.05 + +Core +* [YoutubeDL] Don't emit ANSI escape codes on Windows (#13270) + +Extractors ++ [bandcamp:weekly] Add support for bandcamp weekly (#12758) +* [pornhub:playlist] Fix extraction (#13281) +- [godtv] Remove extractor (#13175) +* [safari] Fix typo (#13252) +* [youtube] Improve chapters extraction (#13247) +* [1tv] Lower preference for HTTP formats (#13246) +* [francetv] Relax URL regular expression +* [drbonanza] Fix extraction (#13231) +* [packtpub] Fix authentication (#13240) + + +version 2017.05.29 + +Extractors +* [youtube] Fix DASH MPD extraction for videos with non-encrypted format URLs + (#13211) +* [xhamster] Fix uploader and like/dislike count extraction (#13216) ++ [xhamster] Extract categories (#11728) ++ [abcnews] Add support for embed URLs (#12851) +* [gaskrank] Fix extraction (#12493) +* [medialaan] Fix videos with missing videoUrl (#12774) +* [dvtv] Fix playlist support ++ [dvtv] Add support for DASH and HLS formats (#3063) ++ [beam:vod] Add support for beam.pro/mixer.com VODs (#13032) +* [cbsinteractive] Relax URL regular expression (#13213) +* [adn] Fix formats extraction ++ [youku] Extract more metadata (#10433) +* [cbsnews] Fix extraction (#13205) + + +version 2017.05.26 + +Core ++ [utils] strip_jsonp() can recognize more patterns +* [postprocessor/ffmpeg] Fix metadata filename handling on Python 2 (#13182) + +Extractors ++ [youtube] DASH MPDs with cipher signatures are recognized now (#11381) ++ [bbc] Add support for authentication +* [tudou] Merge into youku extractor (#12214) +* [youku:show] Fix extraction +* [youku] Fix extraction (#13191) +* [udemy] Fix extraction for outputs' format entries without URL
(#13192) +* [vimeo] Fix formats' sorting (#13189) +* [cbsnews] Fix extraction for 60 Minutes videos (#12861) + + +version 2017.05.23 + +Core ++ [downloader/external] Pass -loglevel to ffmpeg downloader (#13183) ++ [adobepass] Add support for Bright House Networks (#13149) + +Extractors ++ [streamcz] Add support for subtitles (#13174) +* [youtube] Fix DASH manifest signature decryption (#8944, #13156) +* [toggle] Relax URL regular expression (#13172) +* [toypics] Fix extraction (#13077) +* [njpwworld] Fix extraction (#13162, #13169) ++ [hitbox] Add support for smashcast.tv (#13154) +* [mitele] Update app key regular expression (#13158) + + +version 2017.05.18.1 + +Core +* [jsinterp] Fix typo and cleanup regular expressions (#13134) + + +version 2017.05.18 + +Core ++ [jsinterp] Add support for quoted names and indexers (#13123, #13124, #13125, + #13126, #13128, #13129, #13130, #13131, #13132) ++ [extractor/common] Add support for schemeless URLs in _extract_wowza_formats + (#13088, #13092) ++ [utils] Recognize more audio codecs (#13081) + +Extractors ++ [vier] Extract more metadata (#12539) +* [vier] Improve extraction (#12801) + + Add support for authentication + * Bypass authentication when no credentials provided + * Improve extraction robustness +* [dailymail] Fix sources extraction (#13057) +* [dailymotion] Extend URL regular expression (#13079) + + +version 2017.05.14 + +Core ++ [extractor/common] Respect Width and Height attributes in ISM manifests ++ [postprocessor/metadatafromtitle] Add support for regular expression syntax in + --metadata-from-title (#13065) + +Extractors ++ [mediaset] Add support for video.mediaset.it (#12708, #12964) +* [orf:radio] Fix extraction (#11643, #12926) +* [aljazeera] Extend URL regular expression (#13053) +* [imdb] Relax URL regular expression (#13056) ++ [francetv] Add support for mobile.france.tv (#13068) ++ [upskill] Add support for upskillcourses.com (#13043) +* [thescene] Fix extraction (#13061) +* [condenast] Improve embed support +* [liveleak] Fix extraction (#12053) ++ [douyu] Support Douyu shows (#12228) +* [myspace] Improve URL regular expression (#13040) +* [adultswim] Use desktop platform in assets URL (#13041) + + +version 2017.05.09 + +Core +* [YoutubeDL] Force --restrict-filenames when no locale is set on all python + versions (#13027) + +Extractors +* [francetv] Adapt to site redesign (#13034) ++ [packtpub] Add support for authentication (#12622) +* [drtv] Lower preference for SignLanguage formats (#13013, #13016) ++ [cspan] Add support for brightcove live embeds (#13028) +* [vrv] Extract DASH formats and subtitles +* [funimation] Fix authentication (#13021) +* [adultswim] Fix extraction (#8640, #10950, #11042, #12121) + + Add support for Adobe Pass authentication + + Add support for live streams + + Add support for show pages +* [turner] Extract thumbnail, is_live and strip description ++ [nonktube] Add support for nonktube.com (#8647, #13024) ++ [nuevo] Pass headers to _extract_nuevo +* [nbc] Improve extraction (#12364) + + +version 2017.05.07 + +Core +* [extractor/common] Fix typo in _extract_akamai_formats ++ [postprocessor/ffmpeg] Embed chapters into media file with --add-metadata ++ [extractor/common] Introduce chapters meta field + +Extractors +* [youtube] Fix authentication (#12820, #12927, #12973, #12992, #12993, #12995, + #13003) +* [bilibili] Fix video downloading (#13001) +* [rmcdecouverte] Fix extraction (#12937) +* [theplatform] Extract chapters +* [bandcamp] Fix thumbnail extraction (#12980) +* [pornhub] Extend URL regular
expression (#12996) ++ [youtube] Extract chapters ++ [nrk] Extract chapters ++ [vice] Add support for ooyala embeds in article pages ++ [vice] Support vice articles (#12968) +* [vice] Fix extraction for non en_us videos (#12967) +* [gdcvault] Fix extraction for some videos (#12733) +* [pbs] Improve multipart video support (#12981) +* [laola1tv] Fix extraction (#12880) ++ [cda] Support birthday verification (#12789) +* [leeco] Fix extraction (#12974) ++ [pbs] Extract chapters +* [amp] Improve thumbnail and subtitles extraction +* [foxsports] Fix extraction (#12945) +- [coub] Remove comment count extraction (#12941) + + +version 2017.05.01 + +Core ++ [extractor/common] Extract view count from JSON-LD +* [utils] Improve unified_timestamp ++ [utils] Add video/mp2t to mimetype2ext +* [downloader/external] Properly handle live stream downloading cancellation + (#8932) ++ [utils] Add support for unicode whitespace in clean_html on python 2 (#12906) + +Extractors +* [infoq] Make audio format extraction non fatal (#12938) +* [brightcove] Allow whitespace around attribute names in embedded code ++ [zaq1] Add support for zaq1.pl (#12693) ++ [xvideos] Extract duration (#12828) +* [vevo] Fix extraction (#12879) ++ [noovo] Add support for noovo.ca (#12792) ++ [washingtonpost] Add support for embeds (#12699) +* [yandexmusic:playlist] Fix extraction for python 3 (#12888) +* [anvato] Improve extraction (#12913) + * Promote to regular shortcut based extractor + * Add mcp to access key mapping table + * Add support for embeds extraction + * Add support for anvato embeds in generic extractor +* [xtube] Fix extraction for older FLV videos (#12734) +* [tvplayer] Fix extraction (#12908) + + +version 2017.04.28 + +Core ++ [adobepass] Use geo verification headers for all requests +- [downloader/fragment] Remove assert for resume_len when no fragments + downloaded ++ [extractor/common] Add manifest_url for explicit group rendition formats +* [extractor/common] Fix manifest_url for m3u8 formats +- [extractor/common] Don't list master m3u8 playlists in format list (#12832) + +Extractors +* [aenetworks] Fix extraction for shows with single season ++ [go] Add support for Disney, DisneyJunior and DisneyXD show pages +* [youtube] Recognize new locale-based player URLs (#12885) ++ [streamable] Add support for new embedded URL schema (#12844) +* [arte:+7] Relax URL regular expression (#12837) + + +version 2017.04.26 + +Core +* Introduce --keep-fragments for keeping fragments of fragmented download + on disk after download is finished +* [YoutubeDL] Fix output template for missing timestamp (#12796) +* [socks] Handle cases where credentials are required but missing +* [extractor/common] Improve HLS extraction (#12211) + * Extract m3u8 parsing to separate method + * Improve rendition groups extraction + * Build stream name according to stream GROUP-ID + * Ignore reference to AUDIO group without URI when stream has no CODECS + * Use float for scaled tbr in _parse_m3u8_formats +* [utils] Add support for TTML styles in dfxp2srt +* [downloader/hls] No need to download keys for fragments that have been + already downloaded +* [downloader/fragment] Improve fragment downloading + * Resume immediately + * Don't concatenate fragments and decrypt them on every resume + * Optimize disk storage usage, don't store intermediate fragments on disk + * Store bookkeeping download state file ++ [extractor/common] Add support for multiple getters in try_get ++ [extractor/common] Add support for video of WebPage context in _json_ld + (#12778) ++ [extractor/common] Relax JWPlayer regular expression and remove + duplicate URLs (#12768)
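The [downloader/fragment] bullets just above boil down to persisting bookkeeping (the index of the first incomplete fragment) in a state file, so an interrupted download resumes immediately instead of re-reading what is already on disk. A rough sketch under those assumptions; download_fragments, fetch and the .ytdl side-file name are illustrative, not the real downloader/fragment.py:

    import json
    import os

    def download_fragments(fragment_urls, out_path, fetch):
        # fetch: callable taking a fragment URL and returning its bytes.
        state_path = out_path + '.ytdl'
        index = 0
        if os.path.exists(state_path):
            with open(state_path) as f:
                index = json.load(f)['fragment_index']  # resume right here
        with open(out_path, 'ab') as out:
            for i in range(index, len(fragment_urls)):
                out.write(fetch(fragment_urls[i]))
                out.flush()
                # Commit bookkeeping after every fragment so the next run
                # restarts at the first incomplete fragment
                with open(state_path, 'w') as f:
                    json.dump({'fragment_index': i + 1}, f)
        if os.path.exists(state_path):
            os.remove(state_path)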
+ +Extractors +* [iqiyi] Fix extraction of Yule videos +* [vidio] Improve extraction and sort formats ++ [brightcove] Match only video elements with data-video-id attribute +* [iqiyi] Fix playlist detection (#12504) +- [azubu] Remove extractor (#12813) +* [porn91] Fix extraction (#12814) +* [vidzi] Fix extraction (#12793) ++ [amp] Extract error message (#12795) ++ [xfileshare] Add support for gorillavid.com and daclips.com (#12776) +* [instagram] Fix extraction (#12777) ++ [generic] Support Brightcove videos in <iframe> ', + webpage, 'embed url')) + if VKIE.suitable(embed_url): + return self.url_result(embed_url, VKIE.ie_key(), video_id) + + self._request_webpage( + HEADRequest(embed_url), video_id, headers={'Referer': url}) + video_id, sig, _, access_token = self._get_cookies(embed_url)['video_ext'].value.split('%3A') + item = self._download_json( + 'https://api.vk.com/method/video.get', video_id, + headers={'User-Agent': 'okhttp/3.4.1'}, query={ + 'access_token': access_token, + 'sig': sig, + 'v': 5.44, + 'videos': video_id, + })['response']['items'][0] + title = item['title'] + + formats = [] + for f_id, f_url in item.get('files', {}).items(): + if f_id == 'external': + return self.url_result(f_url) + ext, height = f_id.split('_') + formats.append({ + 'format_id': height + 'p', + 'url': f_url, + 'height': int_or_none(height), + 'ext': ext, + }) + self._sort_formats(formats) + + thumbnails = [] + for k, v in item.items(): + if k.startswith('photo_') and v: + width = k.replace('photo_', '') + thumbnails.append({ + 'id': width, + 'url': v, + 'width': int_or_none(width), + }) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'comment_count': int_or_none(item.get('comments')), + 'description': item.get('description'), + 'duration': int_or_none(item.get('duration')), + 'thumbnails': thumbnails, + 'timestamp': int_or_none(item.get('date')), + 'uploader': item.get('owner_id'), + 'view_count': int_or_none(item.get('views')), + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/bitchute.py youtube-dl-2019.05.20/youtube_dl/extractor/bitchute.py --- youtube-dl-2016.02.22/youtube_dl/extractor/bitchute.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/bitchute.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,135 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools +import re + +from .common import InfoExtractor +from ..utils import ( + orderedSet, + urlencode_postdata, +) + + +class BitChuteIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?:video|embed|torrent/[^/]+)/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.bitchute.com/video/szoMrox2JEI/', + 'md5': '66c4a70e6bfc40dcb6be3eb1d74939eb', + 'info_dict': { + 'id': 'szoMrox2JEI', + 'ext': 'mp4', + 'title': 'Fuck bitches get money', + 'description': 'md5:3f21f6fb5b1d17c3dee9cf6b5fe60b3a', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Victoria X Rave', + }, + }, { + 'url': 'https://www.bitchute.com/embed/lbb5G1hjPhw/', + 'only_matching': True, + }, { + 'url': 'https://www.bitchute.com/torrent/Zee5BE49045h/szoMrox2JEI.webtorrent', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'https://www.bitchute.com/video/%s' % video_id, video_id, headers={ + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko)
Chrome/69.0.3497.57 Safari/537.36', + }) + + title = self._html_search_regex( + (r'<[^>]+\bid=["\']video-title[^>]+>([^<]+)', r'<title>([^<]+)'), + webpage, 'title', default=None) or self._html_search_meta( + 'description', webpage, 'title', + default=None) or self._og_search_description(webpage) + + format_urls = [] + for mobj in re.finditer( + r'addWebSeed\s*\(\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage): + format_urls.append(mobj.group('url')) + format_urls.extend(re.findall(r'as=(https?://[^&"\']+)', webpage)) + + formats = [ + {'url': format_url} + for format_url in orderedSet(format_urls)] + + if not formats: + formats = self._parse_html5_media_entries( + url, webpage, video_id)[0]['formats'] + + self._check_formats(formats, video_id) + self._sort_formats(formats) + + description = self._html_search_regex( + r'(?s)<div\b[^>]+\bclass=["\']full hidden[^>]+>(.+?)</div>', + webpage, 'description', fatal=False) + thumbnail = self._og_search_thumbnail( + webpage, default=None) or self._html_search_meta( + 'twitter:image:src', webpage, 'thumbnail') + uploader = self._html_search_regex( + (r'(?s)<div class=["\']channel-banner.*?<p\b[^>]+\bclass=["\']name[^>]+>(.+?)</p>', + r'(?s)<p\b[^>]+\bclass=["\']video-author[^>]+>(.+?)</p>'), + webpage, 'uploader', fatal=False) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'formats': formats, + } + + +class BitChuteChannelIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bitchute\.com/channel/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://www.bitchute.com/channel/victoriaxrave/', + 'playlist_mincount': 185, + 'info_dict': { + 'id': 'victoriaxrave', + }, + } + + _TOKEN = 'zyG6tQcGPE5swyAEFLqKUwMuMMuF6IO2DZ6ZDQjGfsL0e4dcTLwqkTTul05Jdve7' + + def _entries(self, channel_id): + channel_url = 'https://www.bitchute.com/channel/%s/' % channel_id + offset = 0 + for page_num in itertools.count(1): + data = self._download_json( + '%sextend/' % channel_url, channel_id, + 'Downloading channel page %d' % page_num, + data=urlencode_postdata({ + 'csrfmiddlewaretoken': self._TOKEN, + 'name': '', + 'offset': offset, + }), headers={ + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + 'Referer': channel_url, + 'X-Requested-With': 'XMLHttpRequest', + 'Cookie': 'csrftoken=%s' % self._TOKEN, + }) + if data.get('success') is False: + break + html = data.get('html') + if not html: + break + video_ids = re.findall( + r'class=["\']channel-videos-image-container[^>]+>\s*<a\b[^>]+\bhref=["\']/video/([^"\'/]+)', + html) + if not video_ids: + break + offset += len(video_ids) + for video_id in video_ids: + yield self.url_result( + 'https://www.bitchute.com/video/%s' % video_id, + ie=BitChuteIE.ie_key(), video_id=video_id) + + def _real_extract(self, url): + channel_id = self._match_id(url) + return self.playlist_result( + self._entries(channel_id), playlist_id=channel_id) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/bleacherreport.py youtube-dl-2019.05.20/youtube_dl/extractor/bleacherreport.py --- youtube-dl-2016.02.22/youtube_dl/extractor/bleacherreport.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/bleacherreport.py 2019-06-07 18:49:07.000000000 +0000 @@ -28,14 +28,14 @@ 'add_ie': ['Ooyala'], }, { 'url': 'http://bleacherreport.com/articles/2586817-aussie-golfers-get-fright-of-their-lives-after-being-chased-by-angry-kangaroo', - 'md5': 'af5f90dc9c7ba1c19d0a3eac806bbf50', + 'md5': '6a5cd403418c7b01719248ca97fb0692', 'info_dict': { 'id':
'2586817', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'Aussie Golfers Get Fright of Their Lives After Being Chased by Angry Kangaroo', 'timestamp': 1446839961, 'uploader': 'Sean Fay', - 'description': 'md5:825e94e0f3521df52fa83b2ed198fa20', + 'description': 'md5:b1601e2314c4d8eec23b6eafe086a757', 'uploader_id': 6466954, 'upload_date': '20151011', }, @@ -90,7 +90,7 @@ _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P<id>[0-9a-f-]{36})' _TESTS = [{ 'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1', - 'md5': '8c2c12e3af7805152675446c905d159b', + 'md5': '2e4b0a997f9228ffa31fada5c53d1ed1', 'info_dict': { 'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1', 'ext': 'flv', diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/blinkx.py youtube-dl-2019.05.20/youtube_dl/extractor/blinkx.py --- youtube-dl-2016.02.22/youtube_dl/extractor/blinkx.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/blinkx.py 2019-06-07 18:49:07.000000000 +0000 @@ -32,8 +32,8 @@ video_id = self._match_id(url) display_id = video_id[:8] - api_url = ('https://apib4.blinkx.com/api.php?action=play_video&' + - 'video=%s' % video_id) + api_url = ('https://apib4.blinkx.com/api.php?action=play_video&' + + 'video=%s' % video_id) data_json = self._download_webpage(api_url, display_id) data = json.loads(data_json)['api']['results'][0] duration = None diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/bloomberg.py youtube-dl-2019.05.20/youtube_dl/extractor/bloomberg.py --- youtube-dl-2016.02.22/youtube_dl/extractor/bloomberg.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/bloomberg.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import re @@ -17,6 +18,25 @@ 'title': 'Shah\'s Presentation on Foreign-Exchange Strategies', 'description': 'md5:a8ba0302912d03d246979735c17d2761', }, + 'params': { + 'format': 'best[format_id^=hds]', + }, + }, { + # video ID in BPlayer(...) 
+ 'url': 'http://www.bloomberg.com/features/2016-hello-world-new-zealand/', + 'info_dict': { + 'id': '938c7e72-3f25-4ddb-8b85-a9be731baa74', + 'ext': 'flv', + 'title': 'Meet the Real-Life Tech Wizards of Middle Earth', + 'description': 'Hello World, Episode 1: New Zealand’s freaky AI babies, robot exoskeletons, and a virtual you.', + }, + 'params': { + 'format': 'best[format_id^=hds]', + }, + }, { + # data-bmmrid= + 'url': 'https://www.bloomberg.com/politics/articles/2017-02-08/le-pen-aide-briefed-french-central-banker-on-plan-to-print-money', + 'only_matching': True, }, { 'url': 'http://www.bloomberg.com/news/articles/2015-11-12/five-strange-things-that-have-been-happening-in-financial-markets', 'only_matching': True, @@ -29,8 +49,14 @@ name = self._match_id(url) webpage = self._download_webpage(url, name) video_id = self._search_regex( - r'["\']bmmrId["\']\s*:\s*(["\'])(?P<url>.+?)\1', - webpage, 'id', group='url') + (r'["\']bmmrId["\']\s*:\s*(["\'])(?P<id>(?:(?!\1).)+)\1', + r'videoId\s*:\s*(["\'])(?P<id>(?:(?!\1).)+)\1', + r'data-bmmrid=(["\'])(?P<id>(?:(?!\1).)+)\1'), + webpage, 'id', group='id', default=None) + if not video_id: + bplayer_data = self._parse_json(self._search_regex( + r'BPlayer\(null,\s*({[^;]+})\);', webpage, 'id'), name) + video_id = bplayer_data['id'] title = re.sub(': Video$', '', self._og_search_title(webpage)) embed_info = self._download_json( diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/bokecc.py youtube-dl-2019.05.20/youtube_dl/extractor/bokecc.py --- youtube-dl-2016.02.22/youtube_dl/extractor/bokecc.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/bokecc.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,60 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_parse_qs +from ..utils import ExtractorError + + +class BokeCCBaseIE(InfoExtractor): + def _extract_bokecc_formats(self, webpage, video_id, format_id=None): + player_params_str = self._html_search_regex( + r'<(?:script|embed)[^>]+src="http://p\.bokecc\.com/player\?([^"]+)', + webpage, 'player params') + + player_params = compat_parse_qs(player_params_str) + + info_xml = self._download_xml( + 'http://p.bokecc.com/servlet/playinfo?uid=%s&vid=%s&m=1' % ( + player_params['siteid'][0], player_params['vid'][0]), video_id) + + formats = [{ + 'format_id': format_id, + 'url': quality.find('./copy').attrib['playurl'], + 'preference': int(quality.attrib['value']), + } for quality in info_xml.findall('./video/quality')] + + self._sort_formats(formats) + + return formats + + +class BokeCCIE(BokeCCBaseIE): + _IE_DESC = 'CC视频' + _VALID_URL = r'https?://union\.bokecc\.com/playvideo\.bo\?(?P<query>.*)' + + _TESTS = [{ + 'url': 'http://union.bokecc.com/playvideo.bo?vid=E44D40C15E65EA30&uid=CD0C5D3C8614B28B', + 'info_dict': { + 'id': 'CD0C5D3C8614B28B_E44D40C15E65EA30', + 'ext': 'flv', + 'title': 'BokeCC Video', + }, + }] + + def _real_extract(self, url): + qs = compat_parse_qs(re.match(self._VALID_URL, url).group('query')) + if not qs.get('vid') or not qs.get('uid'): + raise ExtractorError('Invalid URL', expected=True) + + video_id = '%s_%s' % (qs['uid'][0], qs['vid'][0]) + + webpage = self._download_webpage(url, video_id) + + return { + 'id': video_id, + 'title': 'BokeCC Video', # no title provided in the webpage + 'formats': self._extract_bokecc_formats(webpage, video_id), + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/bostonglobe.py 
youtube-dl-2019.05.20/youtube_dl/extractor/bostonglobe.py --- youtube-dl-2016.02.22/youtube_dl/extractor/bostonglobe.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/bostonglobe.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,72 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + +from ..utils import ( + extract_attributes, +) + + +class BostonGlobeIE(InfoExtractor): + _VALID_URL = r'(?i)https?://(?:www\.)?bostonglobe\.com/.*/(?P<id>[^/]+)/\w+(?:\.html)?' + _TESTS = [ + { + 'url': 'http://www.bostonglobe.com/metro/2017/02/11/tree-finally-succumbs-disease-leaving-hole-neighborhood/h1b4lviqzMTIn9sVy8F3gP/story.html', + 'md5': '0a62181079c85c2d2b618c9a738aedaf', + 'info_dict': { + 'title': 'A tree finally succumbs to disease, leaving a hole in a neighborhood', + 'id': '5320421710001', + 'ext': 'mp4', + 'description': 'It arrived as a sapling when the Back Bay was in its infancy, a spindly American elm tamped down into a square of dirt cut into the brick sidewalk of 1880s Marlborough Street, no higher than the first bay window of the new brownstone behind it.', + 'timestamp': 1486877593, + 'upload_date': '20170212', + 'uploader_id': '245991542', + }, + }, + { + # Embedded youtube video; we hand it off to the Generic extractor. + 'url': 'https://www.bostonglobe.com/lifestyle/names/2017/02/17/does-ben-affleck-play-matt-damon-favorite-version-batman/ruqkc9VxKBYmh5txn1XhSI/story.html', + 'md5': '582b40327089d5c0c949b3c54b13c24b', + 'info_dict': { + 'title': "Who Is Matt Damon's Favorite Batman?", + 'id': 'ZW1QCnlA6Qc', + 'ext': 'mp4', + 'upload_date': '20170217', + 'description': 'md5:3b3dccb9375867e0b4d527ed87d307cb', + 'uploader': 'The Late Late Show with James Corden', + 'uploader_id': 'TheLateLateShow', + }, + 'expected_warnings': ['404'], + }, + ] + + def _real_extract(self, url): + page_id = self._match_id(url) + webpage = self._download_webpage(url, page_id) + + page_title = self._og_search_title(webpage, default=None) + + # <video data-brightcove-video-id="5320421710001" data-account="245991542" data-player="SJWAiyYWg" data-embed="default" class="video-js" controls itemscope itemtype="http://schema.org/VideoObject"> + entries = [] + for video in re.findall(r'(?i)(<video[^>]+>)', webpage): + attrs = extract_attributes(video) + + video_id = attrs.get('data-brightcove-video-id') + account_id = attrs.get('data-account') + player_id = attrs.get('data-player') + embed = attrs.get('data-embed') + + if video_id and account_id and player_id and embed: + entries.append( + 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' + % (account_id, player_id, embed, video_id)) + + if len(entries) == 0: + return self.url_result(url, 'Generic') + elif len(entries) == 1: + return self.url_result(entries[0], 'BrightcoveNew') + else: + return self.playlist_from_matches(entries, page_id, page_title, ie='BrightcoveNew') diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/bpb.py youtube-dl-2019.05.20/youtube_dl/extractor/bpb.py --- youtube-dl-2016.02.22/youtube_dl/extractor/bpb.py 2016-02-01 10:41:30.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/bpb.py 2019-06-07 18:49:07.000000000 +0000 @@ -12,7 +12,7 @@ class BpbIE(InfoExtractor): IE_DESC = 'Bundeszentrale für politische Bildung' - _VALID_URL = r'http://www\.bpb\.de/mediathek/(?P<id>[0-9]+)/' + _VALID_URL = r'https?://(?:www\.)?bpb\.de/mediathek/(?P<id>[0-9]+)/' _TEST = { 'url': 
'http://www.bpb.de/mediathek/297/joachim-gauck-zu-1989-und-die-erinnerung-an-die-ddr', @@ -33,13 +33,18 @@ title = self._html_search_regex( r'<h2 class="white">(.*?)</h2>', webpage, 'title') video_info_dicts = re.findall( - r"({\s*src:\s*'http://film\.bpb\.de/[^}]+})", webpage) + r"({\s*src\s*:\s*'https?://film\.bpb\.de/[^}]+})", webpage) formats = [] for video_info in video_info_dicts: - video_info = self._parse_json(video_info, video_id, transform_source=js_to_json) - quality = video_info['quality'] - video_url = video_info['src'] + video_info = self._parse_json( + video_info, video_id, transform_source=js_to_json, fatal=False) + if not video_info: + continue + video_url = video_info.get('src') + if not video_url: + continue + quality = 'high' if '_high' in video_url else 'low' formats.append({ 'url': video_url, 'preference': 10 if quality == 'high' else 0, diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/bravotv.py youtube-dl-2019.05.20/youtube_dl/extractor/bravotv.py --- youtube-dl-2016.02.22/youtube_dl/extractor/bravotv.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/bravotv.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,84 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .adobepass import AdobePassIE +from ..utils import ( + smuggle_url, + update_url_query, + int_or_none, +) + + +class BravoTVIE(AdobePassIE): + _VALID_URL = r'https?://(?:www\.)?bravotv\.com/(?:[^/]+/)+(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'https://www.bravotv.com/top-chef/season-16/episode-15/videos/the-top-chef-season-16-winner-is', + 'md5': 'e34684cfea2a96cd2ee1ef3a60909de9', + 'info_dict': { + 'id': 'epL0pmK1kQlT', + 'ext': 'mp4', + 'title': 'The Top Chef Season 16 Winner Is...', + 'description': 'Find out who takes the title of Top Chef!', + 'uploader': 'NBCU-BRAV', + 'upload_date': '20190314', + 'timestamp': 1552591860, + } + }, { + 'url': 'http://www.bravotv.com/below-deck/season-3/ep-14-reunion-part-1', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + settings = self._parse_json(self._search_regex( + r'<script[^>]+data-drupal-selector="drupal-settings-json"[^>]*>({.+?})</script>', webpage, 'drupal settings'), + display_id) + info = {} + query = { + 'mbr': 'true', + } + account_pid, release_pid = [None] * 2 + tve = settings.get('ls_tve') + if tve: + query['manifest'] = 'm3u' + mobj = re.search(r'<[^>]+id="pdk-player"[^>]+data-url=["\']?(?:https?:)?//player\.theplatform\.com/p/([^/]+)/(?:[^/]+/)*select/([^?#&"\']+)', webpage) + if mobj: + account_pid, tp_path = mobj.groups() + release_pid = tp_path.strip('/').split('/')[-1] + else: + account_pid = 'HNK2IC' + tp_path = release_pid = tve['release_pid'] + if tve.get('entitlement') == 'auth': + adobe_pass = settings.get('tve_adobe_auth', {}) + resource = self._get_mvpd_resource( + adobe_pass.get('adobePassResourceId', 'bravo'), + tve['title'], release_pid, tve.get('rating')) + query['auth'] = self._extract_mvpd_auth( + url, release_pid, adobe_pass.get('adobePassRequestorId', 'bravo'), resource) + else: + shared_playlist = settings['ls_playlist'] + account_pid = shared_playlist['account_pid'] + metadata = shared_playlist['video_metadata'][shared_playlist['default_clip']] + tp_path = release_pid = metadata.get('release_pid') + if not release_pid: + release_pid = metadata['guid'] + tp_path = 'media/guid/2140479951/' + release_pid + info.update({ + 'title': metadata['title'], + 
'description': metadata.get('description'), + 'season_number': int_or_none(metadata.get('season_num')), + 'episode_number': int_or_none(metadata.get('episode_num')), + }) + query['switch'] = 'progressive' + info.update({ + '_type': 'url_transparent', + 'id': release_pid, + 'url': smuggle_url(update_url_query( + 'http://link.theplatform.com/s/%s/%s' % (account_pid, tp_path), + query), {'force_smil_url': True}), + 'ie_key': 'ThePlatform', + }) + return info diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/breakcom.py youtube-dl-2019.05.20/youtube_dl/extractor/breakcom.py --- youtube-dl-2016.02.22/youtube_dl/extractor/breakcom.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/breakcom.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,17 +1,17 @@ from __future__ import unicode_literals import re -import json from .common import InfoExtractor +from .youtube import YoutubeIE from ..utils import ( int_or_none, - parse_age_limit, + url_or_none, ) class BreakIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?break\.com/video/(?:[^/]+/)*.+-(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?break\.com/video/(?P<display_id>[^/]+?)(?:-(?P<id>\d+))?(?:[/?#&]|$)' _TESTS = [{ 'url': 'http://www.break.com/video/when-girls-act-like-guys-2468056', 'info_dict': { @@ -19,6 +19,21 @@ 'ext': 'mp4', 'title': 'When Girls Act Like D-Bags', 'age_limit': 13, + }, + }, { + # youtube embed + 'url': 'http://www.break.com/video/someone-forgot-boat-brakes-work', + 'info_dict': { + 'id': 'RrrDLdeL2HQ', + 'ext': 'mp4', + 'title': 'Whale Watching Boat Crashing Into San Diego Dock', + 'description': 'md5:afc1b2772f0a8468be51dd80eb021069', + 'upload_date': '20160331', + 'uploader': 'Steve Holden', + 'uploader_id': 'sdholden07', + }, + 'params': { + 'skip_download': True, } }, { 'url': 'http://www.break.com/video/ugc/baby-flex-2773063', @@ -26,39 +41,51 @@ }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage( - 'http://www.break.com/embed/%s' % video_id, video_id) - info = json.loads(self._search_regex( - r'var embedVars = ({.*})\s*?</script>', - webpage, 'info json', flags=re.DOTALL)) - - youtube_id = info.get('youtubeId') - if youtube_id: - return self.url_result(youtube_id, 'Youtube') - - formats = [{ - 'url': media['uri'] + '?' 
+ info['AuthToken'], - 'tbr': media['bitRate'], - 'width': media['width'], - 'height': media['height'], - } for media in info['media'] if media.get('mediaPurpose') == 'play'] + display_id, video_id = re.match(self._VALID_URL, url).groups() - if not formats: + webpage = self._download_webpage(url, display_id) + + youtube_url = YoutubeIE._extract_url(webpage) + if youtube_url: + return self.url_result(youtube_url, ie=YoutubeIE.ie_key()) + + content = self._parse_json( + self._search_regex( + r'(?s)content["\']\s*:\s*(\[.+?\])\s*[,\n]', webpage, + 'content'), + display_id) + + formats = [] + for video in content: + video_url = url_or_none(video.get('url')) + if not video_url: + continue + bitrate = int_or_none(self._search_regex( + r'(\d+)_kbps', video_url, 'tbr', default=None)) formats.append({ - 'url': info['videoUri'] + 'url': video_url, + 'format_id': 'http-%d' % bitrate if bitrate else 'http', + 'tbr': bitrate, }) - self._sort_formats(formats) - duration = int_or_none(info.get('videoLengthInSeconds')) - age_limit = parse_age_limit(info.get('audienceRating')) + title = self._search_regex( + (r'title["\']\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1', + r'<h1[^>]*>(?P<value>[^<]+)'), webpage, 'title', group='value') + + def get(key, name): + return int_or_none(self._search_regex( + r'%s["\']\s*:\s*["\'](\d+)' % key, webpage, name, + default=None)) + + age_limit = get('ratings', 'age limit') + video_id = video_id or get('pid', 'video id') or display_id return { 'id': video_id, - 'title': info['contentName'], - 'thumbnail': info['thumbUri'], - 'duration': duration, + 'display_id': display_id, + 'title': title, + 'thumbnail': self._og_search_thumbnail(webpage), 'age_limit': age_limit, 'formats': formats, } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/brightcove.py youtube-dl-2019.05.20/youtube_dl/extractor/brightcove.py --- youtube-dl-2016.02.22/youtube_dl/extractor/brightcove.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/brightcove.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,38 +1,44 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals -import re +import base64 import json +import re +import struct from .common import InfoExtractor +from .adobepass import AdobePassIE from ..compat import ( compat_etree_fromstring, compat_parse_qs, compat_str, - compat_urllib_parse, compat_urllib_parse_urlparse, compat_urlparse, compat_xml_parse_error, + compat_HTTPError, ) from ..utils import ( determine_ext, ExtractorError, + extract_attributes, find_xpath_attr, fix_xml_ampersands, float_or_none, js_to_json, int_or_none, parse_iso8601, - sanitized_Request, unescapeHTML, unsmuggle_url, + update_url_query, + clean_html, + mimetype2ext, ) class BrightcoveLegacyIE(InfoExtractor): IE_NAME = 'brightcove:legacy' _VALID_URL = r'(?:https?://.*brightcove\.com/(services|viewer).*?\?|brightcove:)(?P<query>.*)' - _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s' + _FEDERATED_URL = 'http://c.brightcove.com/services/viewer/htmlFederated' _TESTS = [ { @@ -46,6 +52,9 @@ 'title': 'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”', 'uploader': '8TV', 'description': 'md5:a950cc4285c43e44d763d036710cd9cd', + 'timestamp': 1368213670, + 'upload_date': '20130510', + 'uploader_id': '1589608506001', } }, { @@ -57,6 +66,9 @@ 'title': 'JVMLS 2012: Arrays 2.0 - Opportunities and Challenges', 'description': 'John Rose speaks at the JVM Language Summit, August 1, 2012.', 'uploader': 'Oracle', + 
'timestamp': 1344975024, + 'upload_date': '20120814', + 'uploader_id': '1460825906', }, }, { @@ -68,6 +80,9 @@ 'title': 'This Bracelet Acts as a Personal Thermostat', 'description': 'md5:547b78c64f4112766ccf4e151c20b6a0', 'uploader': 'Mashable', + 'timestamp': 1382041798, + 'upload_date': '20131017', + 'uploader_id': '1130468786001', }, }, { @@ -81,22 +96,27 @@ 'description': 'md5:363109c02998fee92ec02211bd8000df', 'uploader': 'National Ballet of Canada', }, + 'skip': 'Video gone', }, { # test flv videos served by akamaihd.net # From http://www.redbull.com/en/bike/stories/1331655643987/replay-uci-dh-world-cup-2014-from-fort-william - 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?%40videoPlayer=ref%3ABC2996102916001&linkBaseURL=http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fvideos%2F1331655630249%2Freplay-uci-fort-william-2014-dh&playerKey=AQ%7E%7E%2CAAAApYJ7UqE%7E%2Cxqr_zXk0I-zzNndy8NlHogrCb5QdyZRf&playerID=1398061561001#__youtubedl_smuggle=%7B%22Referer%22%3A+%22http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fstories%2F1331655643987%2Freplay-uci-dh-world-cup-2014-from-fort-william%22%7D', + 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?%40videoPlayer=ref%3Aevent-stream-356&linkBaseURL=http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fvideos%2F1331655630249%2Freplay-uci-fort-william-2014-dh&playerKey=AQ%7E%7E%2CAAAApYJ7UqE%7E%2Cxqr_zXk0I-zzNndy8NlHogrCb5QdyZRf&playerID=1398061561001#__youtubedl_smuggle=%7B%22Referer%22%3A+%22http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fstories%2F1331655643987%2Freplay-uci-dh-world-cup-2014-from-fort-william%22%7D', # The md5 checksum changes on each download 'info_dict': { - 'id': '2996102916001', + 'id': '3750436379001', 'ext': 'flv', 'title': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals', - 'uploader': 'Red Bull TV', + 'uploader': 'RBTV Old (do not use)', 'description': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals', + 'timestamp': 1409122195, + 'upload_date': '20140827', + 'uploader_id': '710858724001', }, + 'skip': 'Video gone', }, { - # playlist test + # playlist with 'videoList' # from http://support.brightcove.com/en/video-cloud/docs/playlist-support-single-video-players 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=3550052898001&playerKey=AQ%7E%7E%2CAAABmA9XpXk%7E%2C-Kp7jNgisre1fG5OdqpAFUTcs0lP_ZoL', 'info_dict': { @@ -105,7 +125,28 @@ }, 'playlist_mincount': 7, }, + { + # playlist with 'playlistTab' (https://github.com/ytdl-org/youtube-dl/issues/9965) + 'url': 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=AQ%7E%7E,AAABXlLMdok%7E,NJ4EoMlZ4rZdx9eU1rkMVd8EaYPBBUlg', + 'info_dict': { + 'id': '1522758701001', + 'title': 'Lesson 08', + }, + 'playlist_mincount': 10, + }, + { + # playerID inferred from bcpid + # from http://www.un.org/chinese/News/story.asp?NewsID=27724 + 'url': 'https://link.brightcove.com/services/player/bcpid1722935254001/?bctid=5360463607001&autoStart=false&secureConnections=true&width=650&height=350', + 'only_matching': True, # Tested in GenericIE + } ] + FLV_VCODECS = { + 1: 'SORENSON', + 2: 'ON2', + 3: 'H264', + 4: 'VP8', + } @classmethod def _build_brighcove_url(cls, object_str): @@ -114,10 +155,10 @@ <object class="BrightcoveExperience">{params}</object> """ - # Fix up some stupid HTML, see https://github.com/rg3/youtube-dl/issues/1553 + # Fix up some stupid HTML, see https://github.com/ytdl-org/youtube-dl/issues/1553 object_str = re.sub(r'(<param(?:\s+[a-zA-Z0-9_]+="[^"]*")*)>', lambda m: m.group(1) + 
'/>', object_str) - # Fix up some stupid XML, see https://github.com/rg3/youtube-dl/issues/1608 + # Fix up some stupid XML, see https://github.com/ytdl-org/youtube-dl/issues/1608 object_str = object_str.replace('<--', '<!--') # remove namespace to simplify extraction object_str = re.sub(r'(<object[^>]*)(xmlns=".*?")', r'\1', object_str) @@ -136,17 +177,20 @@ else: flashvars = {} + data_url = object_doc.attrib.get('data', '') + data_url_params = compat_parse_qs(compat_urllib_parse_urlparse(data_url).query) + def find_param(name): if name in flashvars: return flashvars[name] node = find_xpath_attr(object_doc, './param', 'name', name) if node is not None: return node.attrib['value'] - return None + return data_url_params.get(name) params = {} - playerID = find_param('playerID') + playerID = find_param('playerID') or find_param('playerId') if playerID is None: raise ExtractorError('Cannot find player ID') params['playerID'] = playerID @@ -155,9 +199,19 @@ # Not all pages define this value if playerKey is not None: params['playerKey'] = playerKey - # The three fields hold the id of the video - videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID') + # These fields hold the id of the video + videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID') or find_param('@videoList') if videoPlayer is not None: + if isinstance(videoPlayer, list): + videoPlayer = videoPlayer[0] + videoPlayer = videoPlayer.strip() + # UUID is also possible for videoPlayer (e.g. + # http://www.popcornflix.com/hoodies-vs-hooligans/7f2d2b87-bbf2-4623-acfb-ea942b4f01dd + # or http://www8.hp.com/cn/zh/home.html) + if not (re.match( + r'^(?:\d+|[\da-fA-F]{8}-?[\da-fA-F]{4}-?[\da-fA-F]{4}-?[\da-fA-F]{4}-?[\da-fA-F]{12})$', + videoPlayer) or videoPlayer.startswith('ref:')): + return None params['@videoPlayer'] = videoPlayer linkBase = find_param('linkBaseURL') if linkBase is not None: @@ -171,7 +225,7 @@ # // build Brightcove <object /> XML # } m = re.search( - r'''(?x)customBC.\createVideo\( + r'''(?x)customBC\.createVideo\( .*? 
# skipping width and height ["\'](?P<playerID>\d+)["\']\s*,\s* # playerID ["\'](?P<playerKey>AQ[^"\']{48})[^"\']*["\']\s*,\s* # playerKey begins with AQ and is 50 characters @@ -184,8 +238,7 @@ @classmethod def _make_brightcove_url(cls, params): - data = compat_urllib_parse.urlencode(params) - return cls._FEDERATED_URL_TEMPLATE % data + return update_url_query(cls._FEDERATED_URL, params) @classmethod def _extract_brightcove_url(cls, webpage): @@ -200,13 +253,16 @@ """Return a list of all Brightcove URLs from the webpage """ url_m = re.search( - r'<meta\s+property=[\'"]og:video[\'"]\s+content=[\'"](https?://(?:secure|c)\.brightcove.com/[^\'"]+)[\'"]', - webpage) + r'''(?x) + <meta\s+ + (?:property|itemprop)=([\'"])(?:og:video|embedURL)\1[^>]+ + content=([\'"])(?P<url>https?://(?:secure|c)\.brightcove.com/(?:(?!\2).)+)\2 + ''', webpage) if url_m: - url = unescapeHTML(url_m.group(1)) + url = unescapeHTML(url_m.group('url')) # Some sites don't add it, we can't download with this url, for example: # http://www.ktvu.com/videos/news/raw-video-caltrain-releases-video-of-man-almost/vCTZdY/ - if 'playerKey' in url or 'videoId' in url: + if 'playerKey' in url or 'videoId' in url or 'idVideo' in url: return [url] matches = re.findall( @@ -219,15 +275,19 @@ if matches: return list(filter(None, [cls._build_brighcove_url(m) for m in matches])) - return list(filter(None, [ - cls._build_brighcove_url_from_js(custom_bc) - for custom_bc in re.findall(r'(customBC\.createVideo\(.+?\);)', webpage)])) + matches = re.findall(r'(customBC\.createVideo\(.+?\);)', webpage) + if matches: + return list(filter(None, [ + cls._build_brighcove_url_from_js(custom_bc) + for custom_bc in matches])) + return [src for _, src in re.findall( + r'<iframe[^>]+src=([\'"])((?:https?:)?//link\.brightcove\.com/services/player/(?!\1).+)\1', webpage)] def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) # Change the 'videoId' and others field to '@videoPlayer' - url = re.sub(r'(?<=[?&])(videoI(d|D)|bctid)', '%40videoPlayer', url) + url = re.sub(r'(?<=[?&])(videoI(d|D)|idVideo|bctid)', '%40videoPlayer', url) # Change bckey (used by bcove.me urls) to playerKey url = re.sub(r'(?<=[?&])bckey', 'playerKey', url) mobj = re.match(self._VALID_URL, url) @@ -238,8 +298,12 @@ if videoPlayer: # We set the original url as the default 'Referer' header referer = smuggled_data.get('Referer', url) + if 'playerID' not in query: + mobj = re.search(r'/bcpid(\d+)', url) + if mobj is not None: + query['playerID'] = [mobj.group(1)] return self._get_video_info( - videoPlayer[0], query_str, query, referer=referer) + videoPlayer[0], query, referer=referer) elif 'playerKey' in query: player_key = query['playerKey'] return self._get_playlist_info(player_key[0]) @@ -248,20 +312,45 @@ 'Cannot find playerKey= variable. 
Did you forget quotes in a shell invocation?', expected=True) - def _get_video_info(self, video_id, query_str, query, referer=None): - request_url = self._FEDERATED_URL_TEMPLATE % query_str - req = sanitized_Request(request_url) + def _brightcove_new_url_result(self, publisher_id, video_id): + brightcove_new_url = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' % (publisher_id, video_id) + return self.url_result(brightcove_new_url, BrightcoveNewIE.ie_key(), video_id) + + def _get_video_info(self, video_id, query, referer=None): + headers = {} linkBase = query.get('linkBaseURL') if linkBase is not None: referer = linkBase[0] if referer is not None: - req.add_header('Referer', referer) - webpage = self._download_webpage(req, video_id) + headers['Referer'] = referer + webpage = self._download_webpage(self._FEDERATED_URL, video_id, headers=headers, query=query) error_msg = self._html_search_regex( r"<h1>We're sorry.</h1>([\s\n]*<p>.*?</p>)+", webpage, 'error message', default=None) if error_msg is not None: + publisher_id = query.get('publisherId') + if publisher_id and publisher_id[0].isdigit(): + publisher_id = publisher_id[0] + if not publisher_id: + player_key = query.get('playerKey') + if player_key and ',' in player_key[0]: + player_key = player_key[0] + else: + player_id = query.get('playerID') + if player_id and player_id[0].isdigit(): + player_page = self._download_webpage( + 'http://link.brightcove.com/services/player/bcpid' + player_id[0], + video_id, headers=headers, fatal=False) + if player_page: + player_key = self._search_regex( + r'<param\s+name="playerKey"\s+value="([\w~,-]+)"', + player_page, 'player key', fatal=False) + if player_key: + enc_pub_id = player_key.split(',')[1].replace('~', '=') + publisher_id = struct.unpack('>Q', base64.urlsafe_b64decode(enc_pub_id))[0] + if publisher_id: + return self._brightcove_new_url_result(publisher_id, video_id) raise ExtractorError( 'brightcove said: %s' % error_msg, expected=True) @@ -279,24 +368,35 @@ info_url, player_key, 'Downloading playlist information') json_data = json.loads(playlist_info) - if 'videoList' not in json_data: + if 'videoList' in json_data: + playlist_info = json_data['videoList'] + playlist_dto = playlist_info['mediaCollectionDTO'] + elif 'playlistTabs' in json_data: + playlist_info = json_data['playlistTabs'] + playlist_dto = playlist_info['lineupListDTO']['playlistDTOs'][0] + else: raise ExtractorError('Empty playlist') - playlist_info = json_data['videoList'] - videos = [self._extract_video_info(video_info) for video_info in playlist_info['mediaCollectionDTO']['videoDTOs']] + + videos = [self._extract_video_info(video_info) for video_info in playlist_dto['videoDTOs']] return self.playlist_result(videos, playlist_id='%s' % playlist_info['id'], - playlist_title=playlist_info['mediaCollectionDTO']['displayName']) + playlist_title=playlist_dto['displayName']) def _extract_video_info(self, video_info): + video_id = compat_str(video_info['id']) + publisher_id = video_info.get('publisherId') info = { - 'id': compat_str(video_info['id']), + 'id': video_id, 'title': video_info['displayName'].strip(), 'description': video_info.get('shortDescription'), 'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'), 'uploader': video_info.get('publisherName'), + 'uploader_id': compat_str(publisher_id) if publisher_id else None, + 'duration': float_or_none(video_info.get('length'), 1000), + 'timestamp': int_or_none(video_info.get('creationDate'), 1000), } - renditions = 
video_info.get('renditions') + renditions = video_info.get('renditions', []) + video_info.get('IOSRenditions', []) if renditions: formats = [] for rend in renditions: @@ -308,7 +408,8 @@ url_comp = compat_urllib_parse_urlparse(url) if url_comp.path.endswith('.m3u8'): formats.extend( - self._extract_m3u8_formats(url, info['id'], 'mp4')) + self._extract_m3u8_formats( + url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) continue elif 'akamaihd.net' in url_comp.netloc: # This type of renditions are served through @@ -317,19 +418,42 @@ ext = 'flv' if ext is None: ext = determine_ext(url) - size = rend.get('size') - formats.append({ + tbr = int_or_none(rend.get('encodingRate'), 1000) + a_format = { + 'format_id': 'http%s' % ('-%s' % tbr if tbr else ''), 'url': url, 'ext': ext, - 'height': rend.get('frameHeight'), - 'width': rend.get('frameWidth'), - 'filesize': size if size != 0 else None, - }) + 'filesize': int_or_none(rend.get('size')) or None, + 'tbr': tbr, + } + if rend.get('audioOnly'): + a_format.update({ + 'vcodec': 'none', + }) + else: + a_format.update({ + 'height': int_or_none(rend.get('frameHeight')), + 'width': int_or_none(rend.get('frameWidth')), + 'vcodec': rend.get('videoCodec'), + }) + + # m3u8 manifests with remote == false are media playlists + # Not calling _extract_m3u8_formats here to save network traffic + if ext == 'm3u8': + a_format.update({ + 'format_id': 'hls%s' % ('-%s' % tbr if tbr else ''), + 'ext': 'mp4', + 'protocol': 'm3u8_native', + }) + + formats.append(a_format) self._sort_formats(formats) info['formats'] = formats elif video_info.get('FLVFullLengthURL') is not None: info.update({ 'url': video_info['FLVFullLengthURL'], + 'vcodec': self.FLV_VCODECS.get(video_info.get('FLVFullCodec')), + 'filesize': int_or_none(video_info.get('FLVFullSize')), }) if self._downloader.params.get('include_ads', False): @@ -348,14 +472,18 @@ else: return ad_info - if 'url' not in info and not info.get('formats'): - raise ExtractorError('Unable to extract video url for %s' % info['id']) + if not info.get('url') and not info.get('formats'): + uploader_id = info.get('uploader_id') + if uploader_id: + info.update(self._brightcove_new_url_result(uploader_id, video_id)) + else: + raise ExtractorError('Unable to extract video url for %s' % video_id) return info -class BrightcoveNewIE(InfoExtractor): +class BrightcoveNewIE(AdobePassIE): IE_NAME = 'brightcove:new' - _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*videoId=(?P<video_id>(?:ref:)?\d+)' + _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*videoId=(?P<video_id>\d+|ref:[^&]+)' _TESTS = [{ 'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001', 'md5': 'c8100925723840d4b0d243f7025703be', @@ -368,7 +496,7 @@ 'timestamp': 1441391203, 'upload_date': '20150904', 'uploader_id': '929656772001', - 'formats': 'mincount:22', + 'formats': 'mincount:20', }, }, { # with rtmp streams @@ -382,96 +510,112 @@ 'timestamp': 1433556729, 'upload_date': '20150606', 'uploader_id': '4036320279001', - 'formats': 'mincount:41', + 'formats': 'mincount:39', }, 'params': { + # m3u8 download 'skip_download': True, } }, { # ref: prefixed video id 'url': 'http://players.brightcove.net/3910869709001/21519b5c-4b3b-4363-accb-bdc8f358f823_default/index.html?videoId=ref:7069442', 'only_matching': True, + }, { + # non numeric ref: prefixed 
video id + 'url': 'http://players.brightcove.net/710858724001/default_default/index.html?videoId=ref:event-stream-356', + 'only_matching': True, + }, { + # unavailable video without message but with error_code + 'url': 'http://players.brightcove.net/1305187701/c832abfb-641b-44eb-9da0-2fe76786505f_default/index.html?videoId=4377407326001', + 'only_matching': True, }] @staticmethod - def _extract_url(webpage): - urls = BrightcoveNewIE._extract_urls(webpage) + def _extract_url(ie, webpage): + urls = BrightcoveNewIE._extract_urls(ie, webpage) return urls[0] if urls else None @staticmethod - def _extract_urls(webpage): + def _extract_urls(ie, webpage): # Reference: # 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe - # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript - # 3. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/embed-in-page.html - # 4. https://support.brightcove.com/en/video-cloud/docs/dynamically-assigning-videos-player + # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#tag + # 3. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript + # 4. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/in-page-embed-player-implementation.html + # 5. https://support.brightcove.com/en/video-cloud/docs/dynamically-assigning-videos-player entries = [] # Look for iframe embeds [1] for _, url in re.findall( - r'<iframe[^>]+src=(["\'])((?:https?:)//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage): - entries.append(url) + r'<iframe[^>]+src=(["\'])((?:https?:)?//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage): + entries.append(url if url.startswith('http') else 'http:' + url) - # Look for embed_in_page embeds [2] - for video_id, account_id, player_id, embed in re.findall( - # According to examples from [3] it's unclear whether video id - # may be optional and what to do when it is - # According to [4] data-video-id may be prefixed with ref: - r'''(?sx) - <video[^>]+ - data-video-id=["\']((?:ref:)?\d+)["\'][^>]*>.*? - </video>.*? - <script[^>]+ - src=["\'](?:https?:)?//players\.brightcove\.net/ - (\d+)/([\da-f-]+)_([^/]+)/index\.min\.js + # Look for <video> tags [2] and embed_in_page embeds [3] + # [2] looks like: + for video, script_tag, account_id, player_id, embed in re.findall( + r'''(?isx) + (<video\s+[^>]*\bdata-video-id\s*=\s*['"]?[^>]+>) + (?:.*? + (<script[^>]+ + src=["\'](?:https?:)?//players\.brightcove\.net/ + (\d+)/([^/]+)_([^/]+)/index(?:\.min)?\.js + ) + )? ''', webpage): - entries.append( - 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' - % (account_id, player_id, embed, video_id)) + attrs = extract_attributes(video) - return entries + # According to examples from [4] it's unclear whether video id + # may be optional and what to do when it is + video_id = attrs.get('data-video-id') + if not video_id: + continue + + account_id = account_id or attrs.get('data-account') + if not account_id: + continue + + player_id = player_id or attrs.get('data-player') or 'default' + embed = embed or attrs.get('data-embed') or 'default' + + bc_url = 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' % ( + account_id, player_id, embed, video_id) + + # Some brightcove videos may be embedded with video tag only and + # without script tag or any mentioning of brightcove at all. 
Such + # embeds are considered ambiguous since they are matched based only + # on data-video-id and data-account attributes and in the wild may + # not be brightcove embeds at all. Let's check reconstructed + # brightcove URLs in case of such embeds and only process valid + # ones. By this we ensure there is indeed a brightcove embed. + if not script_tag and not ie._is_valid_url( + bc_url, video_id, 'possible brightcove video'): + continue - def _real_extract(self, url): - account_id, player_id, embed, video_id = re.match(self._VALID_URL, url).groups() - - webpage = self._download_webpage( - 'http://players.brightcove.net/%s/%s_%s/index.min.js' - % (account_id, player_id, embed), video_id) + entries.append(bc_url) - policy_key = None - - catalog = self._search_regex( - r'catalog\(({.+?})\);', webpage, 'catalog', default=None) - if catalog: - catalog = self._parse_json( - js_to_json(catalog), video_id, fatal=False) - if catalog: - policy_key = catalog.get('policyKey') - - if not policy_key: - policy_key = self._search_regex( - r'policyKey\s*:\s*(["\'])(?P<pk>.+?)\1', - webpage, 'policy key', group='pk') - - req = sanitized_Request( - 'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' - % (account_id, video_id), - headers={'Accept': 'application/json;pk=%s' % policy_key}) - json_data = self._download_json(req, video_id) + return entries - title = json_data['name'] + def _parse_brightcove_metadata(self, json_data, video_id, headers={}): + title = json_data['name'].strip() formats = [] for source in json_data.get('sources', []): - source_type = source.get('type') + container = source.get('container') + ext = mimetype2ext(source.get('type')) src = source.get('src') - if source_type == 'application/x-mpegURL': + # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object + if ext == 'ism' or container == 'WVM' or source.get('key_systems'): + continue + elif ext == 'm3u8' or container == 'M2TS': if not src: continue formats.extend(self._extract_m3u8_formats( - src, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) + src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + elif ext == 'mpd': + if not src: + continue + formats.extend(self._extract_mpd_formats(src, video_id, 'dash', fatal=False)) else: streaming_src = source.get('streaming_src') stream_name, app_name = source.get('stream_name'), source.get('app_name') @@ -479,15 +623,23 @@ continue tbr = float_or_none(source.get('avg_bitrate'), 1000) height = int_or_none(source.get('height')) + width = int_or_none(source.get('width')) f = { 'tbr': tbr, - 'width': int_or_none(source.get('width')), - 'height': height, 'filesize': int_or_none(source.get('size')), - 'container': source.get('container'), - 'vcodec': source.get('codec'), - 'ext': source.get('container').lower(), + 'container': container, + 'ext': ext or container.lower(), } + if width == 0 and height == 0: + f.update({ + 'vcodec': 'none', + }) + else: + f.update({ + 'width': width, + 'height': height, + 'vcodec': source.get('codec'), + }) def build_format_id(kind): format_id = kind @@ -501,7 +653,7 @@ f.update({ 'url': src or streaming_src, 'format_id': build_format_id('http' if src else 'http-streaming'), - 'preference': 2 if src else 1, + 'source_preference': 0 if src else -1, }) else: f.update({ @@ -510,22 +662,114 @@ 'format_id': build_format_id('rtmp'), }) formats.append(f) + if not formats: + # for sonyliv.com DRM protected videos + s3_source_url = json_data.get('custom_fields', 
{}).get('s3sourceurl') + if s3_source_url: + formats.append({ + 'url': s3_source_url, + 'format_id': 'source', + }) + + errors = json_data.get('errors') + if not formats and errors: + error = errors[0] + raise ExtractorError( + error.get('message') or error.get('error_subcode') or error['error_code'], expected=True) + self._sort_formats(formats) - description = json_data.get('description') - thumbnail = json_data.get('thumbnail') - timestamp = parse_iso8601(json_data.get('published_at')) + for f in formats: + f.setdefault('http_headers', {}).update(headers) + + subtitles = {} + for text_track in json_data.get('text_tracks', []): + if text_track.get('src'): + subtitles.setdefault(text_track.get('srclang'), []).append({ + 'url': text_track['src'], + }) + + is_live = False duration = float_or_none(json_data.get('duration'), 1000) - tags = json_data.get('tags', []) + if duration is not None and duration <= 0: + is_live = True return { 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, + 'title': self._live_title(title) if is_live else title, + 'description': clean_html(json_data.get('description')), + 'thumbnail': json_data.get('thumbnail') or json_data.get('poster'), 'duration': duration, - 'timestamp': timestamp, - 'uploader_id': account_id, + 'timestamp': parse_iso8601(json_data.get('published_at')), + 'uploader_id': json_data.get('account_id'), 'formats': formats, - 'tags': tags, + 'subtitles': subtitles, + 'tags': json_data.get('tags', []), + 'is_live': is_live, } + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + self._initialize_geo_bypass({ + 'countries': smuggled_data.get('geo_countries'), + 'ip_blocks': smuggled_data.get('geo_ip_blocks'), + }) + + account_id, player_id, embed, video_id = re.match(self._VALID_URL, url).groups() + + webpage = self._download_webpage( + 'http://players.brightcove.net/%s/%s_%s/index.min.js' + % (account_id, player_id, embed), video_id) + + policy_key = None + + catalog = self._search_regex( + r'catalog\(({.+?})\);', webpage, 'catalog', default=None) + if catalog: + catalog = self._parse_json( + js_to_json(catalog), video_id, fatal=False) + if catalog: + policy_key = catalog.get('policyKey') + + if not policy_key: + policy_key = self._search_regex( + r'policyKey\s*:\s*(["\'])(?P<pk>.+?)\1', + webpage, 'policy key', group='pk') + + api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' % (account_id, video_id) + headers = { + 'Accept': 'application/json;pk=%s' % policy_key, + } + referrer = smuggled_data.get('referrer') + if referrer: + headers.update({ + 'Referer': referrer, + 'Origin': re.search(r'https?://[^/]+', referrer).group(0), + }) + try: + json_data = self._download_json(api_url, video_id, headers=headers) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + json_data = self._parse_json(e.cause.read().decode(), video_id)[0] + message = json_data.get('message') or json_data['error_code'] + if json_data.get('error_subcode') == 'CLIENT_GEO': + self.raise_geo_restricted(msg=message) + raise ExtractorError(message, expected=True) + raise + + errors = json_data.get('errors') + if errors and errors[0].get('error_subcode') == 'TVE_AUTH': + custom_fields = json_data['custom_fields'] + tve_token = self._extract_mvpd_auth( + smuggled_data['source_url'], video_id, + custom_fields['bcadobepassrequestorid'], + custom_fields['bcadobepassresourceid']) + json_data = self._download_json( + api_url, video_id, headers={ + 
'Accept': 'application/json;pk=%s' % policy_key + }, query={ + 'tveToken': tve_token, + }) + + return self._parse_brightcove_metadata( + json_data, video_id, headers=headers) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/br.py youtube-dl-2019.05.20/youtube_dl/extractor/br.py --- youtube-dl-2016.02.22/youtube_dl/extractor/br.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/br.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,20 +1,23 @@ # coding: utf-8 from __future__ import unicode_literals +import json import re from .common import InfoExtractor from ..utils import ( + determine_ext, ExtractorError, int_or_none, parse_duration, + parse_iso8601, xpath_element, xpath_text, ) class BRIE(InfoExtractor): - IE_DESC = 'Bayerischer Rundfunk Mediathek' + IE_DESC = 'Bayerischer Rundfunk' _VALID_URL = r'(?P<base_url>https?://(?:www\.)?br(?:-klassik)?\.de)/(?:[a-z0-9\-_]+/)+(?P<id>[a-z0-9\-_]+)\.html' _TESTS = [ @@ -29,7 +32,8 @@ 'duration': 180, 'uploader': 'Reinhard Weber', 'upload_date': '20150422', - } + }, + 'skip': '404 not found', }, { 'url': 'http://www.br.de/nachrichten/oberbayern/inhalt/muenchner-polizeipraesident-schreiber-gestorben-100.html', @@ -40,7 +44,8 @@ 'title': 'Manfred Schreiber ist tot', 'description': 'md5:b454d867f2a9fc524ebe88c3f5092d97', 'duration': 26, - } + }, + 'skip': '404 not found', }, { 'url': 'https://www.br-klassik.de/audio/peeping-tom-premierenkritik-dance-festival-muenchen-100.html', @@ -51,7 +56,8 @@ 'title': 'Kurzweilig und sehr bewegend', 'description': 'md5:0351996e3283d64adeb38ede91fac54e', 'duration': 296, - } + }, + 'skip': '404 not found', }, { 'url': 'http://www.br.de/radio/bayern1/service/team/videos/team-video-erdelt100.html', @@ -74,7 +80,7 @@ 'description': 'md5:bb659990e9e59905c3d41e369db1fbe3', 'duration': 893, 'uploader': 'Eva Maria Steimle', - 'upload_date': '20140117', + 'upload_date': '20170208', } }, ] @@ -120,10 +126,10 @@ for asset in assets.findall('asset'): format_url = xpath_text(asset, ['downloadUrl', 'url']) asset_type = asset.get('type') - if asset_type == 'HDS': + if asset_type.startswith('HDS'): formats.extend(self._extract_f4m_formats( format_url + '?hdcore=3.2.0', media_id, f4m_id='hds', fatal=False)) - elif asset_type == 'HLS': + elif asset_type.startswith('HLS'): formats.extend(self._extract_m3u8_formats( format_url, media_id, 'mp4', 'm3u8_native', m3u8_id='hds', fatal=False)) else: @@ -166,3 +172,140 @@ } for variant in variants.findall('variant') if xpath_text(variant, 'url')] thumbnails.sort(key=lambda x: x['width'] * x['height'], reverse=True) return thumbnails + + +class BRMediathekIE(InfoExtractor): + IE_DESC = 'Bayerischer Rundfunk Mediathek' + _VALID_URL = r'https?://(?:www\.)?br\.de/mediathek/video/[^/?&#]*?-(?P<id>av:[0-9a-f]{24})' + + _TESTS = [{ + 'url': 'https://www.br.de/mediathek/video/gesundheit-die-sendung-vom-28112017-av:5a1e6a6e8fce6d001871cc8e', + 'md5': 'fdc3d485835966d1622587d08ba632ec', + 'info_dict': { + 'id': 'av:5a1e6a6e8fce6d001871cc8e', + 'ext': 'mp4', + 'title': 'Die Sendung vom 28.11.2017', + 'description': 'md5:6000cdca5912ab2277e5b7339f201ccc', + 'timestamp': 1511942766, + 'upload_date': '20171129', + } + }] + + def _real_extract(self, url): + clip_id = self._match_id(url) + + clip = self._download_json( + 'https://proxy-base.master.mango.express/graphql', + clip_id, data=json.dumps({ + "query": """{ + viewer { + clip(id: "%s") { + title + description + duration + createdAt + ageRestriction + videoFiles { + edges { + node { + publicLocation + fileSize + 
videoProfile { + width + height + bitrate + encoding + } + } + } + } + captionFiles { + edges { + node { + publicLocation + } + } + } + teaserImages { + edges { + node { + imageFiles { + edges { + node { + publicLocation + width + height + } + } + } + } + } + } + } + } +}""" % clip_id}).encode(), headers={ + 'Content-Type': 'application/json', + })['data']['viewer']['clip'] + title = clip['title'] + + formats = [] + for edge in clip.get('videoFiles', {}).get('edges', []): + node = edge.get('node', {}) + n_url = node.get('publicLocation') + if not n_url: + continue + ext = determine_ext(n_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + n_url, clip_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + else: + video_profile = node.get('videoProfile', {}) + tbr = int_or_none(video_profile.get('bitrate')) + format_id = 'http' + if tbr: + format_id += '-%d' % tbr + formats.append({ + 'format_id': format_id, + 'url': n_url, + 'width': int_or_none(video_profile.get('width')), + 'height': int_or_none(video_profile.get('height')), + 'tbr': tbr, + 'filesize': int_or_none(node.get('fileSize')), + }) + self._sort_formats(formats) + + subtitles = {} + for edge in clip.get('captionFiles', {}).get('edges', []): + node = edge.get('node', {}) + n_url = node.get('publicLocation') + if not n_url: + continue + subtitles.setdefault('de', []).append({ + 'url': n_url, + }) + + thumbnails = [] + for edge in clip.get('teaserImages', {}).get('edges', []): + for image_edge in edge.get('node', {}).get('imageFiles', {}).get('edges', []): + node = image_edge.get('node', {}) + n_url = node.get('publicLocation') + if not n_url: + continue + thumbnails.append({ + 'url': n_url, + 'width': int_or_none(node.get('width')), + 'height': int_or_none(node.get('height')), + }) + + return { + 'id': clip_id, + 'title': title, + 'description': clip.get('description'), + 'duration': int_or_none(clip.get('duration')), + 'timestamp': parse_iso8601(clip.get('createdAt')), + 'age_limit': int_or_none(clip.get('ageRestriction')), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': thumbnails, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/businessinsider.py youtube-dl-2019.05.20/youtube_dl/extractor/businessinsider.py --- youtube-dl-2016.02.22/youtube_dl/extractor/businessinsider.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/businessinsider.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,42 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .jwplatform import JWPlatformIE + + +class BusinessInsiderIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^/]+\.)?businessinsider\.(?:com|nl)/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'http://uk.businessinsider.com/how-much-radiation-youre-exposed-to-in-everyday-life-2016-6', + 'md5': 'ca237a53a8eb20b6dc5bd60564d4ab3e', + 'info_dict': { + 'id': 'hZRllCfw', + 'ext': 'mp4', + 'title': "Here's how much radiation you're exposed to in everyday life", + 'description': 'md5:9a0d6e2c279948aadaa5e84d6d9b99bd', + 'upload_date': '20170709', + 'timestamp': 1499606400, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.businessinsider.nl/5-scientifically-proven-things-make-you-less-attractive-2017-7/', + 'only_matching': True, + }, { + 'url': 'http://www.businessinsider.com/excel-index-match-vlookup-video-how-to-2015-2?IR=T', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = 
self._download_webpage(url, video_id) + jwplatform_id = self._search_regex( + (r'data-media-id=["\']([a-zA-Z0-9]{8})', + r'id=["\']jwplayer_([a-zA-Z0-9]{8})', + r'id["\']?\s*:\s*["\']?([a-zA-Z0-9]{8})'), + webpage, 'jwplatform id') + return self.url_result( + 'jwplatform:%s' % jwplatform_id, ie=JWPlatformIE.ie_key(), + video_id=video_id) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/buzzfeed.py youtube-dl-2019.05.20/youtube_dl/extractor/buzzfeed.py --- youtube-dl-2016.02.22/youtube_dl/extractor/buzzfeed.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/buzzfeed.py 2019-06-07 18:49:07.000000000 +0000 @@ -5,6 +5,7 @@ import re from .common import InfoExtractor +from .facebook import FacebookIE class BuzzFeedIE(InfoExtractor): @@ -20,11 +21,11 @@ 'info_dict': { 'id': 'aVCR29aE_OQ', 'ext': 'mp4', + 'title': 'Angry Ram destroys a punching bag..', + 'description': 'md5:c59533190ef23fd4458a5e8c8c872345', 'upload_date': '20141024', 'uploader_id': 'Buddhanz1', - 'description': 'He likes to stay in shape with his heavy bag, he wont stop until its on the ground\n\nFollow Angry Ram on Facebook for regular updates -\nhttps://www.facebook.com/pages/Angry-Ram/1436897249899558?ref=hl', - 'uploader': 'Buddhanz', - 'title': 'Angry Ram destroys a punching bag', + 'uploader': 'Angry Ram', } }] }, { @@ -41,13 +42,30 @@ 'info_dict': { 'id': 'mVmBL8B-In0', 'ext': 'mp4', + 'title': 're:Munchkin the Teddy Bear gets her exercise', + 'description': 'md5:28faab95cda6e361bcff06ec12fc21d8', 'upload_date': '20141124', 'uploader_id': 'CindysMunchkin', - 'description': 're:© 2014 Munchkin the', 'uploader': 're:^Munchkin the', - 'title': 're:Munchkin the Teddy Bear gets her exercise', }, }] + }, { + 'url': 'http://www.buzzfeed.com/craigsilverman/the-most-adorable-crash-landing-ever#.eq7pX0BAmK', + 'info_dict': { + 'id': 'the-most-adorable-crash-landing-ever', + 'title': 'Watch This Baby Goose Make The Most Adorable Crash Landing', + 'description': 'This gosling knows how to stick a landing.', + }, + 'playlist': [{ + 'md5': '763ca415512f91ca62e4621086900a23', + 'info_dict': { + 'id': '971793786185728', + 'ext': 'mp4', + 'title': 'We set up crash pads so that the goslings on our roof would have a safe landi...', + 'uploader': 'Calgary Outdoor Centre-University of Calgary', + }, + }], + 'add_ie': ['Facebook'], }] def _real_extract(self, url): @@ -66,6 +84,11 @@ continue entries.append(self.url_result(video['url'])) + facebook_urls = FacebookIE._extract_urls(webpage) + entries.extend([ + self.url_result(facebook_url) + for facebook_url in facebook_urls]) + return { '_type': 'playlist', 'id': playlist_id, diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/byutv.py youtube-dl-2019.05.20/youtube_dl/extractor/byutv.py --- youtube-dl-2016.02.22/youtube_dl/extractor/byutv.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/byutv.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,49 +1,92 @@ from __future__ import unicode_literals -import json import re from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import parse_duration class BYUtvIE(InfoExtractor): - _VALID_URL = r'^https?://(?:www\.)?byutv.org/watch/[0-9a-f-]+/(?P<video_id>[^/?#]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?byutv\.org/(?:watch|player)/(?!event/)(?P<id>[0-9a-f-]+)(?:/(?P<display_id>[^/?#&]+))?' 
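# The reworked _VALID_URL accepts both /watch/ and /player/ paths, captures
# an optional display id after the hex video id, and uses a negative
# lookahead (?!event/) so that /player/event/ pages, which this extractor
# does not handle, are rejected. A minimal standalone sketch of that
# matching behaviour, assuming only the stdlib re module; the first sample
# URL comes from the tests below, the /event/ URL is hypothetical:
import re as _re

_pattern = r'https?://(?:www\.)?byutv\.org/(?:watch|player)/(?!event/)(?P<id>[0-9a-f-]+)(?:/(?P<display_id>[^/?#&]+))?'

_m = _re.match(_pattern, 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d/studio-c-season-5-episode-5')
assert _m.group('id') == '6587b9a3-89d2-42a6-a7f7-fd2f81840a7d'
assert _m.group('display_id') == 'studio-c-season-5-episode-5'
# a hypothetical /player/event/ URL does not match at all:
assert _re.match(_pattern, 'https://www.byutv.org/player/event/some-live-event') is None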
+ _TESTS = [{ + # ooyalaVOD 'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d/studio-c-season-5-episode-5', 'info_dict': { - 'id': 'studio-c-season-5-episode-5', + 'id': 'ZvanRocTpW-G5_yZFeltTAMv6jxOU9KH', + 'display_id': 'studio-c-season-5-episode-5', 'ext': 'mp4', - 'description': 'md5:e07269172baff037f8e8bf9956bc9747', 'title': 'Season 5 Episode 5', - 'thumbnail': 're:^https?://.*\.jpg$', + 'description': 'md5:1d31dc18ef4f075b28f6a65937d22c65', + 'thumbnail': r're:^https?://.*', 'duration': 1486.486, }, 'params': { 'skip_download': True, - } - } + }, + 'add_ie': ['Ooyala'], + }, { + # dvr + 'url': 'https://www.byutv.org/player/8f1dab9b-b243-47c8-b525-3e2d021a3451/byu-softball-pacific-vs-byu-41219---game-2', + 'info_dict': { + 'id': '8f1dab9b-b243-47c8-b525-3e2d021a3451', + 'display_id': 'byu-softball-pacific-vs-byu-41219---game-2', + 'ext': 'mp4', + 'title': 'Pacific vs. BYU (4/12/19)', + 'description': 'md5:1ac7b57cb9a78015910a4834790ce1f3', + 'duration': 11645, + }, + 'params': { + 'skip_download': True + }, + }, { + 'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d', + 'only_matching': True, + }, { + 'url': 'https://www.byutv.org/player/27741493-dc83-40b0-8420-e7ae38a2ae98/byu-football-toledo-vs-byu-93016?listid=4fe0fee5-0d3c-4a29-b725-e4948627f472&listindex=0&q=toledo', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('video_id') + video_id = mobj.group('id') + display_id = mobj.group('display_id') or video_id - webpage = self._download_webpage(url, video_id) - episode_code = self._search_regex( - r'(?s)episode:(.*?\}),\s*\n', webpage, 'episode information') - episode_json = re.sub( - r'(\n\s+)([a-zA-Z]+):\s+\'(.*?)\'', r'\1"\2": "\3"', episode_code) - ep = json.loads(episode_json) + info = self._download_json( + 'https://api.byutv.org/api3/catalog/getvideosforcontent', + display_id, query={ + 'contentid': video_id, + 'channel': 'byutv', + 'x-byutv-context': 'web$US', + }, headers={ + 'x-byutv-context': 'web$US', + 'x-byutv-platformkey': 'xsaaw9c7y5', + }) - if ep['providerType'] == 'Ooyala': + ep = info.get('ooyalaVOD') + if ep: return { '_type': 'url_transparent', 'ie_key': 'Ooyala', 'url': 'ooyala:%s' % ep['providerId'], 'id': video_id, - 'title': ep['title'], + 'display_id': display_id, + 'title': ep.get('title'), 'description': ep.get('description'), 'thumbnail': ep.get('imageThumbnail'), } - else: - raise ExtractorError('Unsupported provider %s' % ep['provider']) + + ep = info['dvr'] + title = ep['title'] + formats = self._extract_m3u8_formats( + ep['videoUrl'], video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + self._sort_formats(formats) + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': ep.get('description'), + 'thumbnail': ep.get('imageThumbnail'), + 'duration': parse_duration(ep.get('length')), + 'formats': formats, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/c56.py youtube-dl-2019.05.20/youtube_dl/extractor/c56.py --- youtube-dl-2016.02.22/youtube_dl/extractor/c56.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/c56.py 2019-06-07 18:49:07.000000000 +0000 @@ -4,12 +4,13 @@ import re from .common import InfoExtractor +from ..utils import js_to_json class C56IE(InfoExtractor): _VALID_URL = r'https?://(?:(?:www|player)\.)?56\.com/(?:.+?/)?(?:v_|(?:play_album.+-))(?P<textid>.+?)\.(?:html|swf)' IE_NAME = '56.com' - _TEST = { + _TESTS = [{ 'url': 
'http://www.56.com/u39/v_OTM0NDA3MTY.html', 'md5': 'e59995ac63d0457783ea05f93f12a866', 'info_dict': { @@ -18,12 +19,29 @@ 'title': '网事知多少 第32期:车怒', 'duration': 283.813, }, - } + }, { + 'url': 'http://www.56.com/u47/v_MTM5NjQ5ODc2.html', + 'md5': '', + 'info_dict': { + 'id': '82247482', + 'title': '爱的诅咒之杜鹃花开', + }, + 'playlist_count': 7, + 'add_ie': ['Sohu'], + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE) text_id = mobj.group('textid') + webpage = self._download_webpage(url, text_id) + sohu_video_info_str = self._search_regex( + r'var\s+sohuVideoInfo\s*=\s*({[^}]+});', webpage, 'Sohu video info', default=None) + if sohu_video_info_str: + sohu_video_info = self._parse_json( + sohu_video_info_str, text_id, transform_source=js_to_json) + return self.url_result(sohu_video_info['url'], 'Sohu') + page = self._download_json( 'http://vxml.56.com/json/%s/' % text_id, text_id, 'Downloading video info') diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/camdemy.py youtube-dl-2019.05.20/youtube_dl/extractor/camdemy.py --- youtube-dl-2016.02.22/youtube_dl/extractor/camdemy.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/camdemy.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,22 +1,23 @@ # coding: utf-8 from __future__ import unicode_literals -import datetime import re from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import ( - parse_iso8601, + clean_html, + parse_duration, str_to_int, + unified_strdate, ) class CamdemyIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?camdemy\.com/media/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?camdemy\.com/media/(?P<id>\d+)' _TESTS = [{ # single file 'url': 'http://www.camdemy.com/media/5181/', @@ -25,86 +26,93 @@ 'id': '5181', 'ext': 'mp4', 'title': 'Ch1-1 Introduction, Signals (02-23-2012)', - 'thumbnail': 're:^https?://.*\.jpg$', - 'description': '', + 'thumbnail': r're:^https?://.*\.jpg$', 'creator': 'ss11spring', + 'duration': 1591, 'upload_date': '20130114', - 'timestamp': 1358154556, 'view_count': int, } }, { # With non-empty description + # webpage returns "No permission or not login" 'url': 'http://www.camdemy.com/media/13885', 'md5': '4576a3bb2581f86c61044822adbd1249', 'info_dict': { 'id': '13885', 'ext': 'mp4', 'title': 'EverCam + Camdemy QuickStart', - 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 'md5:050b62f71ed62928f8a35f1a41e186c9', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'md5:2a9f989c2b153a2342acee579c6e7db6', 'creator': 'evercam', - 'upload_date': '20140620', - 'timestamp': 1403271569, + 'duration': 318, } }, { - # External source + # External source (YouTube) 'url': 'http://www.camdemy.com/media/14842', - 'md5': '50e1c3c3aa233d3d7b7daa2fa10b1cf7', 'info_dict': { 'id': '2vsYQzNIsJo', 'ext': 'mp4', + 'title': 'Excel 2013 Tutorial - How to add Password Protection', + 'description': 'Excel 2013 Tutorial for Beginners - How to add Password Protection', 'upload_date': '20130211', 'uploader': 'Hun Kim', - 'description': 'Excel 2013 Tutorial for Beginners - How to add Password Protection', 'uploader_id': 'hunkimtutorials', - 'title': 'Excel 2013 Tutorial - How to add Password Protection', - } + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): video_id = self._match_id(url) - page = self._download_webpage(url, video_id) + + webpage = self._download_webpage(url, video_id) src_from = self._html_search_regex( - r"<div 
class='srcFrom'>Source: <a title='([^']+)'", page, - 'external source', default=None) + r"class=['\"]srcFrom['\"][^>]*>Sources?(?:\s+from)?\s*:\s*<a[^>]+(?:href|title)=(['\"])(?P<url>(?:(?!\1).)+)\1", + webpage, 'external source', default=None, group='url') if src_from: return self.url_result(src_from) oembed_obj = self._download_json( 'http://www.camdemy.com/oembed/?format=json&url=' + url, video_id) + title = oembed_obj['title'] thumb_url = oembed_obj['thumbnail_url'] video_folder = compat_urlparse.urljoin(thumb_url, 'video/') file_list_doc = self._download_xml( compat_urlparse.urljoin(video_folder, 'fileList.xml'), - video_id, 'Filelist XML') + video_id, 'Downloading filelist XML') file_name = file_list_doc.find('./video/item/fileName').text video_url = compat_urlparse.urljoin(video_folder, file_name) - timestamp = parse_iso8601(self._html_search_regex( - r"<div class='title'>Posted\s*:</div>\s*<div class='value'>([^<>]+)<", - page, 'creation time', fatal=False), - delimiter=' ', timezone=datetime.timedelta(hours=8)) - view_count = str_to_int(self._html_search_regex( - r"<div class='title'>Views\s*:</div>\s*<div class='value'>([^<>]+)<", - page, 'view count', fatal=False)) + # Some URLs return "No permission or not login" in a webpage despite being + # freely available via oembed JSON URL (e.g. http://www.camdemy.com/media/13885) + upload_date = unified_strdate(self._search_regex( + r'>published on ([^<]+)<', webpage, + 'upload date', default=None)) + view_count = str_to_int(self._search_regex( + r'role=["\']viewCnt["\'][^>]*>([\d,.]+) views', + webpage, 'view count', default=None)) + description = self._html_search_meta( + 'description', webpage, default=None) or clean_html( + oembed_obj.get('description')) return { 'id': video_id, 'url': video_url, - 'title': oembed_obj['title'], + 'title': title, 'thumbnail': thumb_url, - 'description': self._html_search_meta('description', page), - 'creator': oembed_obj['author_name'], - 'duration': oembed_obj['duration'], - 'timestamp': timestamp, + 'description': description, + 'creator': oembed_obj.get('author_name'), + 'duration': parse_duration(oembed_obj.get('duration')), + 'upload_date': upload_date, 'view_count': view_count, } class CamdemyFolderIE(InfoExtractor): - _VALID_URL = r'http://www.camdemy.com/folder/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?camdemy\.com/folder/(?P<id>\d+)' _TESTS = [{ # links with trailing slash 'url': 'http://www.camdemy.com/folder/450', @@ -139,7 +147,7 @@ parsed_url = list(compat_urlparse.urlparse(url)) query = dict(compat_urlparse.parse_qsl(parsed_url[4])) query.update({'displayMode': 'list'}) - parsed_url[4] = compat_urllib_parse.urlencode(query) + parsed_url[4] = compat_urllib_parse_urlencode(query) final_url = compat_urlparse.urlunparse(parsed_url) page = self._download_webpage(final_url, folder_id) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/cammodels.py youtube-dl-2019.05.20/youtube_dl/extractor/cammodels.py --- youtube-dl-2016.02.22/youtube_dl/extractor/cammodels.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/cammodels.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,98 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + url_or_none, +) + + +class CamModelsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?cammodels\.com/cam/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.cammodels.com/cam/AutumnKnight/', + 'only_matching': True, + 
'age_limit': 18 + }] + + def _real_extract(self, url): + user_id = self._match_id(url) + + webpage = self._download_webpage( + url, user_id, headers=self.geo_verification_headers()) + + manifest_root = self._html_search_regex( + r'manifestUrlRoot=([^&\']+)', webpage, 'manifest', default=None) + + if not manifest_root: + ERRORS = ( + ("I'm offline, but let's stay connected", 'This user is currently offline'), + ('in a private show', 'This user is in a private show'), + ('is currently performing LIVE', 'This model is currently performing live'), + ) + for pattern, message in ERRORS: + if pattern in webpage: + error = message + expected = True + break + else: + error = 'Unable to find manifest URL root' + expected = False + raise ExtractorError(error, expected=expected) + + manifest = self._download_json( + '%s%s.json' % (manifest_root, user_id), user_id) + + formats = [] + for format_id, format_dict in manifest['formats'].items(): + if not isinstance(format_dict, dict): + continue + encodings = format_dict.get('encodings') + if not isinstance(encodings, list): + continue + vcodec = format_dict.get('videoCodec') + acodec = format_dict.get('audioCodec') + for media in encodings: + if not isinstance(media, dict): + continue + media_url = url_or_none(media.get('location')) + if not media_url: + continue + + format_id_list = [format_id] + height = int_or_none(media.get('videoHeight')) + if height is not None: + format_id_list.append('%dp' % height) + f = { + 'url': media_url, + 'format_id': '-'.join(format_id_list), + 'width': int_or_none(media.get('videoWidth')), + 'height': height, + 'vbr': int_or_none(media.get('videoKbps')), + 'abr': int_or_none(media.get('audioKbps')), + 'fps': int_or_none(media.get('fps')), + 'vcodec': vcodec, + 'acodec': acodec, + } + if 'rtmp' in format_id: + f['ext'] = 'flv' + elif 'hls' in format_id: + f.update({ + 'ext': 'mp4', + # hls skips fragments, preferring rtmp + 'preference': -1, + }) + else: + continue + formats.append(f) + self._sort_formats(formats) + + return { + 'id': user_id, + 'title': self._live_title(user_id), + 'is_live': True, + 'formats': formats, + 'age_limit': 18 + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/camtube.py youtube-dl-2019.05.20/youtube_dl/extractor/camtube.py --- youtube-dl-2016.02.22/youtube_dl/extractor/camtube.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/camtube.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,71 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + unified_timestamp, +) + + +class CamTubeIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www|api)\.)?camtube\.co/recordings?/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://camtube.co/recording/minafay-030618-1136-chaturbate-female', + 'info_dict': { + 'id': '42ad3956-dd5b-445a-8313-803ea6079fac', + 'display_id': 'minafay-030618-1136-chaturbate-female', + 'ext': 'mp4', + 'title': 'minafay-030618-1136-chaturbate-female', + 'duration': 1274, + 'timestamp': 1528018608, + 'upload_date': '20180603', + 'age_limit': 18 + }, + 'params': { + 'skip_download': True, + }, + }] + + _API_BASE = 'https://api.camtube.co' + + def _real_extract(self, url): + display_id = self._match_id(url) + + token = self._download_json( + '%s/rpc/session/new' % self._API_BASE, display_id, + 'Downloading session token')['token'] + + self._set_cookie('api.camtube.co', 'session', token) + + video = self._download_json( + '%s/recordings/%s' % (self._API_BASE, 
display_id), display_id, + headers={'Referer': url}) + + video_id = video['uuid'] + timestamp = unified_timestamp(video.get('createdAt')) + duration = int_or_none(video.get('duration')) + view_count = int_or_none(video.get('viewCount')) + like_count = int_or_none(video.get('likeCount')) + creator = video.get('stageName') + + formats = [{ + 'url': '%s/recordings/%s/manifest.m3u8' + % (self._API_BASE, video_id), + 'format_id': 'hls', + 'ext': 'mp4', + 'protocol': 'm3u8_native', + }] + + return { + 'id': video_id, + 'display_id': display_id, + 'title': display_id, + 'timestamp': timestamp, + 'duration': duration, + 'view_count': view_count, + 'like_count': like_count, + 'creator': creator, + 'formats': formats, + 'age_limit': 18 + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/camwithher.py youtube-dl-2019.05.20/youtube_dl/extractor/camwithher.py --- youtube-dl-2016.02.22/youtube_dl/extractor/camwithher.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/camwithher.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,89 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_duration, + unified_strdate, +) + + +class CamWithHerIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?camwithher\.tv/view_video\.php\?.*\bviewkey=(?P<id>\w+)' + + _TESTS = [{ + 'url': 'http://camwithher.tv/view_video.php?viewkey=6e9a24e2c0e842e1f177&page=&viewtype=&category=', + 'info_dict': { + 'id': '5644', + 'ext': 'flv', + 'title': 'Periscope Tease', + 'description': 'In the clouds teasing on periscope to my favorite song', + 'duration': 240, + 'view_count': int, + 'comment_count': int, + 'uploader': 'MileenaK', + 'upload_date': '20160322', + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + } + }, { + 'url': 'http://camwithher.tv/view_video.php?viewkey=6dfd8b7c97531a459937', + 'only_matching': True, + }, { + 'url': 'http://camwithher.tv/view_video.php?page=&viewkey=6e9a24e2c0e842e1f177&viewtype=&category=', + 'only_matching': True, + }, { + 'url': 'http://camwithher.tv/view_video.php?viewkey=b6c3b5bea9515d1a1fc4&page=&viewtype=&category=mv', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + flv_id = self._html_search_regex( + r'<a[^>]+href=["\']/download/\?v=(\d+)', webpage, 'video id') + + # Video URL construction algorithm is reverse-engineered from cwhplayer.swf + rtmp_url = 'rtmp://camwithher.tv/clipshare/%s' % ( + ('mp4:%s.mp4' % flv_id) if int(flv_id) > 2010 else flv_id) + + title = self._html_search_regex( + r'<div[^>]+style="float:left"[^>]*>\s*<h2>(.+?)</h2>', webpage, 'title') + description = self._html_search_regex( + r'>Description:</span>(.+?)</div>', webpage, 'description', default=None) + + runtime = self._search_regex( + r'Runtime\s*:\s*(.+?) 
\|', webpage, 'duration', default=None) + if runtime: + runtime = re.sub(r'[\s-]', '', runtime) + duration = parse_duration(runtime) + view_count = int_or_none(self._search_regex( + r'Views\s*:\s*(\d+)', webpage, 'view count', default=None)) + comment_count = int_or_none(self._search_regex( + r'Comments\s*:\s*(\d+)', webpage, 'comment count', default=None)) + + uploader = self._search_regex( + r'Added by\s*:\s*<a[^>]+>([^<]+)</a>', webpage, 'uploader', default=None) + upload_date = unified_strdate(self._search_regex( + r'Added on\s*:\s*([\d-]+)', webpage, 'upload date', default=None)) + + return { + 'id': flv_id, + 'url': rtmp_url, + 'ext': 'flv', + 'no_resume': True, + 'title': title, + 'description': description, + 'duration': duration, + 'view_count': view_count, + 'comment_count': comment_count, + 'uploader': uploader, + 'upload_date': upload_date, + 'age_limit': 18 + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/canalc2.py youtube-dl-2019.05.20/youtube_dl/extractor/canalc2.py --- youtube-dl-2016.02.22/youtube_dl/extractor/canalc2.py 2016-01-14 09:35:12.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/canalc2.py 2019-06-07 18:49:07.000000000 +0000 @@ -16,13 +16,10 @@ 'md5': '060158428b650f896c542dfbb3d6487f', 'info_dict': { 'id': '12163', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Terrasses du Numérique', 'duration': 122, }, - 'params': { - 'skip_download': True, # Requires rtmpdump - } }, { 'url': 'http://archives-canalc2.u-strasbg.fr/video.asp?idVideo=11427&voir=oui', 'only_matching': True, @@ -34,6 +31,10 @@ webpage = self._download_webpage( 'http://www.canalc2.tv/video/%s' % video_id, video_id) + title = self._html_search_regex( + r'(?s)class="[^"]*col_description[^"]*">.*?<h3>(.+?)</h3>', + webpage, 'title') + formats = [] for _, video_url in re.findall(r'file\s*=\s*(["\'])(.+?)\1', webpage): if video_url.startswith('rtmp://'): @@ -52,17 +53,21 @@ 'url': video_url, 'format_id': 'http', }) - self._sort_formats(formats) - title = self._html_search_regex( - r'(?s)class="[^"]*col_description[^"]*">.*?<h3>(.*?)</h3>', webpage, 'title') - duration = parse_duration(self._search_regex( - r'id=["\']video_duree["\'][^>]*>([^<]+)', - webpage, 'duration', fatal=False)) + if formats: + info = { + 'formats': formats, + } + else: + info = self._parse_html5_media_entries(url, webpage, url)[0] + + self._sort_formats(info['formats']) - return { + info.update({ 'id': video_id, 'title': title, - 'duration': duration, - 'formats': formats, - } + 'duration': parse_duration(self._search_regex( + r'id=["\']video_duree["\'][^>]*>([^<]+)', + webpage, 'duration', fatal=False)), + }) + return info diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/canalplus.py youtube-dl-2019.05.20/youtube_dl/extractor/canalplus.py --- youtube-dl-2016.02.22/youtube_dl/extractor/canalplus.py 2015-12-31 15:50:46.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/canalplus.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,86 +1,58 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( - ExtractorError, - HEADRequest, - unified_strdate, - url_basename, - qualities, + # ExtractorError, + # HEADRequest, int_or_none, + qualities, + unified_strdate, ) class CanalplusIE(InfoExtractor): - IE_DESC = 'canalplus.fr, piwiplus.fr and d8.tv' - _VALID_URL = r'https?://(?:www\.(?P<site>canalplus\.fr|piwiplus\.fr|d8\.tv|itele\.fr)/.*?/(?P<path>.*)|player\.canalplus\.fr/#/(?P<id>[0-9]+))' + IE_DESC = 'mycanal.fr and 
piwiplus.fr' + _VALID_URL = r'https?://(?:www\.)?(?P<site>mycanal|piwiplus)\.fr/(?:[^/]+/)*(?P<display_id>[^?/]+)(?:\.html\?.*\bvid=|/p/)(?P<id>\d+)' _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/%s/%s?format=json' _SITE_ID_MAP = { - 'canalplus.fr': 'cplus', - 'piwiplus.fr': 'teletoon', - 'd8.tv': 'd8', - 'itele.fr': 'itele', + 'mycanal': 'cplus', + 'piwiplus': 'teletoon', } + # Only works for direct mp4 URLs + _GEO_COUNTRIES = ['FR'] + _TESTS = [{ - 'url': 'http://www.canalplus.fr/c-emissions/pid1830-c-zapping.html?vid=1263092', - 'md5': '12164a6f14ff6df8bd628e8ba9b10b78', + 'url': 'https://www.mycanal.fr/d17-emissions/lolywood/p/1397061', 'info_dict': { - 'id': '1263092', + 'id': '1397061', + 'display_id': 'lolywood', 'ext': 'mp4', - 'title': 'Le Zapping - 13/05/15', - 'description': 'md5:09738c0d06be4b5d06a0940edb0da73f', - 'upload_date': '20150513', + 'title': 'Euro 2016 : Je préfère te prévenir - Lolywood - Episode 34', + 'description': 'md5:7d97039d455cb29cdba0d652a0efaa5e', + 'upload_date': '20160602', }, }, { + # geo restricted, bypassed 'url': 'http://www.piwiplus.fr/videos-piwi/pid1405-le-labyrinthe-boing-super-ranger.html?vid=1108190', 'info_dict': { 'id': '1108190', - 'ext': 'flv', - 'title': 'Le labyrinthe - Boing super ranger', + 'display_id': 'pid1405-le-labyrinthe-boing-super-ranger', + 'ext': 'mp4', + 'title': 'BOING SUPER RANGER - Ep : Le labyrinthe', 'description': 'md5:4cea7a37153be42c1ba2c1d3064376ff', 'upload_date': '20140724', }, - 'skip': 'Only works from France', - }, { - 'url': 'http://www.d8.tv/d8-docs-mags/pid6589-d8-campagne-intime.html', - 'info_dict': { - 'id': '966289', - 'ext': 'flv', - 'title': 'Campagne intime - Documentaire exceptionnel', - 'description': 'md5:d2643b799fb190846ae09c61e59a859f', - 'upload_date': '20131108', - }, - 'skip': 'videos get deleted after a while', - }, { - 'url': 'http://www.itele.fr/france/video/aubervilliers-un-lycee-en-colere-111559', - 'md5': '38b8f7934def74f0d6f3ba6c036a5f82', - 'info_dict': { - 'id': '1213714', - 'ext': 'mp4', - 'title': 'Aubervilliers : un lycée en colère - Le 11/02/2015 à 06h45', - 'description': 'md5:8216206ec53426ea6321321f3b3c16db', - 'upload_date': '20150211', - }, + 'expected_warnings': ['HTTP Error 403: Forbidden'], }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.groupdict().get('id') - - site_id = self._SITE_ID_MAP[mobj.group('site') or 'canal'] - - # Beware, some subclasses do not define an id group - display_id = url_basename(mobj.group('path')) + site, display_id, video_id = re.match(self._VALID_URL, url).groups() - if video_id is None: - webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - [r'<canal:player[^>]+?videoId=(["\'])(?P<id>\d+)', r'id=["\']canal_video_player(?P<id>\d+)'], - webpage, 'video id', group='id') + site_id = self._SITE_ID_MAP[site] info_url = self._VIDEO_INFO_TEMPLATE % (site_id, video_id) video_data = self._download_json(info_url, video_id, 'Downloading video JSON') @@ -92,15 +64,15 @@ preference = qualities(['MOBILE', 'BAS_DEBIT', 'HAUT_DEBIT', 'HD']) - fmt_url = next(iter(media.get('VIDEOS'))) - if '/geo' in fmt_url.lower(): - response = self._request_webpage( - HEADRequest(fmt_url), video_id, - 'Checking if the video is georestricted') - if '/blocage' in response.geturl(): - raise ExtractorError( - 'The video is not available in your country', - expected=True) + # _, fmt_url = next(iter(media['VIDEOS'].items())) + # if '/geo' in fmt_url.lower(): + # response = 
self._request_webpage( + # HEADRequest(fmt_url), video_id, + # 'Checking if the video is georestricted') + # if '/blocage' in response.geturl(): + # raise ExtractorError( + # 'The video is not available in your country', + # expected=True) formats = [] for format_id, format_url in media['VIDEOS'].items(): @@ -114,7 +86,7 @@ format_url + '?hdcore=2.11.3', video_id, f4m_id=format_id, fatal=False)) else: formats.append({ - # the secret extracted ya function in http://player.canalplus.fr/common/js/canalPlayer.js + # the secret extracted from ya function in http://player.canalplus.fr/common/js/canalPlayer.js 'url': format_url + '?secret=pqzerjlsmdkjfoiuerhsdlfknaes', 'format_id': format_id, 'preference': preference(format_id), diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/canvas.py youtube-dl-2019.05.20/youtube_dl/extractor/canvas.py --- youtube-dl-2016.02.22/youtube_dl/extractor/canvas.py 2016-02-20 13:02:05.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/canvas.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,23 +1,117 @@ from __future__ import unicode_literals +import re +import json + from .common import InfoExtractor -from ..utils import float_or_none +from .gigya import GigyaBaseIE +from ..compat import compat_HTTPError +from ..utils import ( + ExtractorError, + strip_or_none, + float_or_none, + int_or_none, + merge_dicts, + parse_iso8601, +) class CanvasIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?canvas\.be/video/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza)/assets/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', + 'md5': '90139b746a0a9bd7bb631283f6e2a64e', + 'info_dict': { + 'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', + 'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', + 'ext': 'flv', + 'title': 'Nachtwacht: De Greystook', + 'description': 'md5:1db3f5dc4c7109c821261e7512975be7', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 1468.03, + }, + 'expected_warnings': ['is not a supported codec', 'Unknown MIME type'], + }, { + 'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e', + 'only_matching': True, + }] + _HLS_ENTRY_PROTOCOLS_MAP = { + 'HLS': 'm3u8_native', + 'HLS_AES': 'm3u8', + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + site_id, video_id = mobj.group('site_id'), mobj.group('id') + + data = self._download_json( + 'https://mediazone.vrt.be/api/v1/%s/assets/%s' + % (site_id, video_id), video_id) + + title = data['title'] + description = data.get('description') + + formats = [] + for target in data['targetUrls']: + format_url, format_type = target.get('url'), target.get('type') + if not format_url or not format_type: + continue + if format_type in self._HLS_ENTRY_PROTOCOLS_MAP: + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', self._HLS_ENTRY_PROTOCOLS_MAP[format_type], + m3u8_id=format_type, fatal=False)) + elif format_type == 'HDS': + formats.extend(self._extract_f4m_formats( + format_url, video_id, f4m_id=format_type, fatal=False)) + elif format_type == 'MPEG_DASH': + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id=format_type, fatal=False)) + elif format_type == 'HSS': + formats.extend(self._extract_ism_formats( + format_url, video_id, ism_id='mss', fatal=False)) + else: + formats.append({ + 'format_id': format_type, + 'url': 
format_url, + }) + self._sort_formats(formats) + + subtitles = {} + subtitle_urls = data.get('subtitleUrls') + if isinstance(subtitle_urls, list): + for subtitle in subtitle_urls: + subtitle_url = subtitle.get('url') + if subtitle_url and subtitle.get('type') == 'CLOSED': + subtitles.setdefault('nl', []).append({'url': subtitle_url}) + + return { + 'id': video_id, + 'display_id': video_id, + 'title': title, + 'description': description, + 'formats': formats, + 'duration': float_or_none(data.get('duration'), 1000), + 'thumbnail': data.get('posterImageUrl'), + 'subtitles': subtitles, + } + + +class CanvasEenIE(InfoExtractor): + IE_DESC = 'canvas.be and een.be' + _VALID_URL = r'https?://(?:www\.)?(?P<site_id>canvas|een)\.be/(?:[^/]+/)*(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'http://www.canvas.be/video/de-afspraak/najaar-2015/de-afspraak-veilt-voor-de-warmste-week', - 'md5': 'ea838375a547ac787d4064d8c7860a6c', + 'md5': 'ed66976748d12350b118455979cca293', 'info_dict': { 'id': 'mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e', 'display_id': 'de-afspraak-veilt-voor-de-warmste-week', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'De afspraak veilt voor de Warmste Week', 'description': 'md5:24cb860c320dc2be7358e0e5aa317ba6', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 49.02, - } + }, + 'expected_warnings': ['is not a supported codec'], }, { # with subtitles 'url': 'http://www.canvas.be/video/panorama/2016/pieter-0167', @@ -27,7 +121,7 @@ 'ext': 'mp4', 'title': 'Pieter 0167', 'description': 'md5:943cd30f48a5d29ba02c3a104dc4ec4e', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 2553.08, 'subtitles': { 'nl': [{ @@ -37,58 +131,189 @@ }, 'params': { 'skip_download': True, - } + }, + 'skip': 'Pagina niet gevonden', + }, { + 'url': 'https://www.een.be/sorry-voor-alles/herbekijk-sorry-voor-alles', + 'info_dict': { + 'id': 'mz-ast-11a587f8-b921-4266-82e2-0bce3e80d07f', + 'display_id': 'herbekijk-sorry-voor-alles', + 'ext': 'mp4', + 'title': 'Herbekijk Sorry voor alles', + 'description': 'md5:8bb2805df8164e5eb95d6a7a29dc0dd3', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 3788.06, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Episode no longer available', + }, { + 'url': 'https://www.canvas.be/check-point/najaar-2016/de-politie-uw-vriend', + 'only_matching': True, }] def _real_extract(self, url): - display_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + site_id, display_id = mobj.group('site_id'), mobj.group('id') webpage = self._download_webpage(url, display_id) - title = self._search_regex( + title = strip_or_none(self._search_regex( r'<h1[^>]+class="video__body__header__title"[^>]*>(.+?)</h1>', - webpage, 'title', default=None) or self._og_search_title(webpage) + webpage, 'title', default=None) or self._og_search_title( + webpage, default=None)) video_id = self._html_search_regex( - r'data-video=(["\'])(?P<id>.+?)\1', webpage, 'video id', group='id') + r'data-video=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id', + group='id') - data = self._download_json( - 'https://mediazone.vrt.be/api/v1/canvas/assets/%s' % video_id, display_id) + return { + '_type': 'url_transparent', + 'url': 'https://mediazone.vrt.be/api/v1/%s/assets/%s' % (site_id, video_id), + 'ie_key': CanvasIE.ie_key(), + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': self._og_search_description(webpage), + } - formats = [] - for target in data['targetUrls']: - format_url, format_type = 
target.get('url'), target.get('type') - if not format_url or not format_type: - continue - if format_type == 'HLS': - formats.extend(self._extract_m3u8_formats( - format_url, display_id, entry_protocol='m3u8_native', - ext='mp4', preference=0, fatal=False, m3u8_id=format_type)) - elif format_type == 'HDS': - formats.extend(self._extract_f4m_formats( - format_url, display_id, f4m_id=format_type, fatal=False)) + +class VrtNUIE(GigyaBaseIE): + IE_DESC = 'VrtNU.be' + _VALID_URL = r'https?://(?:www\.)?vrt\.be/(?P<site_id>vrtnu)/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1/postbus-x-s1a1/', + 'info_dict': { + 'id': 'pbs-pub-2e2d8c27-df26-45c9-9dc6-90c78153044d$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de', + 'ext': 'flv', + 'title': 'De zwarte weduwe', + 'description': 'md5:d90c21dced7db869a85db89a623998d4', + 'duration': 1457.04, + 'thumbnail': r're:^https?://.*\.jpg$', + 'season': '1', + 'season_number': 1, + 'episode_number': 1, + }, + 'skip': 'This video is only available for registered users' + }] + _NETRC_MACHINE = 'vrtnu' + _APIKEY = '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy' + _CONTEXT_ID = 'R3595707040' + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + auth_data = { + 'APIKey': self._APIKEY, + 'targetEnv': 'jssdk', + 'loginID': username, + 'password': password, + 'authMode': 'cookie', + } + + auth_info = self._gigya_login(auth_data) + + # Sometimes authentication fails for no good reason, retry + login_attempt = 1 + while login_attempt <= 3: + try: + # When requesting a token, no actual token is returned, but the + # necessary cookies are set. + self._request_webpage( + 'https://token.vrt.be', + None, note='Requesting a token', errnote='Could not get a token', + headers={ + 'Content-Type': 'application/json', + 'Referer': 'https://www.vrt.be/vrtnu/', + }, + data=json.dumps({ + 'uid': auth_info['UID'], + 'uidsig': auth_info['UIDSignature'], + 'ts': auth_info['signatureTimestamp'], + 'email': auth_info['profile']['email'], + }).encode('utf-8')) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + login_attempt += 1 + self.report_warning('Authentication failed') + self._sleep(1, None, msg_template='Waiting for %(timeout)s seconds before trying again') + else: + raise e else: - formats.append({ - 'format_id': format_type, - 'url': format_url, - }) - self._sort_formats(formats) + break - subtitles = {} - subtitle_urls = data.get('subtitleUrls') - if isinstance(subtitle_urls, list): - for subtitle in subtitle_urls: - subtitle_url = subtitle.get('url') - if subtitle_url and subtitle.get('type') == 'CLOSED': - subtitles.setdefault('nl', []).append({'url': subtitle_url}) + def _real_extract(self, url): + display_id = self._match_id(url) - return { + webpage, urlh = self._download_webpage_handle(url, display_id) + + info = self._search_json_ld(webpage, display_id, default={}) + + # title is optional here since it may be extracted by extractor + # that is delegated from here + title = strip_or_none(self._html_search_regex( + r'(?ms)<h1 class="content__heading">(.+?)</h1>', + webpage, 'title', default=None)) + + description = self._html_search_regex( + r'(?ms)<div class="content__description">(.+?)</div>', + webpage, 'description', default=None) + + season = self._html_search_regex( + [r'''(?xms)<div\ class="tabs__tab\ tabs__tab--active">\s* + <span>seizoen\ (.+?)</span>\s* + 
</div>''', + r'<option value="seizoen (\d{1,3})" data-href="[^"]+?" selected>'], + webpage, 'season', default=None) + + season_number = int_or_none(season) + + episode_number = int_or_none(self._html_search_regex( + r'''(?xms)<div\ class="content__episode">\s* + <abbr\ title="aflevering">afl</abbr>\s*<span>(\d+)</span> + </div>''', + webpage, 'episode_number', default=None)) + + release_date = parse_iso8601(self._html_search_regex( + r'(?ms)<div class="content__broadcastdate">\s*<time\ datetime="(.+?)"', + webpage, 'release_date', default=None)) + + # If there's a ? or a # in the URL, remove them and everything after + clean_url = urlh.geturl().split('?')[0].split('#')[0].strip('/') + securevideo_url = clean_url + '.mssecurevideo.json' + + try: + video = self._download_json(securevideo_url, display_id) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + self.raise_login_required() + raise + + # We are dealing with a '../<show>.relevant' URL + redirect_url = video.get('url') + if redirect_url: + return self.url_result(self._proto_relative_url(redirect_url, 'https:')) + + # There is only one entry, but with an unknown key, so just get + # the first one + video_id = list(video.values())[0].get('videoid') + + return merge_dicts(info, { + '_type': 'url_transparent', + 'url': 'https://mediazone.vrt.be/api/v1/vrtvideo/assets/%s' % video_id, + 'ie_key': CanvasIE.ie_key(), 'id': video_id, 'display_id': display_id, 'title': title, - 'description': self._og_search_description(webpage), - 'formats': formats, - 'duration': float_or_none(data.get('duration'), 1000), - 'thumbnail': data.get('posterImageUrl'), - 'subtitles': subtitles, - } + 'description': description, + 'season': season, + 'season_number': season_number, + 'episode_number': episode_number, + 'release_date': release_date, + }) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/carambatv.py youtube-dl-2019.05.20/youtube_dl/extractor/carambatv.py --- youtube-dl-2016.02.22/youtube_dl/extractor/carambatv.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/carambatv.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,108 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + float_or_none, + int_or_none, + try_get, +) + +from .videomore import VideomoreIE + + +class CarambaTVIE(InfoExtractor): + _VALID_URL = r'(?:carambatv:|https?://video1\.carambatv\.ru/v/)(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://video1.carambatv.ru/v/191910501', + 'md5': '2f4a81b7cfd5ab866ee2d7270cb34a2a', + 'info_dict': { + 'id': '191910501', + 'ext': 'mp4', + 'title': '[BadComedian] - Разборка в Маниле (Абсолютный обзор)', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 2678.31, + }, + }, { + 'url': 'carambatv:191910501', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + video = self._download_json( + 'http://video1.carambatv.ru/v/%s/videoinfo.js' % video_id, + video_id) + + title = video['title'] + + base_url = video.get('video') or 'http://video1.carambatv.ru/v/%s/' % video_id + + formats = [{ + 'url': base_url + f['fn'], + 'height': int_or_none(f.get('height')), + 'format_id': '%sp' % f['height'] if f.get('height') else None, + } for f in video['qualities'] if f.get('fn')] + self._sort_formats(formats) + + thumbnail = video.get('splash') + duration = float_or_none(try_get( + video, lambda x: x['annotations'][0]['end_time'], 
compat_str)) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + } + + +class CarambaTVPageIE(InfoExtractor): + _VALID_URL = r'https?://carambatv\.ru/(?:[^/]+/)+(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'http://carambatv.ru/movie/bad-comedian/razborka-v-manile/', + 'md5': 'a49fb0ec2ad66503eeb46aac237d3c86', + 'info_dict': { + 'id': '475222', + 'ext': 'flv', + 'title': '[BadComedian] - Разборка в Маниле (Абсолютный обзор)', + 'thumbnail': r're:^https?://.*\.jpg', + # duration reported by videomore is incorrect + 'duration': int, + }, + 'add_ie': [VideomoreIE.ie_key()], + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + videomore_url = VideomoreIE._extract_url(webpage) + if not videomore_url: + videomore_id = self._search_regex( + r'getVMCode\s*\(\s*["\']?(\d+)', webpage, 'videomore id', + default=None) + if videomore_id: + videomore_url = 'videomore:%s' % videomore_id + if videomore_url: + title = self._og_search_title(webpage) + return { + '_type': 'url_transparent', + 'url': videomore_url, + 'ie_key': VideomoreIE.ie_key(), + 'title': title, + } + + video_url = self._og_search_property('video:iframe', webpage, default=None) + + if not video_url: + video_id = self._search_regex( + r'(?:video_id|crmb_vuid)\s*[:=]\s*["\']?(\d+)', + webpage, 'video id') + video_url = 'carambatv:%s' % video_id + + return self.url_result(video_url, CarambaTVIE.ie_key()) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/cartoonnetwork.py youtube-dl-2019.05.20/youtube_dl/extractor/cartoonnetwork.py --- youtube-dl-2016.02.22/youtube_dl/extractor/cartoonnetwork.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/cartoonnetwork.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,62 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .turner import TurnerBaseIE +from ..utils import int_or_none + + +class CartoonNetworkIE(TurnerBaseIE): + _VALID_URL = r'https?://(?:www\.)?cartoonnetwork\.com/video/(?:[^/]+/)+(?P<id>[^/?#]+)-(?:clip|episode)\.html' + _TEST = { + 'url': 'https://www.cartoonnetwork.com/video/ben-10/how-to-draw-upgrade-episode.html', + 'info_dict': { + 'id': '6e3375097f63874ebccec7ef677c1c3845fa850e', + 'ext': 'mp4', + 'title': 'How to Draw Upgrade', + 'description': 'md5:2061d83776db7e8be4879684eefe8c0f', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + def find_field(global_re, name, content_re=None, value_re='[^"]+', fatal=False): + metadata_re = '' + if content_re: + metadata_re = r'|video_metadata\.content_' + content_re + return self._search_regex( + r'(?:_cnglobal\.currentVideo\.%s%s)\s*=\s*"(%s)";' % (global_re, metadata_re, value_re), + webpage, name, fatal=fatal) + + media_id = find_field('mediaId', 'media id', 'id', '[0-9a-f]{40}', True) + title = find_field('episodeTitle', 'title', '(?:episodeName|name)', fatal=True) + + info = self._extract_ngtv_info( + media_id, {'networkId': 'cartoonnetwork'}, { + 'url': url, + 'site_name': 'CartoonNetwork', + 'auth_required': find_field('authType', 'auth type') != 'unauth', + }) + + series = find_field( + 'propertyName', 'series', 'showName') or self._html_search_meta('partOfSeries', webpage) + info.update({ + 'id': media_id, + 'display_id': display_id, + 'title': title, + 'description': 
self._html_search_meta('description', webpage), + 'series': series, + 'episode': title, + }) + + for field in ('season', 'episode'): + field_name = field + 'Number' + info[field + '_number'] = int_or_none(find_field( + field_name, field + ' number', value_re=r'\d+') or self._html_search_meta(field_name, webpage)) + + return info diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/cbc.py youtube-dl-2019.05.20/youtube_dl/extractor/cbc.py --- youtube-dl-2016.02.22/youtube_dl/extractor/cbc.py 2016-02-12 20:57:05.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/cbc.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,113 +1,457 @@ # coding: utf-8 from __future__ import unicode_literals +import json import re from .common import InfoExtractor -from ..utils import js_to_json +from ..compat import ( + compat_str, + compat_HTTPError, +) +from ..utils import ( + js_to_json, + smuggle_url, + try_get, + xpath_text, + xpath_element, + xpath_with_ns, + find_xpath_attr, + orderedSet, + parse_duration, + parse_iso8601, + parse_age_limit, + strip_or_none, + int_or_none, + ExtractorError, +) class CBCIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?cbc\.ca/(?:[^/]+/)+(?P<id>[^/?#]+)' + IE_NAME = 'cbc.ca' + _VALID_URL = r'https?://(?:www\.)?cbc\.ca/(?!player/)(?:[^/]+/)+(?P<id>[^/?#]+)' _TESTS = [{ # with mediaId 'url': 'http://www.cbc.ca/22minutes/videos/clips-season-23/don-cherry-play-offs', + 'md5': '97e24d09672fc4cf56256d6faa6c25bc', 'info_dict': { 'id': '2682904050', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Don Cherry – All-Stars', 'description': 'Don Cherry has a bee in his bonnet about AHL player John Scott because that guy’s got heart.', - 'timestamp': 1454475540, + 'timestamp': 1454463000, 'upload_date': '20160203', + 'uploader': 'CBCC-NEW', }, - 'params': { - # rtmp download - 'skip_download': True, + 'skip': 'Geo-restricted to Canada', + }, { + # with clipId, feed available via tpfeed.cbc.ca and feed.theplatform.com + 'url': 'http://www.cbc.ca/22minutes/videos/22-minutes-update/22-minutes-update-episode-4', + 'md5': '162adfa070274b144f4fdc3c3b8207db', + 'info_dict': { + 'id': '2414435309', + 'ext': 'mp4', + 'title': '22 Minutes Update: What Not To Wear Quebec', + 'description': "This week's latest Canadian top political story is What Not To Wear Quebec.", + 'upload_date': '20131025', + 'uploader': 'CBCC-NEW', + 'timestamp': 1382717907, }, }, { - # with clipId + # with clipId, feed only available via tpfeed.cbc.ca 'url': 'http://www.cbc.ca/archives/entry/1978-robin-williams-freestyles-on-90-minutes-live', + 'md5': '0274a90b51a9b4971fe005c63f592f12', 'info_dict': { 'id': '2487345465', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Robin Williams freestyles on 90 Minutes Live', 'description': 'Wacky American comedian Robin Williams shows off his infamous "freestyle" comedic talents while being interviewed on CBC\'s 90 Minutes Live.', - 'upload_date': '19700101', - }, - 'params': { - # rtmp download - 'skip_download': True, + 'upload_date': '19780210', + 'uploader': 'CBCC-NEW', + 'timestamp': 255977160, }, }, { # multiple iframes 'url': 'http://www.cbc.ca/natureofthings/blog/birds-eye-view-from-vancouvers-burrard-street-bridge-how-we-got-the-shot', 'playlist': [{ + 'md5': '377572d0b49c4ce0c9ad77470e0b96b4', 'info_dict': { 'id': '2680832926', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'An Eagle\'s-Eye View Off Burrard Bridge', 'description': 'Hercules the eagle flies from Vancouver\'s Burrard Bridge down to a nearby park with a mini-camera strapped to his back.', - 'upload_date': '19700101', + 
'upload_date': '20160201', + 'timestamp': 1454342820, + 'uploader': 'CBCC-NEW', }, }, { + 'md5': '415a0e3f586113894174dfb31aa5bb1a', 'info_dict': { 'id': '2658915080', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Fly like an eagle!', 'description': 'Eagle equipped with a mini camera flies from the world\'s tallest tower', - 'upload_date': '19700101', + 'upload_date': '20150315', + 'timestamp': 1426443984, + 'uploader': 'CBCC-NEW', }, }], - 'params': { - # rtmp download - 'skip_download': True, + 'skip': 'Geo-restricted to Canada', + }, { + # multiple CBC.APP.Caffeine.initInstance(...) + 'url': 'http://www.cbc.ca/news/canada/calgary/dog-indoor-exercise-winter-1.3928238', + 'info_dict': { + 'title': 'Keep Rover active during the deep freeze with doggie pushups and other fun indoor tasks', + 'id': 'dog-indoor-exercise-winter-1.3928238', + 'description': 'md5:c18552e41726ee95bd75210d1ca9194c', }, + 'playlist_mincount': 6, }] @classmethod def suitable(cls, url): return False if CBCPlayerIE.suitable(url) else super(CBCIE, cls).suitable(url) - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - player_init = self._search_regex( - r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage, 'player init', - default=None) - if player_init: - player_info = self._parse_json(player_init, display_id, js_to_json) - media_id = player_info.get('mediaId') + def _extract_player_init(self, player_init, display_id): + player_info = self._parse_json(player_init, display_id, js_to_json) + media_id = player_info.get('mediaId') + if not media_id: + clip_id = player_info['clipId'] + feed = self._download_json( + 'http://tpfeed.cbc.ca/f/ExhSPC/vms_5akSXx4Ng_Zn?byCustomValue={:mpsReleases}{%s}' % clip_id, + clip_id, fatal=False) + if feed: + media_id = try_get(feed, lambda x: x['entries'][0]['guid'], compat_str) if not media_id: - clip_id = player_info['clipId'] media_id = self._download_json( 'http://feed.theplatform.com/f/h9dtGB/punlNGjMlc1F?fields=id&byContent=byReleases%3DbyId%253D' + clip_id, clip_id)['entries'][0]['id'].split('/')[-1] - return self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) - else: - entries = [self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) for media_id in re.findall(r'<iframe[^>]+src="[^"]+?mediaId=(\d+)"', webpage)] - return self.playlist_result(entries) + return self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + title = self._og_search_title(webpage, default=None) or self._html_search_meta( + 'twitter:title', webpage, 'title', default=None) or self._html_search_regex( + r'<title>([^<]+)</title>', webpage, 'title', fatal=False) + entries = [ + self._extract_player_init(player_init, display_id) + for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)] + media_ids = [] + for media_id_re in ( + r'<iframe[^>]+src="[^"]+?mediaId=(\d+)"', + r'<div[^>]+\bid=["\']player-(\d+)', + r'guid["\']\s*:\s*["\'](\d+)'): + media_ids.extend(re.findall(media_id_re, webpage)) + entries.extend([ + self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) + for media_id in orderedSet(media_ids)]) + return self.playlist_result( + entries, display_id, strip_or_none(title), + self._og_search_description(webpage))
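The playlist branch of _real_extract above sweeps the page with three regular expressions and de-duplicates the hits while preserving order; a minimal self-contained sketch of that scan (the sample HTML and the scrape_media_ids helper are illustrative, not part of the extractor):

import re

def scrape_media_ids(webpage):
    # Same three patterns the extractor tries: embedded iframes,
    # player <div> ids and bare "guid" JSON fields.
    patterns = (
        r'<iframe[^>]+src="[^"]+?mediaId=(\d+)"',
        r'<div[^>]+\bid=["\']player-(\d+)',
        r'guid["\']\s*:\s*["\'](\d+)',
    )
    media_ids = []
    for pattern in patterns:
        media_ids.extend(re.findall(pattern, webpage))
    # orderedSet equivalent: drop duplicates, keep first-seen order
    seen = set()
    return [i for i in media_ids if not (i in seen or seen.add(i))]

html = '<iframe src="//www.cbc.ca/i/caffeine/syndicate/?mediaId=2682904050"></iframe>'
print(scrape_media_ids(html))  # ['2682904050']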
class CBCPlayerIE(InfoExtractor): + IE_NAME = 'cbc.ca:player' _VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/|i/caffeine/syndicate/\?mediaId=))(?P<id>\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.cbc.ca/player/play/2683190193', + 'md5': '64d25f841ddf4ddb28a235338af32e2c', 'info_dict': { 'id': '2683190193', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Gerry Runs a Sweat Shop', 'description': 'md5:b457e1c01e8ff408d9d801c1c2cd29b0', - 'timestamp': 1455067800, + 'timestamp': 1455071400, 'upload_date': '20160210', + 'uploader': 'CBCC-NEW', + }, + 'skip': 'Geo-restricted to Canada', + }, { + # Redirected from http://www.cbc.ca/player/AudioMobile/All%20in%20a%20Weekend%20Montreal/ID/2657632011/ + 'url': 'http://www.cbc.ca/player/play/2657631896', + 'md5': 'e5e708c34ae6fca156aafe17c43e8b75', + 'info_dict': { + 'id': '2657631896', + 'ext': 'mp3', + 'title': 'CBC Montreal is organizing its first ever community hackathon!', + 'description': 'The modern technology we tend to depend on so heavily, is never without it\'s share of hiccups and headaches. Next weekend - CBC Montreal will be getting members of the public for its first Hackathon.', + 'timestamp': 1425704400, + 'upload_date': '20150307', + 'uploader': 'CBCC-NEW', + }, + }, { + 'url': 'http://www.cbc.ca/player/play/2164402062', + 'md5': '33fcd8f6719b9dd60a5e73adcb83b9f6', + 'info_dict': { + 'id': '2164402062', + 'ext': 'mp4', + 'title': 'Cancer survivor four times over', + 'description': 'Tim Mayer has beaten three different forms of cancer four times in five years.', + 'timestamp': 1320410746, + 'upload_date': '20111104', + 'uploader': 'CBCC-NEW', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return { + '_type': 'url_transparent', + 'ie_key': 'ThePlatform', + 'url': smuggle_url( + 'http://link.theplatform.com/s/ExhSPC/media/guid/2655402169/%s?mbr=true&formats=MPEG4,FLV,MP3' % video_id, { + 'force_smil_url': True + }), + 'id': video_id, + } + + +class CBCWatchBaseIE(InfoExtractor): + _device_id = None + _device_token = None + _API_BASE_URL = 'https://api-cbc.cloud.clearleap.com/cloffice/client/' + _NS_MAP = { + 'media': 'http://search.yahoo.com/mrss/', + 'clearleap': 'http://www.clearleap.com/namespace/clearleap/1.0/', + } + _GEO_COUNTRIES = ['CA'] + + def _call_api(self, path, video_id): + url = path if path.startswith('http') else self._API_BASE_URL + path + for _ in range(2): + try: + result = self._download_xml(url, video_id, headers={ + 'X-Clearleap-DeviceId': self._device_id, + 'X-Clearleap-DeviceToken': self._device_token, + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + # Device token has expired, re-acquiring device token + self._register_device() + continue + raise + error_message = xpath_text(result, 'userMessage') or xpath_text(result, 'systemMessage') + if error_message: + raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message)) + return result + + def _real_initialize(self): + if self._valid_device_token(): + return + device = self._downloader.cache.load('cbcwatch', 'device') or {} + self._device_id, self._device_token = device.get('id'), device.get('token') + if self._valid_device_token(): + return + self._register_device() + + def _valid_device_token(self): + return self._device_id and self._device_token + + def _register_device(self): + self._device_id = self._device_token = None + result = self._download_xml( + self._API_BASE_URL + 'device/register', + None, 'Acquiring device token', + data=b'web') + self._device_id = xpath_text(result, 'deviceId', fatal=True) + self._device_token = xpath_text(result, 'deviceToken', fatal=True) + self._downloader.cache.store( + 'cbcwatch', 'device', { + 'id': self._device_id, + 'token': self._device_token, + })
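_register_device above is a register-once flow: POST the literal body 'web', remember the returned deviceId/deviceToken, and only re-register when a later request fails with 401. A rough standalone sketch of the same handshake, assuming the Clearleap endpoint behaves as the extractor expects (no on-disk caching or 401 retry shown):

import xml.etree.ElementTree as ET
from urllib.request import Request, urlopen

API = 'https://api-cbc.cloud.clearleap.com/cloffice/client/'
_device = {}  # stands in for youtube-dl's persistent cache

def register_device():
    # POST the body b'web' and read deviceId/deviceToken from the XML reply
    xml = urlopen(Request(API + 'device/register', data=b'web')).read()
    doc = ET.fromstring(xml)
    _device['id'] = doc.findtext('deviceId')
    _device['token'] = doc.findtext('deviceToken')

def call_api(path):
    # Every later call replays the two credentials as request headers
    if not _device:
        register_device()
    req = Request(API + path, headers={
        'X-Clearleap-DeviceId': _device['id'],
        'X-Clearleap-DeviceToken': _device['token'],
    })
    return urlopen(req).read()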
+ + def _parse_rss_feed(self, rss): + channel = xpath_element(rss, 'channel', fatal=True) + + def _add_ns(path): + return xpath_with_ns(path, self._NS_MAP) + + entries = [] + for item in channel.findall('item'): + guid = xpath_text(item, 'guid', fatal=True) + title = xpath_text(item, 'title', fatal=True) + + media_group = xpath_element(item, _add_ns('media:group'), fatal=True) + content = xpath_element(media_group, _add_ns('media:content'), fatal=True) + content_url = content.attrib['url'] + + thumbnails = [] + for thumbnail in media_group.findall(_add_ns('media:thumbnail')): + thumbnail_url = thumbnail.get('url') + if not thumbnail_url: + continue + thumbnails.append({ + 'id': thumbnail.get('profile'), + 'url': thumbnail_url, + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + + timestamp = None + release_date = find_xpath_attr( + item, _add_ns('media:credit'), 'role', 'releaseDate') + if release_date is not None: + timestamp = parse_iso8601(release_date.text) + + entries.append({ + '_type': 'url_transparent', + 'url': content_url, + 'id': guid, + 'title': title, + 'description': xpath_text(item, 'description'), + 'timestamp': timestamp, + 'duration': int_or_none(content.get('duration')), + 'age_limit': parse_age_limit(xpath_text(item, _add_ns('media:rating'))), + 'episode': xpath_text(item, _add_ns('clearleap:episode')), + 'episode_number': int_or_none(xpath_text(item, _add_ns('clearleap:episodeInSeason'))), + 'series': xpath_text(item, _add_ns('clearleap:series')), + 'season_number': int_or_none(xpath_text(item, _add_ns('clearleap:season'))), + 'thumbnails': thumbnails, + 'ie_key': 'CBCWatchVideo', + }) + + return self.playlist_result( + entries, xpath_text(channel, 'guid'), + xpath_text(channel, 'title'), + xpath_text(channel, 'description')) + + +class CBCWatchVideoIE(CBCWatchBaseIE): + IE_NAME = 'cbc.ca:watch:video' + _VALID_URL = r'https?://api-cbc\.cloud\.clearleap\.com/cloffice/client/web/play/?\?.*?\bcontentId=(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _TEST = { + # geo-restricted to Canada, bypassable + 'url': 'https://api-cbc.cloud.clearleap.com/cloffice/client/web/play/?contentId=3c84472a-1eea-4dee-9267-2655d5055dcf&categoryId=ebc258f5-ee40-4cca-b66b-ba6bd55b7235', + 'only_matching': True, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + result = self._call_api(url, video_id) + + m3u8_url = xpath_text(result, 'url', fatal=True) + formats = self._extract_m3u8_formats(re.sub(r'/([^/]+)/[^/?]+\.m3u8', r'/\1/\1.m3u8', m3u8_url), video_id, 'mp4', fatal=False) + if len(formats) < 2: + formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4') + for f in formats: + format_id = f.get('format_id') + if format_id.startswith('AAC'): + f['acodec'] = 'aac' + elif format_id.startswith('AC3'): + f['acodec'] = 'ac-3' + self._sort_formats(formats) + + info = { + 'id': video_id, + 'title': video_id, + 'formats': formats, + } + + rss = xpath_element(result, 'rss') + if rss: + info.update(self._parse_rss_feed(rss)['entries'][0]) + del info['url'] + del info['_type'] + del info['ie_key'] + return info
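The re.sub in _real_extract above guesses the master playlist by repeating the parent directory name as the file name, and falls back to the rendition URL if fewer than two formats come back. The rewrite in isolation (the example URL is invented):

import re

def master_playlist_url(m3u8_url):
    # /some/path/<name>/<anything>.m3u8 -> /some/path/<name>/<name>.m3u8
    return re.sub(r'/([^/]+)/[^/?]+\.m3u8', r'/\1/\1.m3u8', m3u8_url)

url = 'https://example.cdn/videos/3c84472a/index_1200.m3u8'
print(master_playlist_url(url))
# https://example.cdn/videos/3c84472a/3c84472a.m3u8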
+ + +class CBCWatchIE(CBCWatchBaseIE): + IE_NAME = 'cbc.ca:watch' + _VALID_URL = r'https?://(?:gem|watch)\.cbc\.ca/(?:[^/]+/)+(?P<id>[0-9a-f-]+)' + _TESTS = [{ + # geo-restricted to Canada, bypassable + 'url': 'http://watch.cbc.ca/doc-zone/season-6/customer-disservice/38e815a-009e3ab12e4', + 'info_dict': { + 'id': '9673749a-5e77-484c-8b62-a1092a6b5168', + 'ext': 'mp4', + 'title': 'Customer (Dis)Service', + 'description': 'md5:8bdd6913a0fe03d4b2a17ebe169c7c87', + 'upload_date': '20160219', + 'timestamp': 1455840000, }, 'params': { - # rtmp download + # m3u8 download 'skip_download': True, + 'format': 'bestvideo', }, - } + }, { + # geo-restricted to Canada, bypassable + 'url': 'http://watch.cbc.ca/arthur/all/1ed4b385-cd84-49cf-95f0-80f004680057', + 'info_dict': { + 'id': '1ed4b385-cd84-49cf-95f0-80f004680057', + 'title': 'Arthur', + 'description': 'Arthur, the sweetest 8-year-old aardvark, and his pals solve all kinds of problems with humour, kindness and teamwork.', + }, + 'playlist_mincount': 30, + }, { + 'url': 'https://gem.cbc.ca/media/this-hour-has-22-minutes/season-26/episode-20/38e815a-0108c6c6a42', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - return self.url_result( - 'http://feed.theplatform.com/f/ExhSPC/vms_5akSXx4Ng_Zn?byGuid=%s' % video_id, - 'ThePlatformFeed', video_id) + rss = self._call_api('web/browse/' + video_id, video_id) + return self._parse_rss_feed(rss) + + +class CBCOlympicsIE(InfoExtractor): + IE_NAME = 'cbc.ca:olympics' + _VALID_URL = r'https?://olympics\.cbc\.ca/video/[^/]+/(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'https://olympics.cbc.ca/video/whats-on-tv/olympic-morning-featuring-the-opening-ceremony/', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._hidden_inputs(webpage)['videoId'] + video_doc = self._download_xml( + 'https://olympics.cbc.ca/videodata/%s.xml' % video_id, video_id) + title = xpath_text(video_doc, 'title', fatal=True) + is_live = xpath_text(video_doc, 'kind') == 'Live' + if is_live: + title = self._live_title(title) + + formats = [] + for video_source in video_doc.findall('videoSources/videoSource'): + uri = xpath_text(video_source, 'uri') + if not uri: + continue + tokenize = self._download_json( + 'https://olympics.cbc.ca/api/api-akamai/tokenize', + video_id, data=json.dumps({ + 'VideoSource': uri, + }).encode(), headers={ + 'Content-Type': 'application/json', + 'Referer': url, + # d3.VideoPlayer._init in https://olympics.cbc.ca/components/script/base.js + 'Cookie': '_dvp=TK:C0ObxjerU', # AKAMAI CDN cookie + }, fatal=False) + if not tokenize: + continue + content_url = tokenize['ContentUrl'] + video_source_format = video_source.get('format') + if video_source_format == 'IIS': + formats.extend(self._extract_ism_formats( + content_url, video_id, ism_id=video_source_format, fatal=False)) + else: + formats.extend(self._extract_m3u8_formats( + content_url, video_id, 'mp4', + 'm3u8' if is_live else 'm3u8_native', + m3u8_id=video_source_format, fatal=False)) + self._sort_formats(formats) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': xpath_text(video_doc, 'description'), + 'thumbnail': xpath_text(video_doc, 'thumbnailUrl'), + 'duration': parse_duration(xpath_text(video_doc, 'duration')), + 'formats': formats, + 'is_live': is_live, + }
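Before any Olympics format can be fetched, the extractor above has to exchange each videoSource URI for a tokenized ContentUrl; a minimal sketch of that POST as the extractor performs it (same headers, including the hard-coded _dvp cookie it copies from the site's player script):

import json
from urllib.request import Request, urlopen

def tokenize(video_source_uri, page_url):
    req = Request(
        'https://olympics.cbc.ca/api/api-akamai/tokenize',
        data=json.dumps({'VideoSource': video_source_uri}).encode(),
        headers={
            'Content-Type': 'application/json',
            'Referer': page_url,
            'Cookie': '_dvp=TK:C0ObxjerU',  # Akamai CDN cookie from base.js
        })
    # The reply is JSON whose ContentUrl is the playable, tokenized URL
    return json.loads(urlopen(req).read())['ContentUrl']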
diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/cbsinteractive.py youtube-dl-2019.05.20/youtube_dl/extractor/cbsinteractive.py --- youtube-dl-2016.02.22/youtube_dl/extractor/cbsinteractive.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/cbsinteractive.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,103 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .cbs import CBSIE +from ..utils import int_or_none + + +class CBSInteractiveIE(CBSIE): + _VALID_URL = r'https?://(?:www\.)?(?P<site>cnet|zdnet)\.com/(?:videos|video(?:/share)?)/(?P<id>[^/?]+)' + _TESTS = [{ + 'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/', + 'info_dict': { + 'id': 'R49SYt__yAfmlXR85z4f7gNmCBDcN_00', + 'display_id': 'hands-on-with-microsofts-windows-8-1-update', + 'ext': 'mp4', + 'title': 'Hands-on with Microsoft Windows 8.1 Update', + 'description': 'The new update to the Windows 8 OS brings improved performance for mouse and keyboard users.', + 'uploader_id': '6085384d-619e-11e3-b231-14feb5ca9861', + 'uploader': 'Sarah Mitroff', + 'duration': 70, + 'timestamp': 1396479627, + 'upload_date': '20140402', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://www.cnet.com/videos/whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187/', + 'md5': 'f11d27b2fa18597fbf92444d2a9ed386', + 'info_dict': { + 'id': 'kjOJd_OoVJqbg_ZD8MZCOk8Wekb9QccK', + 'display_id': 'whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187', + 'ext': 'mp4', + 'title': 'Whiny potholes tweet at local government when hit by cars (Tomorrow Daily 187)', + 'description': 'md5:d2b9a95a5ffe978ae6fbd4cf944d618f', + 'uploader_id': 'b163284d-6b73-44fc-b3e6-3da66c392d40', + 'uploader': 'Ashley Esqueda', + 'duration': 1482, + 'timestamp': 1433289889, + 'upload_date': '20150603', + }, + }, { + 'url': 'http://www.zdnet.com/video/share/video-keeping-android-smartphones-and-tablets-secure/', + 'info_dict': { + 'id': 'k0r4T_ehht4xW_hAOqiVQPuBDPZ8SRjt', + 'display_id': 'video-keeping-android-smartphones-and-tablets-secure', + 'ext': 'mp4', + 'title': 'Video: Keeping Android smartphones and tablets secure', + 'description': 'Here\'s the best way to keep Android devices secure, and what you do when they\'ve come to the end of their lives.', + 'uploader_id': 'f2d97ea2-8175-11e2-9d12-0018fe8a00b0', + 'uploader': 'Adrian Kingsley-Hughes', + 'duration': 731, + 'timestamp': 1449129925, + 'upload_date': '20151203', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://www.zdnet.com/video/huawei-matebook-x-video/', + 'only_matching': True, + }] + + MPX_ACCOUNTS = { + 'cnet': 2198311517, + 'zdnet': 2387448114, + } + + def _real_extract(self, url): + site, display_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, display_id) + + data_json = self._html_search_regex( + r"data(?:-(?:cnet|zdnet))?-video(?:-(?:uvp(?:js)?|player))?-options='([^']+)'", + webpage, 'data json') + data = self._parse_json(data_json, display_id) + vdata = data.get('video') or (data.get('videos') or data.get('playlist'))[0] + + video_id = vdata['mpxRefId'] + + title = vdata['title'] + author = vdata.get('author') + if author: + uploader = '%s %s' % (author['firstName'], author['lastName']) + uploader_id = author.get('id') + else: + uploader = None + uploader_id = None + + info = self._extract_video_info(video_id, site, self.MPX_ACCOUNTS[site]) + info.update({ + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'duration': int_or_none(vdata.get('duration')), + 'uploader': uploader, + 'uploader_id': uploader_id, + }) + return info
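CNET/ZDNet pages carry the player configuration as single-quoted JSON inside an HTML attribute; the extractor above recovers it with one regex and prefers data['video'] over the first playlist entry. A compact sketch of that parse (the sample markup is invented for illustration):

import json
import re

html = """<div data-cnet-video-uvp-options='{"video": {"mpxRefId": "R49SYt__yAfm",
"title": "Hands-on", "author": {"firstName": "Sarah", "lastName": "Mitroff"}}}'></div>"""

# Same attribute regex the extractor uses, covering all naming variants
data_json = re.search(
    r"data(?:-(?:cnet|zdnet))?-video(?:-(?:uvp(?:js)?|player))?-options='([^']+)'",
    html).group(1)
data = json.loads(data_json)
vdata = data.get('video') or (data.get('videos') or data.get('playlist'))[0]
author = vdata.get('author')
uploader = '%s %s' % (author['firstName'], author['lastName']) if author else None
print(vdata['mpxRefId'], uploader)  # R49SYt__yAfm Sarah Mitroff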
diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/cbslocal.py youtube-dl-2019.05.20/youtube_dl/extractor/cbslocal.py --- youtube-dl-2016.02.22/youtube_dl/extractor/cbslocal.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/cbslocal.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,104 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .anvato import AnvatoIE +from .sendtonews import SendtoNewsIE +from ..compat import compat_urlparse +from ..utils import ( + parse_iso8601, + unified_timestamp, +) + + +class CBSLocalIE(AnvatoIE): + _VALID_URL = r'https?://[a-z]+\.cbslocal\.com/(?:\d+/\d+/\d+|video)/(?P<id>[0-9a-z-]+)' + + _TESTS = [{ + # Anvato backend + 'url': 'http://losangeles.cbslocal.com/2016/05/16/safety-advocates-say-fatal-car-seat-failures-are-public-health-crisis', + 'md5': 'f0ee3081e3843f575fccef901199b212', + 'info_dict': { + 'id': '3401037', + 'ext': 'mp4', + 'title': 'Safety Advocates Say Fatal Car Seat Failures Are \'Public Health Crisis\'', + 'description': 'Collapsing seats have been the focus of scrutiny for decades, though experts say remarkably little has been done to address the issue. Randy Paige reports.', + 'thumbnail': 're:^https?://.*', + 'timestamp': 1463440500, + 'upload_date': '20160516', + 'uploader': 'CBS', + 'subtitles': { + 'en': 'mincount:5', + }, + 'categories': [ + 'Stations\\Spoken Word\\KCBSTV', + 'Syndication\\MSN', + 'Syndication\\NDN', + 'Syndication\\AOL', + 'Syndication\\Yahoo', + 'Syndication\\Tribune', + 'Syndication\\Curb.tv', + 'Content\\News' + ], + 'tags': ['CBS 2 News Evening'], + }, + }, { + # SendtoNews embed + 'url': 'http://cleveland.cbslocal.com/2016/05/16/indians-score-season-high-15-runs-in-blowout-win-over-reds-rapid-reaction/', + 'info_dict': { + 'id': 'GxfCe0Zo7D-175909-5588', + }, + 'playlist_count': 9, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://newyork.cbslocal.com/video/3580809-a-very-blue-anniversary/', + 'info_dict': { + 'id': '3580809', + 'ext': 'mp4', + 'title': 'A Very Blue Anniversary', + 'description': 'CBS2’s Cindy Hsu has more.', + 'thumbnail': 're:^https?://.*', + 'timestamp': int, + 'upload_date': r're:^\d{8}$', + 'uploader': 'CBS', + 'subtitles': { + 'en': 'mincount:5', + }, + 'categories': [ + 'Stations\\Spoken Word\\WCBSTV', + 'Syndication\\AOL', + 'Syndication\\MSN', + 'Syndication\\NDN', + 'Syndication\\Yahoo', + 'Content\\News', + 'Content\\News\\Local News', + ], + 'tags': ['CBS 2 News Weekends', 'Cindy Hsu', 'Blue Man Group'], + }, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + sendtonews_url = SendtoNewsIE._extract_url(webpage) + if sendtonews_url: + return self.url_result( + compat_urlparse.urljoin(url, sendtonews_url), + ie=SendtoNewsIE.ie_key()) + + info_dict = self._extract_anvato_videos(webpage, display_id) + + timestamp = unified_timestamp(self._html_search_regex( + r'class="(?:entry|post)-date"[^>]*>([^<]+)', webpage, + 'released date', default=None)) or parse_iso8601( + self._html_search_meta('uploadDate', webpage)) + + info_dict.update({ + 'display_id': display_id, + 'timestamp': timestamp, + }) + + return info_dict diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/cbsnews.py youtube-dl-2019.05.20/youtube_dl/extractor/cbsnews.py --- youtube-dl-2016.02.22/youtube_dl/extractor/cbsnews.py 2016-02-20 13:02:05.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/cbsnews.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,27 +1,54 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals +import re +import zlib + from .common
import InfoExtractor -from .theplatform import ThePlatformIE +from .cbs import CBSIE +from ..compat import ( + compat_b64decode, + compat_urllib_parse_unquote, +) from ..utils import ( parse_duration, - find_xpath_attr, ) -class CBSNewsIE(ThePlatformIE): +class CBSNewsEmbedIE(CBSIE): + IE_NAME = 'cbsnews:embed' + _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/embed/video[^#]*#(?P.+)' + _TESTS = [{ + 'url': 'https://www.cbsnews.com/embed/video/?v=1.c9b5b61492913d6660db0b2f03579ef25e86307a#1Vb7b9s2EP5XBAHbT6Gt98PAMKTJ0se6LVjWYWtdGBR1stlIpEBSTtwi%2F%2FvuJNkNhmHdGxgM2NL57vjd6zt%2B8PngdN%2Fyg79qeGvhzN%2FLGrS%2F%2BuBLB531V28%2B%2BO7Qg7%2Fy97r2z3xZ42NW8yLhDbA0S0KWlHnIijwKWJBHZZnHBa8Cgbpdf%2F89NM9Hi9fXifhpr8sr%2FlP848tn%2BTdXycX25zh4cdX%2FvHl6PmmPqnWQv9w8Ed%2B9GjYRim07bFEqdG%2BZVHuwTm65A7bVRrYtR5lAyMox7pigF6W4k%2By91mjspGsJ%2BwVae4%2BsvdnaO1p73HkXs%2FVisUDTGm7R8IcdnOROeq%2B19qT1amhA1VJtPenoTUgrtfKc9m7Rq8dP7nnjwOB7wg7ADdNt7VX64DWAWlKhPtmDEq22g4GF99x6Dk9E8OSsankHXqPNKDxC%2FdK7MLKTircTDgsI3mmj4OBdSq64dy7fd1x577RU1rt4cvMtOaulFYOd%2FLewRWvDO9lIgXFpZSnkZmjbv5SxKTPoQXClFbpsf%2Fhbbpzs0IB3vb8KkyzJQ%2BywOAgCrMpgRrz%2BKk4fvb7kFbR4XJCu0gAdtNO7woCwZTu%2BBUs9bam%2Fds71drVerpeisgrubLjAB4nnOSkWQnfr5W6o1ku5Xpr1MgrCbL0M0vUyDtfLLK15WiYp47xKWSLyjFVpwVmVJSLIoCjSOFkv3W7oKsVliwZJcB9nwXpZ5GEQQwY8jNKqKCBrgjTLeFxgdCIpazojDgnRtn43J6kG7nZ6cAbxh0EeFFk4%2B1u867cY5u4344n%2FxXjCqAjucdTHgLKojNKmSfO8KRsOFY%2FzKEYCKEJBzv90QA9nfm9gL%2BHulaFqUkz9ULUYxl62B3U%2FRVNLA8IhggaPycOoBuwOCESciDQVSSUgiOMsROB%2FhKfwCKOzEk%2B4k6rWd4uuT%2FwTDz7K7t3d3WLO8ISD95jSPQbayBacthbz86XVgxHwhex5zawzgDOmtp%2F3GPcXn0VXHdSS029%2Fj99UC%2FwJUvyKQ%2FzKyixIEVlYJOn4RxxuaH43Ty9fbJ5OObykHH435XAzJTHeOF4hhEUXD8URe%2FQ%2FBT%2BMpf8d5GN02Ox%2FfiGsl7TA7POu1xZ5%2BbTzcAVKMe48mqcC21hkacVEVScM26liVVBnrKkC4CLKyzAvHu0lhEaTKMFwI3a4SN9MsrfYzdBLq2vkwRD1gVviLT8kY9h2CHH6Y%2Bix6609weFtey4ESp60WtyeWMy%2BsmBuhsoKIyuoT%2Bq2R%2FrW5qi3g%2FvzS2j40DoixDP8%2BKP0yUdpXJ4l6Vla%2Bg9vce%2BC4yM5YlUcbA%2F0jLKdpmTwvsdN5z88nAIe08%2F0HgxeG1iv%2B6Hlhjh7uiW0SDzYNI92L401uha3JKYk268UVRzdOzNQvAaJqoXzAc80dAV440NZ1WVVAAMRYQ2KrGJFmDUsq8saWSnjvIj8t78y%2FRa3JRnbHVfyFpfwoDiGpPgjzekyUiKNlU3OMlwuLMmzgvEojllYVE2Z1HhImvsnk%2BuhusTEoB21PAtSFodeFK3iYhXEH9WOG2%2FkOE833sfeG%2Ff5cfHtEFNXgYes0%2FXj7aGivUgJ9XpusCtoNcNYVVnJVrrDo0OmJAutHCpuZul4W9lLcfy7BnuLPT02%2ByXsCTk%2B9zhzswIN04YueNSK%2BPtM0jS88QdLqSLJDTLsuGZJNolm2yO0PXh3UPnz9Ix5bfIAqxPjvETQsDCEiPG4QbqNyhBZISxybLnZYCrW5H3Axp690%2F0BJdXtDZ5ITuM4xj3f4oUHGzc5JeJmZKpp%2FjwKh4wMV%2FV1yx3emLoR0MwbG4K%2F%2BZgVep3PnzXGDHZ6a3i%2Fk%2BJrONDN13%2Bnq6tBTYk4o7cLGhBtqCC4KwacGHpEVuoH5JNro%2FE6JfE6d5RydbiR76k%2BW5wioDHBIjw1euhHjUGRB0y5A97KoaPx6MlL%2BwgboUVtUFRI%2FLemgTpdtF59ii7pab08kuPcfWzs0l%2FRI5takWnFpka0zOgWRtYcuf9aIxZMxlwr6IiGpsb6j2DQUXPl%2FimXI599Ev7fWjoPD78A', + 'only_matching': True, + }] + + def _real_extract(self, url): + item = self._parse_json(zlib.decompress(compat_b64decode( + compat_urllib_parse_unquote(self._match_id(url))), + -zlib.MAX_WBITS), None)['video']['items'][0] + return self._extract_video_info(item['mpxRefId'], 'cbsnews') + + +class CBSNewsIE(CBSIE): + IE_NAME = 'cbsnews' IE_DESC = 'CBS News' - _VALID_URL = r'http://(?:www\.)?cbsnews\.com/(?:news|videos)/(?P[\da-z_-]+)' + _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/(?:news|video)/(?P[\da-z_-]+)' _TESTS = [ { - 'url': 'http://www.cbsnews.com/news/tesla-and-spacex-elon-musks-industrial-empire/', + # 60 minutes + 'url': 'http://www.cbsnews.com/news/artificial-intelligence-positioned-to-be-a-game-changer/', 'info_dict': { - 'id': 'tesla-and-spacex-elon-musks-industrial-empire', + 'id': 
'Y_nf_aEg6WwO9OLAq0MpKaPgfnBUxfW4', 'ext': 'flv', - 'title': 'Tesla and SpaceX: Elon Musk\'s industrial empire', - 'thumbnail': 'http://beta.img.cbsnews.com/i/2014/03/30/60147937-2f53-4565-ad64-1bdd6eb64679/60-0330-pelley-640x360.jpg', - 'duration': 791, + 'title': 'Artificial Intelligence, real-life applications', + 'description': 'md5:a7aaf27f1b4777244de8b0b442289304', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 317, + 'uploader': 'CBSI-NEW', + 'timestamp': 1476046464, + 'upload_date': '20161009', }, 'params': { # rtmp download @@ -29,12 +56,16 @@ }, }, { - 'url': 'http://www.cbsnews.com/videos/fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack/', + 'url': 'https://www.cbsnews.com/video/fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack/', 'info_dict': { - 'id': 'fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack', + 'id': 'SNJBOYzXiWBOvaLsdzwH8fmtP1SCd91Y', 'ext': 'mp4', 'title': 'Fort Hood shooting: Army downplays mental illness as cause of attack', - 'thumbnail': 're:^https?://.*\.jpg$', + 'description': 'md5:4a6983e480542d8b333a947bfc64ddc7', + 'upload_date': '20140404', + 'timestamp': 1396650660, + 'uploader': 'CBSI-NEW', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 205, 'subtitles': { 'en': [{ @@ -47,86 +78,70 @@ 'skip_download': True, }, }, + { + # 48 hours + 'url': 'http://www.cbsnews.com/news/maria-ridulph-murder-will-the-nations-oldest-cold-case-to-go-to-trial-ever-get-solved/', + 'info_dict': { + 'title': 'Cold as Ice', + 'description': 'Can a childhood memory solve the 1957 murder of 7-year-old Maria Ridulph?', + }, + 'playlist_mincount': 7, + }, ] - def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): - closed_caption_e = find_xpath_attr(smil, self._xpath_ns('.//param', namespace), 'name', 'ClosedCaptionURL') - return { - 'en': [{ - 'ext': 'ttml', - 'url': closed_caption_e.attrib['value'], - }] - } if closed_caption_e is not None and closed_caption_e.attrib.get('value') else [] - def _real_extract(self, url): - video_id = self._match_id(url) + display_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage(url, display_id) - video_info = self._parse_json(self._html_search_regex( - r'(?:', response, + 'errors', default=None) + if errors: + raise ExtractorError('Unable to login: %s' % errors, expected=True) + raise ExtractorError('Unable to log in') + + +class LecturioIE(LecturioBaseIE): + _VALID_URL = r'''(?x) + https:// + (?: + app\.lecturio\.com/[^/]+/(?P<id>[^/?#&]+)\.lecture| + (?:www\.)?lecturio\.de/[^/]+/(?P<id_de>[^/?#&]+)\.vortrag + ) + ''' + _TESTS = [{ + 'url': 'https://app.lecturio.com/medical-courses/important-concepts-and-terms-introduction-to-microbiology.lecture#tab/videos', + 'md5': 'f576a797a5b7a5e4e4bbdfc25a6a6870', + 'info_dict': { + 'id': '39634', + 'ext': 'mp4', + 'title': 'Important Concepts and Terms – Introduction to Microbiology', + }, + 'skip': 'Requires lecturio account credentials', + }, { + 'url': 'https://www.lecturio.de/jura/oeffentliches-recht-staatsexamen.vortrag', + 'only_matching': True, + }] + + _CC_LANGS = { + 'German': 'de', + 'English': 'en', + 'Spanish': 'es', + 'French': 'fr', + 'Polish': 'pl', + 'Russian': 'ru', + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('id') or mobj.group('id_de') + + webpage = self._download_webpage( + 'https://app.lecturio.com/en/lecture/%s/player.html' % display_id, + display_id) +
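Both values the next two _search_regex calls look for sit in inline JavaScript on player.html; a toy demonstration of the same two searches (the page snippet is invented for illustration):

import re

page = '''<script>
  var lecture_id = L_39634;
  playerConfig = {lectureDataLink: "https://app.lecturio.com/api/lectures/39634"};
</script>'''

# numeric id, with or without the L_ prefix
lecture_id = re.search(r'lecture_id\s*=\s*(?:L_)?(\d+)', page).group(1)
# quote-agnostic capture of the JSON data link
api_url = re.search(
    r'lectureDataLink\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', page).group('url')
print(lecture_id, api_url)  # 39634 https://app.lecturio.com/api/lectures/39634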
+ lecture_id = self._search_regex( + r'lecture_id\s*=\s*(?:L_)?(\d+)', webpage, 'lecture id') + + api_url = self._search_regex( + r'lectureDataLink\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'api url', group='url') + + video = self._download_json(api_url, display_id) + + title = video['title'].strip() + + formats = [] + for format_ in video['content']['media']: + if not isinstance(format_, dict): + continue + file_ = format_.get('file') + if not file_: + continue + ext = determine_ext(file_) + if ext == 'smil': + # smil contains only broken RTMP formats anyway + continue + file_url = url_or_none(file_) + if not file_url: + continue + label = str_or_none(format_.get('label')) + filesize = int_or_none(format_.get('fileSize')) + formats.append({ + 'url': file_url, + 'format_id': label, + 'filesize': float_or_none(filesize, invscale=1000) + }) + self._sort_formats(formats) + + subtitles = {} + automatic_captions = {} + cc = self._parse_json( + self._search_regex( + r'subtitleUrls\s*:\s*({.+?})\s*,', webpage, 'subtitles', + default='{}'), display_id, fatal=False) + for cc_label, cc_url in cc.items(): + cc_url = url_or_none(cc_url) + if not cc_url: + continue + lang = self._search_regex( + r'/([a-z]{2})_', cc_url, 'lang', + default=cc_label.split()[0] if cc_label else 'en') + original_lang = self._search_regex( + r'/[a-z]{2}_([a-z]{2})_', cc_url, 'original lang', + default=None) + sub_dict = (automatic_captions + if 'auto-translated' in cc_label or original_lang + else subtitles) + sub_dict.setdefault(self._CC_LANGS.get(lang, lang), []).append({ + 'url': cc_url, + }) + + return { + 'id': lecture_id, + 'title': title, + 'formats': formats, + 'subtitles': subtitles, + 'automatic_captions': automatic_captions, + } + + +class LecturioCourseIE(LecturioBaseIE): + _VALID_URL = r'https://app\.lecturio\.com/[^/]+/(?P<id>[^/?#&]+)\.course' + _TEST = { + 'url': 'https://app.lecturio.com/medical-courses/microbiology-introduction.course#/', + 'info_dict': { + 'id': 'microbiology-introduction', + 'title': 'Microbiology: Introduction', + }, + 'playlist_count': 45, + 'skip': 'Requires lecturio account credentials', + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + entries = [] + for mobj in re.finditer( + r'(?s)<[^>]+\bdata-url=(["\'])(?:(?!\1).)+\.lecture\b[^>]+>', + webpage): + params = extract_attributes(mobj.group(0)) + lecture_url = urljoin(url, params.get('data-url')) + lecture_id = params.get('data-id') + entries.append(self.url_result( + lecture_url, ie=LecturioIE.ie_key(), video_id=lecture_id)) + + title = self._search_regex( + r'<span[^>]+class=["\']content-title[^>]+>([^<]+)', webpage, + 'title', default=None) + + return self.playlist_result(entries, display_id, title) + + +class LecturioDeCourseIE(LecturioBaseIE): + _VALID_URL = r'https://(?:www\.)?lecturio\.de/[^/]+/(?P<id>[^/?#&]+)\.kurs' + _TEST = { + 'url': 'https://www.lecturio.de/jura/grundrechte.kurs', + 'only_matching': True, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + entries = [] + for mobj in re.finditer( + r'(?s)<td[^>]+\bdata-lecture-id=["\'](?P<id>\d+).+?\bhref=(["\'])(?P<url>(?:(?!\2).)+\.vortrag)\b[^>]+>', + webpage): + lecture_url = urljoin(url, mobj.group('url')) + lecture_id = mobj.group('id') + entries.append(self.url_result( + lecture_url, ie=LecturioIE.ie_key(), video_id=lecture_id)) + + title = self._search_regex( + r'<h1[^>]*>([^<]+)', webpage, 'title', default=None) + + return self.playlist_result(entries, display_id, title)
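In the Lecturio extractor above, a caption labelled e.g. 'German (auto-translated)' or carrying a two-language file name such as /de_en_ is routed into automatic_captions rather than subtitles. A condensed sketch of that routing with invented caption URLs:

import re

CC_LANGS = {'German': 'de', 'English': 'en'}  # abridged map from the extractor

def classify(cc):
    subtitles, automatic_captions = {}, {}
    for cc_label, cc_url in cc.items():
        # language code embedded in the file name, e.g. /en_
        m = re.search(r'/([a-z]{2})_', cc_url)
        lang = m.group(1) if m else cc_label.split()[0]
        # a second code, e.g. /de_en_, marks a machine translation
        original_lang = re.search(r'/[a-z]{2}_([a-z]{2})_', cc_url)
        target = (automatic_captions
                  if 'auto-translated' in cc_label or original_lang
                  else subtitles)
        target.setdefault(CC_LANGS.get(lang, lang), []).append({'url': cc_url})
    return subtitles, automatic_captions

subs, auto = classify({
    'English': 'https://cdn.example/captions/en_39634.vtt',
    'German (auto-translated)': 'https://cdn.example/captions/de_en_39634.vtt',
})
print(sorted(subs), sorted(auto))  # ['en'] ['de']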
diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/leeco.py youtube-dl-2019.05.20/youtube_dl/extractor/leeco.py
--- youtube-dl-2016.02.22/youtube_dl/extractor/leeco.py	1970-01-01 00:00:00.000000000 +0000
+++ youtube-dl-2019.05.20/youtube_dl/extractor/leeco.py	2019-06-07 18:49:07.000000000 +0000
@@ -0,0 +1,368 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import datetime
+import hashlib
+import re
+import time
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_b64decode,
+    compat_ord,
+    compat_str,
+    compat_urllib_parse_urlencode,
+)
+from ..utils import (
+    determine_ext,
+    encode_data_uri,
+    ExtractorError,
+    int_or_none,
+    orderedSet,
+    parse_iso8601,
+    str_or_none,
+    url_basename,
+    urshift,
+)
+
+
+class LeIE(InfoExtractor):
+    IE_DESC = '乐视网'
+    _VALID_URL = r'https?://(?:www\.le\.com/ptv/vplay|(?:sports\.le|(?:www\.)?lesports)\.com/(?:match|video))/(?P<id>\d+)\.html'
+    _GEO_COUNTRIES = ['CN']
+    _URL_TEMPLATE = 'http://www.le.com/ptv/vplay/%s.html'
+
+    _TESTS = [{
+        'url': 'http://www.le.com/ptv/vplay/22005890.html',
+        'md5': 'edadcfe5406976f42f9f266057ee5e40',
+        'info_dict': {
+            'id': '22005890',
+            'ext': 'mp4',
+            'title': '第87届奥斯卡颁奖礼完美落幕 《鸟人》成最大赢家',
+            'description': 'md5:a9cb175fd753e2962176b7beca21a47c',
+        },
+        'params': {
+            'hls_prefer_native': True,
+        },
+    }, {
+        'url': 'http://www.le.com/ptv/vplay/1415246.html',
+        'info_dict': {
+            'id': '1415246',
+            'ext': 'mp4',
+            'title': '美人天下01',
+            'description': 'md5:28942e650e82ed4fcc8e4de919ee854d',
+        },
+        'params': {
+            'hls_prefer_native': True,
+        },
+    }, {
+        'note': 'This video is available only in Mainland China, thus a proxy is needed',
+        'url': 'http://www.le.com/ptv/vplay/1118082.html',
+        'md5': '2424c74948a62e5f31988438979c5ad1',
+        'info_dict': {
+            'id': '1118082',
+            'ext': 'mp4',
+            'title': '与龙共舞 完整版',
+            'description': 'md5:7506a5eeb1722bb9d4068f85024e3986',
+        },
+        'params': {
+            'hls_prefer_native': True,
+        },
+    }, {
+        'url': 'http://sports.le.com/video/25737697.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.lesports.com/match/1023203003.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://sports.le.com/match/1023203003.html',
+        'only_matching': True,
+    }]
+
+    # ror() and calc_time_key() are reversed from an embedded swf file in LetvPlayer.swf
+    def ror(self, param1, param2):
+        _loc3_ = 0
+        while _loc3_ < param2:
+            param1 = urshift(param1, 1) + ((param1 & 1) << 31)
+            _loc3_ += 1
+        return param1
+
+    def calc_time_key(self, param1):
+        _loc2_ = 185025305
+        return self.ror(param1, _loc2_ % 17) ^ _loc2_
+
+    # see M3U8Encryption class in KLetvPlayer.swf
+    @staticmethod
+    def decrypt_m3u8(encrypted_data):
+        if encrypted_data[:5].decode('utf-8').lower() != 'vc_01':
+            return encrypted_data
+        encrypted_data = encrypted_data[5:]
+
+        _loc4_ = bytearray(2 * len(encrypted_data))
+        for idx, val in enumerate(encrypted_data):
+            b = compat_ord(val)
+            _loc4_[2 * idx] = b // 16
+            _loc4_[2 * idx + 1] = b % 16
+        idx = len(_loc4_) - 11
+        _loc4_ = _loc4_[idx:] + _loc4_[:idx]
+        _loc7_ = bytearray(len(encrypted_data))
+        for i in range(len(encrypted_data)):
+            _loc7_[i] = _loc4_[2 * i] * 16 + _loc4_[2 * i + 1]
+
+        return bytes(_loc7_)
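decrypt_m3u8() above undoes a simple nibble shuffle: each ciphertext byte is split into two 4-bit halves, the last 11 nibbles are rotated to the front, and the pairs are glued back together. A round-trip check makes the scheme concrete; encrypt_m3u8 below is our own inverse, written only for this demonstration (real ciphertext comes from Le's CDN, already carrying the 'vc_01' magic prefix):

```python
# Round-trip sanity check for LeIE.decrypt_m3u8. This assumes the
# youtube_dl package from this diff is importable; encrypt_m3u8 is a
# hypothetical helper, not part of youtube-dl or of Le's player.
from youtube_dl.extractor.leeco import LeIE

def encrypt_m3u8(plain):
    nibbles = []
    for b in bytearray(plain):
        nibbles += [b // 16, b % 16]
    # decrypt_m3u8 moves the last 11 nibbles to the front,
    # so the inverse moves the first 11 nibbles to the back
    nibbles = nibbles[11:] + nibbles[:11]
    out = bytearray(b'vc_01')
    for i in range(0, len(nibbles), 2):
        out.append(nibbles[i] * 16 + nibbles[i + 1])
    return bytes(out)

sample = b'#EXTM3U\n#EXT-X-VERSION:3\n'
assert LeIE.decrypt_m3u8(encrypt_m3u8(sample)) == sample
```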
+
+    def _check_errors(self, play_json):
+        # Check for errors
+        playstatus = play_json['msgs']['playstatus']
+        if playstatus['status'] == 0:
+            flag = playstatus['flag']
+            if flag == 1:
+                self.raise_geo_restricted()
+            else:
+                raise ExtractorError('Generic error. flag = %d' % flag, expected=True)
+
+    def _real_extract(self, url):
+        media_id = self._match_id(url)
+        page = self._download_webpage(url, media_id)
+
+        play_json_flash = self._download_json(
+            'http://player-pc.le.com/mms/out/video/playJson',
+            media_id, 'Downloading flash playJson data', query={
+                'id': media_id,
+                'platid': 1,
+                'splatid': 105,
+                'format': 1,
+                'source': 1000,
+                'tkey': self.calc_time_key(int(time.time())),
+                'domain': 'www.le.com',
+                'region': 'cn',
+            },
+            headers=self.geo_verification_headers())
+        self._check_errors(play_json_flash)
+
+        def get_flash_urls(media_url, format_id):
+            nodes_data = self._download_json(
+                media_url, media_id,
+                'Downloading JSON metadata for format %s' % format_id,
+                query={
+                    'm3v': 1,
+                    'format': 1,
+                    'expect': 3,
+                    'tss': 'ios',
+                })
+
+            req = self._request_webpage(
+                nodes_data['nodelist'][0]['location'], media_id,
+                note='Downloading m3u8 information for format %s' % format_id)
+
+            m3u8_data = self.decrypt_m3u8(req.read())
+
+            return {
+                'hls': encode_data_uri(m3u8_data, 'application/vnd.apple.mpegurl'),
+            }
+
+        extracted_formats = []
+        formats = []
+        playurl = play_json_flash['msgs']['playurl']
+        play_domain = playurl['domain'][0]
+
+        for format_id, format_data in playurl.get('dispatch', {}).items():
+            if format_id in extracted_formats:
+                continue
+            extracted_formats.append(format_id)
+
+            media_url = play_domain + format_data[0]
+            for protocol, format_url in get_flash_urls(media_url, format_id).items():
+                f = {
+                    'url': format_url,
+                    'ext': determine_ext(format_data[1]),
+                    'format_id': '%s-%s' % (protocol, format_id),
+                    'protocol': 'm3u8_native' if protocol == 'hls' else 'http',
+                    'quality': int_or_none(format_id),
+                }
+
+                if format_id[-1:] == 'p':
+                    f['height'] = int_or_none(format_id[:-1])
+
+                formats.append(f)
+        self._sort_formats(formats, ('height', 'quality', 'format_id'))
+
+        publish_time = parse_iso8601(self._html_search_regex(
+            r'发布时间&nbsp;([^<>]+) ', page, 'publish time', default=None),
+            delimiter=' ', timezone=datetime.timedelta(hours=8))
+        description = self._html_search_meta('description', page, fatal=False)
+
+        return {
+            'id': media_id,
+            'formats': formats,
+            'title': playurl['title'],
+            'thumbnail': playurl['pic'],
+            'description': description,
+            'timestamp': publish_time,
+        }
+
+
+class LePlaylistIE(InfoExtractor):
+    _VALID_URL = r'https?://[a-z]+\.le\.com/(?!video)[a-z]+/(?P<id>[a-z0-9_]+)'
+
+    _TESTS = [{
+        'url': 'http://www.le.com/tv/46177.html',
+        'info_dict': {
+            'id': '46177',
+            'title': '美人天下',
+            'description': 'md5:395666ff41b44080396e59570dbac01c'
+        },
+        'playlist_count': 35
+    }, {
+        'url': 'http://tv.le.com/izt/wuzetian/index.html',
+        'info_dict': {
+            'id': 'wuzetian',
+            'title': '武媚娘传奇',
+            'description': 'md5:e12499475ab3d50219e5bba00b3cb248'
+        },
+        # This playlist contains some extra videos other than the drama itself
+        'playlist_mincount': 96
+    }, {
+        'url': 'http://tv.le.com/pzt/lswjzzjc/index.shtml',
+        # This series is moved to http://www.le.com/tv/10005297.html
+        'only_matching': True,
+    }, {
+        'url': 'http://www.le.com/comic/92063.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://list.le.com/listn/c1009_sc532002_d2_p1_o1.html',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return False if LeIE.suitable(url) else super(LePlaylistIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+        page = self._download_webpage(url, playlist_id)
+
+        # Currently old domain names are still used in playlists
+        media_ids = orderedSet(re.findall(
+            r'<a[^>]+href="http://www\.letv\.com/ptv/vplay/(\d+)\.html', page))
+        entries = [self.url_result(LeIE._URL_TEMPLATE % media_id, ie='Le')
+                   for media_id in media_ids]
+
+        title = self._html_search_meta('keywords', page,
+                                       fatal=False).split(',')[0]
+        description = self._html_search_meta('description', page, fatal=False)
+
+        return self.playlist_result(entries, playlist_id, playlist_title=title,
+                                    playlist_description=description)
+
+
+class LetvCloudIE(InfoExtractor):
+    # Most of *.letv.com is changed to *.le.com on 2016/01/02
+    # but yuntv.letv.com is kept, so also keep the extractor name
+    IE_DESC = '乐视云'
+    _VALID_URL = r'https?://yuntv\.letv\.com/bcloud.html\?.+'
+
+    _TESTS = [{
+        'url': 'http://yuntv.letv.com/bcloud.html?uu=p7jnfw5hw9&vu=467623dedf',
+        'md5': '26450599afd64c513bc77030ad15db44',
+        'info_dict': {
+            'id': 'p7jnfw5hw9_467623dedf',
+            'ext': 'mp4',
+            'title': 'Video p7jnfw5hw9_467623dedf',
+        },
+    }, {
+        'url': 'http://yuntv.letv.com/bcloud.html?uu=p7jnfw5hw9&vu=ec93197892&pu=2c7cd40209&auto_play=1&gpcflag=1&width=640&height=360',
+        'md5': 'e03d9cc8d9c13191e1caf277e42dbd31',
+        'info_dict': {
+            'id': 'p7jnfw5hw9_ec93197892',
+            'ext': 'mp4',
+            'title': 'Video p7jnfw5hw9_ec93197892',
+        },
+    }, {
+        'url': 'http://yuntv.letv.com/bcloud.html?uu=p7jnfw5hw9&vu=187060b6fd',
+        'md5': 'cb988699a776b22d4a41b9d43acfb3ac',
+        'info_dict': {
+            'id': 'p7jnfw5hw9_187060b6fd',
+            'ext': 'mp4',
+            'title': 'Video p7jnfw5hw9_187060b6fd',
+        },
+    }]
+
+    @staticmethod
+    def sign_data(obj):
+        if obj['cf'] == 'flash':
+            salt = '2f9d6924b33a165a6d8b5d3d42f4f987'
+            items = ['cf', 'format', 'ran', 'uu', 'ver', 'vu']
+        elif obj['cf'] == 'html5':
+            salt = 'fbeh5player12c43eccf2bec3300344'
+            items = ['cf', 'ran', 'uu', 'bver', 'vu']
+        input_data = ''.join([item + obj[item] for item in items]) + salt
+        obj['sign'] = hashlib.md5(input_data.encode('utf-8')).hexdigest()
+
+    def _get_formats(self, cf, uu, vu, media_id):
+        def get_play_json(cf, timestamp):
+            data = {
+                'cf': cf,
+                'ver': '2.2',
+                'bver': 'firefox44.0',
+                'format': 'json',
+                'uu': uu,
+                'vu': vu,
+                'ran': compat_str(timestamp),
+            }
+            self.sign_data(data)
+            return self._download_json(
+                'http://api.letvcloud.com/gpc.php?' + compat_urllib_parse_urlencode(data),
+                media_id, 'Downloading playJson data for type %s' % cf)
+
+        play_json = get_play_json(cf, time.time())
+        # The server time may be different from local time
+        if play_json.get('code') == 10071:
+            play_json = get_play_json(cf, play_json['timestamp'])
+
+        if not play_json.get('data'):
+            if play_json.get('message'):
+                raise ExtractorError('Letv cloud said: %s' % play_json['message'], expected=True)
+            elif play_json.get('code'):
+                raise ExtractorError('Letv cloud returned error %d' % play_json['code'], expected=True)
+            else:
+                raise ExtractorError('Letv cloud returned an unknown error')
+
+        def b64decode(s):
+            return compat_b64decode(s).decode('utf-8')
+
+        formats = []
+        for media in play_json['data']['video_info']['media'].values():
+            play_url = media['play_url']
+            url = b64decode(play_url['main_url'])
+            decoded_url = b64decode(url_basename(url))
+            formats.append({
+                'url': url,
+                'ext': determine_ext(decoded_url),
+                'format_id': str_or_none(play_url.get('vtype')),
+                'format_note': str_or_none(play_url.get('definition')),
+                'width': int_or_none(play_url.get('vwidth')),
+                'height': int_or_none(play_url.get('vheight')),
+            })
+
+        return formats
+
+    def _real_extract(self, url):
+        uu_mobj = re.search(r'uu=([\w]+)', url)
+        vu_mobj = re.search(r'vu=([\w]+)', url)
+
+        if not uu_mobj or not vu_mobj:
+            raise ExtractorError('Invalid URL: %s' % url, expected=True)
+
+        uu = uu_mobj.group(1)
+        vu = vu_mobj.group(1)
+        media_id = uu + '_' + vu
+
+        formats = self._get_formats('flash', uu, vu, media_id) + self._get_formats('html5', uu, vu, media_id)
+        self._sort_formats(formats)
+
+        return {
+            'id': media_id,
+            'title': 'Video %s' % media_id,
+            'formats': formats,
+        }
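LetvCloudIE.sign_data() above signs each playJson request by concatenating selected key/value pairs in a fixed order, appending a per-client salt, and taking the MD5 hex digest. A standalone sketch of the 'flash' flavour; the uu/vu values are borrowed from the test URLs above and the timestamp is an arbitrary placeholder:

```python
# Minimal re-statement of the 'flash' signing recipe from sign_data();
# only the recipe (item order + salt + md5) mirrors the code above.
import hashlib

def sign(obj, salt, items):
    payload = ''.join(item + obj[item] for item in items) + salt
    return hashlib.md5(payload.encode('utf-8')).hexdigest()

req = {
    'cf': 'flash', 'ver': '2.2', 'format': 'json',
    'uu': 'p7jnfw5hw9', 'vu': '467623dedf', 'ran': '1456000000',
}
req['sign'] = sign(req, '2f9d6924b33a165a6d8b5d3d42f4f987',
                   ['cf', 'format', 'ran', 'uu', 'ver', 'vu'])
print(req['sign'])  # 32-char hex digest sent along with the query
```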
diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/lego.py youtube-dl-2019.05.20/youtube_dl/extractor/lego.py
--- youtube-dl-2016.02.22/youtube_dl/extractor/lego.py	1970-01-01 00:00:00.000000000 +0000
+++ youtube-dl-2019.05.20/youtube_dl/extractor/lego.py	2019-06-07 18:49:07.000000000 +0000
@@ -0,0 +1,128 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    unescapeHTML,
+    parse_duration,
+    get_element_by_class,
+)
+
+
+class LEGOIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?lego\.com/(?P<locale>[^/]+)/(?:[^/]+/)*videos/(?:[^/]+/)*[^/?#]+-(?P<id>[0-9a-f]+)'
+    _TESTS = [{
+        'url': 'http://www.lego.com/en-us/videos/themes/club/blocumentary-kawaguchi-55492d823b1b4d5e985787fa8c2973b1',
+        'md5': 'f34468f176cfd76488767fc162c405fa',
+        'info_dict': {
+            'id': '55492d823b1b4d5e985787fa8c2973b1',
+            'ext': 'mp4',
+            'title': 'Blocumentary Great Creations: Akiyuki Kawaguchi',
+            'description': 'Blocumentary Great Creations: Akiyuki Kawaguchi',
+        },
+    }, {
+        # geo-restricted but the contentUrl contains a valid url
+        'url': 'http://www.lego.com/nl-nl/videos/themes/nexoknights/episode-20-kingdom-of-heroes-13bdc2299ab24d9685701a915b3d71e7##sp=399',
+        'md5': '4c3fec48a12e40c6e5995abc3d36cc2e',
+        'info_dict': {
+            'id': '13bdc2299ab24d9685701a915b3d71e7',
+            'ext': 'mp4',
+            'title': 'Aflevering 20 - Helden van het koninkrijk',
+            'description': 'md5:8ee499aac26d7fa8bcb0cedb7f9c3941',
+        },
+    }, {
+        # special characters in title
+        'url': 'http://www.lego.com/en-us/starwars/videos/lego-star-wars-force-surprise-9685ee9d12e84ff38e84b4e3d0db533d',
+        'info_dict': {
+            'id': '9685ee9d12e84ff38e84b4e3d0db533d',
+            'ext': 'mp4',
+            'title': 'Force Surprise – LEGO® Star Wars™ Microfighters',
+            'description': 'md5:9c673c96ce6f6271b88563fe9dc56de3',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+    _BITRATES = [256, 512, 1024, 1536, 2560]
+
+    def _real_extract(self, url):
+        locale, video_id = re.match(self._VALID_URL, url).groups()
+        webpage = self._download_webpage(url, video_id)
+        title = get_element_by_class('video-header', webpage).strip()
+        progressive_base = 'https://lc-mediaplayerns-live-s.legocdn.com/'
+        streaming_base = 'http://legoprod-f.akamaihd.net/'
+        content_url = self._html_search_meta('contentUrl', webpage)
+        path = self._search_regex(
+            r'(?:https?:)?//[^/]+/(?:[iz]/s/)?public/(.+)_[0-9,]+\.(?:mp4|webm)',
+            content_url, 'video path', default=None)
+        if not path:
+            player_url = self._proto_relative_url(self._search_regex(
+                r'<iframe[^>]+src="((?:https?)?//(?:www\.)?lego\.com/[^/]+/mediaplayer/video/[^"]+)',
+                webpage, 'player url', default=None))
+            if not player_url:
+                base_url = self._proto_relative_url(self._search_regex(
+                    r'data-baseurl="([^"]+)"', webpage, 'base url',
+                    default='http://www.lego.com/%s/mediaplayer/video/' % locale))
+                player_url = base_url + video_id
+            player_webpage = self._download_webpage(player_url, video_id)
+            video_data = self._parse_json(unescapeHTML(self._search_regex(
+                r"video='([^']+)'", player_webpage, 'video data')), video_id)
+            progressive_base = self._search_regex(
+                r'data-video-progressive-url="([^"]+)"',
+                player_webpage, 'progressive base', default='https://lc-mediaplayerns-live-s.legocdn.com/')
+            streaming_base = self._search_regex(
+                r'data-video-streaming-url="([^"]+)"',
+                player_webpage, 'streaming base', default='http://legoprod-f.akamaihd.net/')
+            item_id = video_data['ItemId']
+
+            net_storage_path = video_data.get('NetStoragePath') or '/'.join([item_id[:2], item_id[2:4]])
+            base_path = '_'.join([item_id, video_data['VideoId'], video_data['Locale'], compat_str(video_data['VideoVersion'])])
+            path = '/'.join([net_storage_path, base_path])
+        streaming_path = ','.join(map(lambda bitrate: compat_str(bitrate), self._BITRATES))
+
+        formats = self._extract_akamai_formats(
+            '%si/s/public/%s_,%s,.mp4.csmil/master.m3u8' % (streaming_base, path, streaming_path), video_id)
+        m3u8_formats = list(filter(
+            lambda f: f.get('protocol') == 'm3u8_native' and f.get('vcodec') != 'none',
+            formats))
+        if len(m3u8_formats) == len(self._BITRATES):
+            self._sort_formats(m3u8_formats)
+            for bitrate, m3u8_format in zip(self._BITRATES, m3u8_formats):
+                progressive_base_url = '%spublic/%s_%d.'
% (progressive_base, path, bitrate) + mp4_f = m3u8_format.copy() + mp4_f.update({ + 'url': progressive_base_url + 'mp4', + 'format_id': m3u8_format['format_id'].replace('hls', 'mp4'), + 'protocol': 'http', + }) + web_f = { + 'url': progressive_base_url + 'webm', + 'format_id': m3u8_format['format_id'].replace('hls', 'webm'), + 'width': m3u8_format['width'], + 'height': m3u8_format['height'], + 'tbr': m3u8_format.get('tbr'), + 'ext': 'webm', + } + formats.extend([web_f, mp4_f]) + else: + for bitrate in self._BITRATES: + for ext in ('web', 'mp4'): + formats.append({ + 'format_id': '%s-%s' % (ext, bitrate), + 'url': '%spublic/%s_%d.%s' % (progressive_base, path, bitrate, ext), + 'tbr': bitrate, + 'ext': ext, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': self._html_search_meta('description', webpage), + 'thumbnail': self._html_search_meta('thumbnail', webpage), + 'duration': parse_duration(self._html_search_meta('duration', webpage)), + 'formats': formats, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/lemonde.py youtube-dl-2019.05.20/youtube_dl/extractor/lemonde.py --- youtube-dl-2016.02.22/youtube_dl/extractor/lemonde.py 2016-01-23 10:08:46.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/lemonde.py 2019-06-07 18:49:07.000000000 +0000 @@ -7,20 +7,40 @@ _VALID_URL = r'https?://(?:.+?\.)?lemonde\.fr/(?:[^/]+/)*(?P[^/]+)\.html' _TESTS = [{ 'url': 'http://www.lemonde.fr/police-justice/video/2016/01/19/comprendre-l-affaire-bygmalion-en-cinq-minutes_4849702_1653578.html', - 'md5': '01fb3c92de4c12c573343d63e163d302', + 'md5': 'da120c8722d8632eec6ced937536cc98', 'info_dict': { 'id': 'lqm3kl', 'ext': 'mp4', 'title': "Comprendre l'affaire Bygmalion en 5 minutes", - 'thumbnail': 're:^https?://.*\.jpg', - 'duration': 320, + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 309, 'upload_date': '20160119', 'timestamp': 1453194778, 'uploader_id': '3pmkp', }, }, { + # standard iframe embed + 'url': 'http://www.lemonde.fr/les-decodeurs/article/2016/10/18/tout-comprendre-du-ceta-le-petit-cousin-du-traite-transatlantique_5015920_4355770.html', + 'info_dict': { + 'id': 'uzsxms', + 'ext': 'mp4', + 'title': "CETA : quelles suites pour l'accord commercial entre l'Europe et le Canada ?", + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 325, + 'upload_date': '20161021', + 'timestamp': 1477044540, + 'uploader_id': '3pmkp', + }, + 'params': { + 'skip_download': True, + }, + }, { 'url': 'http://redaction.actu.lemonde.fr/societe/video/2016/01/18/calais-debut-des-travaux-de-defrichement-dans-la-jungle_4849233_3224.html', 'only_matching': True, + }, { + # YouTube embeds + 'url': 'http://www.lemonde.fr/pixels/article/2016/12/09/pourquoi-pewdiepie-superstar-de-youtube-a-menace-de-fermer-sa-chaine_5046649_4408996.html', + 'only_matching': True, }] def _real_extract(self, url): @@ -30,5 +50,9 @@ digiteka_url = self._proto_relative_url(self._search_regex( r'url\s*:\s*(["\'])(?P(?:https?://)?//(?:www\.)?(?:digiteka\.net|ultimedia\.com)/deliver/.+?)\1', - webpage, 'digiteka url', group='url')) - return self.url_result(digiteka_url, 'Digiteka') + webpage, 'digiteka url', group='url', default=None)) + + if digiteka_url: + return self.url_result(digiteka_url, 'Digiteka') + + return self.url_result(url, 'Generic') diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/lenta.py youtube-dl-2019.05.20/youtube_dl/extractor/lenta.py --- youtube-dl-2016.02.22/youtube_dl/extractor/lenta.py 1970-01-01 00:00:00.000000000 +0000 +++ 
youtube-dl-2019.05.20/youtube_dl/extractor/lenta.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,53 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class LentaIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?lenta\.ru/[^/]+/\d+/\d+/\d+/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://lenta.ru/news/2018/03/22/savshenko_go/', + 'info_dict': { + 'id': '964400', + 'ext': 'mp4', + 'title': 'Надежду Савченко задержали', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 61, + 'view_count': int, + }, + 'params': { + 'skip_download': True, + }, + }, { + # EaglePlatform iframe embed + 'url': 'http://lenta.ru/news/2015/03/06/navalny/', + 'info_dict': { + 'id': '227304', + 'ext': 'mp4', + 'title': 'Навальный вышел на свободу', + 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 87, + 'view_count': int, + 'age_limit': 0, + }, + 'params': { + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + video_id = self._search_regex( + r'vid\s*:\s*["\']?(\d+)', webpage, 'eagleplatform id', + default=None) + if video_id: + return self.url_result( + 'eagleplatform:lentaru.media.eagleplatform.com:%s' % video_id, + ie='EaglePlatform', video_id=video_id) + + return self.url_result(url, ie='Generic') diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/letv.py youtube-dl-2019.05.20/youtube_dl/extractor/letv.py --- youtube-dl-2016.02.22/youtube_dl/extractor/letv.py 2016-01-31 11:57:01.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/letv.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,356 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import datetime -import re -import time -import base64 -import hashlib - -from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse, - compat_ord, - compat_str, -) -from ..utils import ( - determine_ext, - ExtractorError, - parse_iso8601, - sanitized_Request, - int_or_none, - str_or_none, - encode_data_uri, - url_basename, -) - - -class LetvIE(InfoExtractor): - IE_DESC = '乐视网' - _VALID_URL = r'http://www\.letv\.com/ptv/vplay/(?P\d+).html' - - _TESTS = [{ - 'url': 'http://www.letv.com/ptv/vplay/22005890.html', - 'md5': 'edadcfe5406976f42f9f266057ee5e40', - 'info_dict': { - 'id': '22005890', - 'ext': 'mp4', - 'title': '第87届奥斯卡颁奖礼完美落幕 《鸟人》成最大赢家', - 'description': 'md5:a9cb175fd753e2962176b7beca21a47c', - }, - 'params': { - 'hls_prefer_native': True, - }, - }, { - 'url': 'http://www.letv.com/ptv/vplay/1415246.html', - 'info_dict': { - 'id': '1415246', - 'ext': 'mp4', - 'title': '美人天下01', - 'description': 'md5:f88573d9d7225ada1359eaf0dbf8bcda', - }, - 'params': { - 'hls_prefer_native': True, - }, - }, { - 'note': 'This video is available only in Mainland China, thus a proxy is needed', - 'url': 'http://www.letv.com/ptv/vplay/1118082.html', - 'md5': '2424c74948a62e5f31988438979c5ad1', - 'info_dict': { - 'id': '1118082', - 'ext': 'mp4', - 'title': '与龙共舞 完整版', - 'description': 'md5:7506a5eeb1722bb9d4068f85024e3986', - }, - 'params': { - 'hls_prefer_native': True, - }, - 'skip': 'Only available in China', - }] - - @staticmethod - def urshift(val, n): - return val >> n if val >= 0 else (val + 0x100000000) >> n - - # ror() and calc_time_key() are reversed from a embedded swf file in KLetvPlayer.swf - def ror(self, param1, param2): - _loc3_ = 0 - while _loc3_ < param2: - param1 = self.urshift(param1, 1) + ((param1 & 1) 
<< 31) - _loc3_ += 1 - return param1 - - def calc_time_key(self, param1): - _loc2_ = 773625421 - _loc3_ = self.ror(param1, _loc2_ % 13) - _loc3_ = _loc3_ ^ _loc2_ - _loc3_ = self.ror(_loc3_, _loc2_ % 17) - return _loc3_ - - # see M3U8Encryption class in KLetvPlayer.swf - @staticmethod - def decrypt_m3u8(encrypted_data): - if encrypted_data[:5].decode('utf-8').lower() != 'vc_01': - return encrypted_data - encrypted_data = encrypted_data[5:] - - _loc4_ = bytearray() - while encrypted_data: - b = compat_ord(encrypted_data[0]) - _loc4_.extend([b // 16, b & 0x0f]) - encrypted_data = encrypted_data[1:] - idx = len(_loc4_) - 11 - _loc4_ = _loc4_[idx:] + _loc4_[:idx] - _loc7_ = bytearray() - while _loc4_: - _loc7_.append(_loc4_[0] * 16 + _loc4_[1]) - _loc4_ = _loc4_[2:] - - return bytes(_loc7_) - - def _real_extract(self, url): - media_id = self._match_id(url) - page = self._download_webpage(url, media_id) - params = { - 'id': media_id, - 'platid': 1, - 'splatid': 101, - 'format': 1, - 'tkey': self.calc_time_key(int(time.time())), - 'domain': 'www.letv.com' - } - play_json_req = sanitized_Request( - 'http://api.letv.com/mms/out/video/playJson?' + compat_urllib_parse.urlencode(params) - ) - cn_verification_proxy = self._downloader.params.get('cn_verification_proxy') - if cn_verification_proxy: - play_json_req.add_header('Ytdl-request-proxy', cn_verification_proxy) - - play_json = self._download_json( - play_json_req, - media_id, 'Downloading playJson data') - - # Check for errors - playstatus = play_json['playstatus'] - if playstatus['status'] == 0: - flag = playstatus['flag'] - if flag == 1: - msg = 'Country %s auth error' % playstatus['country'] - else: - msg = 'Generic error. flag = %d' % flag - raise ExtractorError(msg, expected=True) - - playurl = play_json['playurl'] - - formats = ['350', '1000', '1300', '720p', '1080p'] - dispatch = playurl['dispatch'] - - urls = [] - for format_id in formats: - if format_id in dispatch: - media_url = playurl['domain'][0] + dispatch[format_id][0] - media_url += '&' + compat_urllib_parse.urlencode({ - 'm3v': 1, - 'format': 1, - 'expect': 3, - 'rateid': format_id, - }) - - nodes_data = self._download_json( - media_url, media_id, - 'Download JSON metadata for format %s' % format_id) - - req = self._request_webpage( - nodes_data['nodelist'][0]['location'], media_id, - note='Downloading m3u8 information for format %s' % format_id) - - m3u8_data = self.decrypt_m3u8(req.read()) - - url_info_dict = { - 'url': encode_data_uri(m3u8_data, 'application/vnd.apple.mpegurl'), - 'ext': determine_ext(dispatch[format_id][1]), - 'format_id': format_id, - 'protocol': 'm3u8', - } - - if format_id[-1:] == 'p': - url_info_dict['height'] = int_or_none(format_id[:-1]) - - urls.append(url_info_dict) - - publish_time = parse_iso8601(self._html_search_regex( - r'发布时间 ([^<>]+) ', page, 'publish time', default=None), - delimiter=' ', timezone=datetime.timedelta(hours=8)) - description = self._html_search_meta('description', page, fatal=False) - - return { - 'id': media_id, - 'formats': urls, - 'title': playurl['title'], - 'thumbnail': playurl['pic'], - 'description': description, - 'timestamp': publish_time, - } - - -class LetvTvIE(InfoExtractor): - _VALID_URL = r'http://www.letv.com/tv/(?P\d+).html' - _TESTS = [{ - 'url': 'http://www.letv.com/tv/46177.html', - 'info_dict': { - 'id': '46177', - 'title': '美人天下', - 'description': 'md5:395666ff41b44080396e59570dbac01c' - }, - 'playlist_count': 35 - }] - - def _real_extract(self, url): - playlist_id = self._match_id(url) - page = 
self._download_webpage(url, playlist_id) - - media_urls = list(set(re.findall( - r'http://www.letv.com/ptv/vplay/\d+.html', page))) - entries = [self.url_result(media_url, ie='Letv') - for media_url in media_urls] - - title = self._html_search_meta('keywords', page, - fatal=False).split(',')[0] - description = self._html_search_meta('description', page, fatal=False) - - return self.playlist_result(entries, playlist_id, playlist_title=title, - playlist_description=description) - - -class LetvPlaylistIE(LetvTvIE): - _VALID_URL = r'http://tv.letv.com/[a-z]+/(?P[a-z]+)/index.s?html' - _TESTS = [{ - 'url': 'http://tv.letv.com/izt/wuzetian/index.html', - 'info_dict': { - 'id': 'wuzetian', - 'title': '武媚娘传奇', - 'description': 'md5:e12499475ab3d50219e5bba00b3cb248' - }, - # This playlist contains some extra videos other than the drama itself - 'playlist_mincount': 96 - }, { - 'url': 'http://tv.letv.com/pzt/lswjzzjc/index.shtml', - 'info_dict': { - 'id': 'lswjzzjc', - # The title should be "劲舞青春", but I can't find a simple way to - # determine the playlist title - 'title': '乐视午间自制剧场', - 'description': 'md5:b1eef244f45589a7b5b1af9ff25a4489' - }, - 'playlist_mincount': 7 - }] - - -class LetvCloudIE(InfoExtractor): - IE_DESC = '乐视云' - _VALID_URL = r'https?://yuntv\.letv\.com/bcloud.html\?.+' - - _TESTS = [{ - 'url': 'http://yuntv.letv.com/bcloud.html?uu=p7jnfw5hw9&vu=467623dedf', - 'md5': '26450599afd64c513bc77030ad15db44', - 'info_dict': { - 'id': 'p7jnfw5hw9_467623dedf', - 'ext': 'mp4', - 'title': 'Video p7jnfw5hw9_467623dedf', - }, - }, { - 'url': 'http://yuntv.letv.com/bcloud.html?uu=p7jnfw5hw9&vu=ec93197892&pu=2c7cd40209&auto_play=1&gpcflag=1&width=640&height=360', - 'md5': 'e03d9cc8d9c13191e1caf277e42dbd31', - 'info_dict': { - 'id': 'p7jnfw5hw9_ec93197892', - 'ext': 'mp4', - 'title': 'Video p7jnfw5hw9_ec93197892', - }, - }, { - 'url': 'http://yuntv.letv.com/bcloud.html?uu=p7jnfw5hw9&vu=187060b6fd', - 'md5': 'cb988699a776b22d4a41b9d43acfb3ac', - 'info_dict': { - 'id': 'p7jnfw5hw9_187060b6fd', - 'ext': 'mp4', - 'title': 'Video p7jnfw5hw9_187060b6fd', - }, - }] - - @staticmethod - def sign_data(obj): - if obj['cf'] == 'flash': - salt = '2f9d6924b33a165a6d8b5d3d42f4f987' - items = ['cf', 'format', 'ran', 'uu', 'ver', 'vu'] - elif obj['cf'] == 'html5': - salt = 'fbeh5player12c43eccf2bec3300344' - items = ['cf', 'ran', 'uu', 'bver', 'vu'] - input_data = ''.join([item + obj[item] for item in items]) + salt - obj['sign'] = hashlib.md5(input_data.encode('utf-8')).hexdigest() - - def _get_formats(self, cf, uu, vu, media_id): - def get_play_json(cf, timestamp): - data = { - 'cf': cf, - 'ver': '2.2', - 'bver': 'firefox44.0', - 'format': 'json', - 'uu': uu, - 'vu': vu, - 'ran': compat_str(timestamp), - } - self.sign_data(data) - return self._download_json( - 'http://api.letvcloud.com/gpc.php?' 
+ compat_urllib_parse.urlencode(data), - media_id, 'Downloading playJson data for type %s' % cf) - - play_json = get_play_json(cf, time.time()) - # The server time may be different from local time - if play_json.get('code') == 10071: - play_json = get_play_json(cf, play_json['timestamp']) - - if not play_json.get('data'): - if play_json.get('message'): - raise ExtractorError('Letv cloud said: %s' % play_json['message'], expected=True) - elif play_json.get('code'): - raise ExtractorError('Letv cloud returned error %d' % play_json['code'], expected=True) - else: - raise ExtractorError('Letv cloud returned an unknwon error') - - def b64decode(s): - return base64.b64decode(s.encode('utf-8')).decode('utf-8') - - formats = [] - for media in play_json['data']['video_info']['media'].values(): - play_url = media['play_url'] - url = b64decode(play_url['main_url']) - decoded_url = b64decode(url_basename(url)) - formats.append({ - 'url': url, - 'ext': determine_ext(decoded_url), - 'format_id': int_or_none(play_url.get('vtype')), - 'format_note': str_or_none(play_url.get('definition')), - 'width': int_or_none(play_url.get('vwidth')), - 'height': int_or_none(play_url.get('vheight')), - }) - - return formats - - def _real_extract(self, url): - uu_mobj = re.search('uu=([\w]+)', url) - vu_mobj = re.search('vu=([\w]+)', url) - - if not uu_mobj or not vu_mobj: - raise ExtractorError('Invalid URL: %s' % url, expected=True) - - uu = uu_mobj.group(1) - vu = vu_mobj.group(1) - media_id = uu + '_' + vu - - formats = self._get_formats('flash', uu, vu, media_id) + self._get_formats('html5', uu, vu, media_id) - self._sort_formats(formats) - - return { - 'id': media_id, - 'title': 'Video %s' % media_id, - 'formats': formats, - } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/libraryofcongress.py youtube-dl-2019.05.20/youtube_dl/extractor/libraryofcongress.py --- youtube-dl-2016.02.22/youtube_dl/extractor/libraryofcongress.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/libraryofcongress.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,153 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + +from ..utils import ( + determine_ext, + float_or_none, + int_or_none, + parse_filesize, +) + + +class LibraryOfCongressIE(InfoExtractor): + IE_NAME = 'loc' + IE_DESC = 'Library of Congress' + _VALID_URL = r'https?://(?:www\.)?loc\.gov/(?:item/|today/cyberlc/feature_wdesc\.php\?.*\brec=)(?P[0-9a-z_.]+)' + _TESTS = [{ + # embedded via
.+?)\1',
+             r'<video[^>]+id=(["\'])uuid-(?P<id>.+?)\1',
+             r'<video[^>]+data-uuid=(["\'])(?P<id>.+?)\1',
+             r'mediaObjectId\s*:\s*(["\'])(?P<id>.+?)\1',
+             r'data-tab="share-media-(?P<id>[0-9A-F]{32})"'),
+            webpage, 'media id', group='id')
+
+        data = self._download_json(
+            'https://media.loc.gov/services/v1/media?id=%s&context=json' % media_id,
+            media_id)['mediaObject']
+
+        derivative = data['derivatives'][0]
+        media_url = derivative['derivativeUrl']
+
+        title = derivative.get('shortName') or data.get('shortName') or self._og_search_title(
+            webpage)
+
+        # Following algorithm was extracted from setAVSource js function
+        # found in webpage
+        media_url = media_url.replace('rtmp', 'https')
+
+        is_video = data.get('mediaType', 'v').lower() == 'v'
+        ext = determine_ext(media_url)
+        if ext not in ('mp4', 'mp3'):
+            media_url += '.mp4' if is_video else '.mp3'
+
+        formats = []
+        if '/vod/mp4:' in media_url:
+            formats.append({
+                'url': media_url.replace('/vod/mp4:', '/hls-vod/media/') + '.m3u8',
+                'format_id': 'hls',
+                'ext': 'mp4',
+                'protocol': 'm3u8_native',
+                'quality': 1,
+            })
+        http_format = {
+            'url': re.sub(r'(://[^/]+/)(?:[^/]+/)*(?:mp4|mp3):', r'\1', media_url),
+            'format_id': 'http',
+            'quality': 1,
+        }
+        if not is_video:
+            http_format['vcodec'] = 'none'
+        formats.append(http_format)
+
+        download_urls = set()
+        for m in re.finditer(
+                r'<option[^>]+value=(["\'])(?P<url>.+?)\1[^>]+data-file-download=[^>]+>\s*(?P<id>.+?)(?:(?:&nbsp;|\s+)\((?P<size>.+?)\))?\s*<', webpage):
+            format_id = m.group('id').lower()
+            if format_id in ('gif', 'jpeg'):
+                continue
+            download_url = m.group('url')
+            if download_url in download_urls:
+                continue
+            download_urls.add(download_url)
+            formats.append({
+                'url': download_url,
+                'format_id': format_id,
+                'filesize_approx': parse_filesize(m.group('size')),
+            })
+
+        self._sort_formats(formats)
+
+        duration = float_or_none(data.get('duration'))
+        view_count = int_or_none(data.get('viewCount'))
+
+        subtitles = {}
+        cc_url = data.get('ccUrl')
+        if cc_url:
+            subtitles.setdefault('en', []).append({
+                'url': cc_url,
+                'ext': 'ttml',
+            })
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': self._og_search_thumbnail(webpage, default=None),
+            'duration': duration,
+            'view_count': view_count,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/libsyn.py youtube-dl-2019.05.20/youtube_dl/extractor/libsyn.py
--- youtube-dl-2016.02.22/youtube_dl/extractor/libsyn.py	2015-12-30 19:30:33.000000000 +0000
+++ youtube-dl-2019.05.20/youtube_dl/extractor/libsyn.py	2019-06-07 18:49:07.000000000 +0000
@@ -4,21 +4,29 @@
 import re
 
 from .common import InfoExtractor
-from ..utils import unified_strdate
+from ..utils import (
+    clean_html,
+    get_element_by_class,
+    parse_duration,
+    strip_or_none,
+    unified_strdate,
+)
 
 
 class LibsynIE(InfoExtractor):
     _VALID_URL = r'(?P<mainurl>https?://html5-player\.libsyn\.com/embed/episode/id/(?P<id>[0-9]+))'
 
     _TESTS = [{
-        'url': 'http://html5-player.libsyn.com/embed/episode/id/3377616/',
-        'md5': '443360ee1b58007bc3dcf09b41d093bb',
+        'url': 'http://html5-player.libsyn.com/embed/episode/id/6385796/',
+        'md5': '2a55e75496c790cdeb058e7e6c087746',
         'info_dict': {
-            'id': '3377616',
+            'id': '6385796',
             'ext': 'mp3',
-            'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart",
-            'description': 'md5:601cb790edd05908957dae8aaa866465',
-            'upload_date': '20150220',
+            'title': "Champion Minded - Developing a Growth Mindset",
+            # description fetched using another request:
+            # http://html5-player.libsyn.com/embed/getitemdetails?item_id=6385796
+            # 'description': 'In this episode, Allistair talks about the importance of developing a growth mindset, not only in sports, but in life too.',
+            'upload_date': '20180320',
             'thumbnail': 're:^https?://.*',
         },
     }, {
@@ -34,36 +42,52 @@
     }]
 
     def _real_extract(self, url):
-        m = re.match(self._VALID_URL, url)
-        video_id = m.group('id')
-        url = m.group('mainurl')
+        url, video_id = re.match(self._VALID_URL, url).groups()
 
         webpage = self._download_webpage(url, video_id)
 
-        formats = [{
-            'url': media_url,
-        } for media_url in set(re.findall('var\s+mediaURL(?:Libsyn)?\s*=\s*"([^"]+)"', webpage))]
-
-        podcast_title = self._search_regex(
-            r'<h2>([^<]+)</h2>', webpage, 'podcast title', default=None)
-        episode_title = self._search_regex(
-            r'(?:<div class="episode-title">|<h3>)([^<]+)(?:</div>|</h3>)', webpage, 'episode title')
+        data = self._parse_json(self._search_regex(
+            r'var\s+playlistItem\s*=\s*({.+?});',
+            webpage, 'JSON data block'), video_id)
+
+        episode_title = data.get('item_title') or get_element_by_class('episode-title', webpage)
+        if not episode_title:
+            episode_title = self._search_regex(
+                [r'data-title="([^"]+)"', r'<title>(.+?)</title>'],
+                webpage, 'episode title')
+        episode_title = episode_title.strip()
+
+        podcast_title = strip_or_none(clean_html(self._search_regex(
+            r'<h3>([^<]+)</h3>', webpage, 'podcast title',
+            default=None) or get_element_by_class('podcast-title', webpage)))
 
         title = '%s - %s' % (podcast_title, episode_title) if podcast_title else episode_title
 
+        formats = []
+        for k, format_id in (('media_url_libsyn', 'libsyn'), ('media_url', 'main'), ('download_link', 'download')):
+            f_url = data.get(k)
+            if not f_url:
+                continue
+            formats.append({
+                'url': f_url,
+                'format_id': format_id,
+            })
+
         description = self._html_search_regex(
-            r'<div id="info_text_body">(.+?)</div>', webpage,
+            r'<p\s+id="info_text_body">(.+?)</p>', webpage,
             'description', default=None)
-        thumbnail = self._search_regex(
-            r'<img[^>]+class="info-show-icon"[^>]+src="([^"]+)"',
-            webpage, 'thumbnail', fatal=False)
+        if description:
+            # Strip non-breaking and normal spaces
+            description = description.replace('\u00A0', ' ').strip()
 
         release_date = unified_strdate(self._search_regex(
-            r'<div class="release_date">Released: ([^<]+)<', webpage, 'release date', fatal=False))
+            r'<div class="release_date">Released: ([^<]+)<',
+            webpage, 'release date', default=None) or data.get('release_date'))
 
         return {
             'id': video_id,
             'title': title,
             'description': description,
-            'thumbnail': thumbnail,
+            'thumbnail': data.get('thumbnail_url'),
             'upload_date': release_date,
+            'duration': parse_duration(data.get('duration')),
             'formats': formats,
         }
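The libsyn rewrite above replaces HTML scraping with the player's embedded playlistItem JSON blob. That extraction step looks like this in isolation (the sample markup is ours, not a real player page, and strict json stands in for youtube-dl's more forgiving _parse_json):

```python
# Pull the playlistItem object out of a player page and read fields
# from it, mirroring the regex used in the diff above.
import json
import re

html = ('<script>var playlistItem = {"item_title": "Demo", '
        '"media_url": "https://traffic.libsyn.com/demo.mp3"};</script>')

data = json.loads(re.search(
    r'var\s+playlistItem\s*=\s*({.+?});', html).group(1))
print(data['item_title'], data['media_url'])
```

Reading one JSON object is both sturdier and cheaper than the three separate HTML regexes the old code needed.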
diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/lifenews.py youtube-dl-2019.05.20/youtube_dl/extractor/lifenews.py
--- youtube-dl-2016.02.22/youtube_dl/extractor/lifenews.py	2015-12-30 19:30:33.000000000 +0000
+++ youtube-dl-2019.05.20/youtube_dl/extractor/lifenews.py	2019-06-07 18:49:07.000000000 +0000
@@ -1,163 +1,234 @@
-# encoding: utf-8
+# coding: utf-8
 from __future__ import unicode_literals
 
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_urlparse
+from ..compat import (
+    compat_str,
+    compat_urlparse,
+)
 from ..utils import (
     determine_ext,
+    ExtractorError,
     int_or_none,
+    parse_iso8601,
     remove_end,
-    unified_strdate,
-    ExtractorError,
 )
 
 
 class LifeNewsIE(InfoExtractor):
-    IE_NAME = 'lifenews'
-    IE_DESC = 'LIFE | NEWS'
-    _VALID_URL = r'http://lifenews\.ru/(?:mobile/)?(?P<section>
news|video)/(?P\d+)' + IE_NAME = 'life' + IE_DESC = 'Life.ru' + _VALID_URL = r'https?://life\.ru/t/[^/]+/(?P\d+)' _TESTS = [{ - 'url': 'http://lifenews.ru/news/126342', - 'md5': 'e1b50a5c5fb98a6a544250f2e0db570a', + # single video embedded via video/source + 'url': 'https://life.ru/t/новости/98736', + 'md5': '77c95eaefaca216e32a76a343ad89d23', 'info_dict': { - 'id': '126342', + 'id': '98736', 'ext': 'mp4', - 'title': 'МВД разыскивает мужчин, оставивших в IKEA сумку с автоматом', - 'description': 'Камеры наблюдения гипермаркета зафиксировали троих мужчин, спрятавших оружейный арсенал в камере хранения.', - 'thumbnail': 're:http://.*\.jpg', - 'upload_date': '20140130', + 'title': 'Мужчина нашел дома архив оборонного завода', + 'description': 'md5:3b06b1b39b5e2bea548e403d99b8bf26', + 'timestamp': 1344154740, + 'upload_date': '20120805', + 'view_count': int, } }, { - # video in ', webpage): url = self._search_regex( @@ -426,7 +501,11 @@ if url: break - mobj = re.match(self._VALID_URL, url) + if not url: + url = self._og_search_url(webpage) + + mobj = re.match( + self._VALID_URL, self._proto_relative_url(url.strip())) player_id = mobj.group('player_id') if not display_id: @@ -436,15 +515,29 @@ url, display_id, note='Downloading player page', errnote='Could not download player page') video_id = self._search_regex( - r'\d+)' + _TEST = { + 'url': 'http://www.pearvideo.com/video_1076290', + 'info_dict': { + 'id': '1076290', + 'ext': 'mp4', + 'title': '小浣熊在主人家玻璃上滚石头:没砸', + 'description': 'md5:01d576b747de71be0ee85eb7cac25f9d', + 'timestamp': 1494275280, + 'upload_date': '20170508', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + quality = qualities( + ('ldflv', 'ld', 'sdflv', 'sd', 'hdflv', 'hd', 'src')) + + formats = [{ + 'url': mobj.group('url'), + 'format_id': mobj.group('id'), + 'quality': quality(mobj.group('id')), + } for mobj in re.finditer( + r'(?P[a-zA-Z]+)Url\s*=\s*(["\'])(?P(?:https?:)?//.+?)\2', + webpage)] + self._sort_formats(formats) + + title = self._search_regex( + (r']+\bclass=(["\'])video-tt\1[^>]*>(?P[^<]+)', + r'<[^>]+\bdata-title=(["\'])(?P(?:(?!\1).)+)\1'), + webpage, 'title', group='value') + description = self._search_regex( + (r']+\bclass=(["\'])summary\1[^>]*>(?P[^<]+)', + r'<[^>]+\bdata-summary=(["\'])(?P(?:(?!\1).)+)\1'), + webpage, 'description', default=None, + group='value') or self._html_search_meta('Description', webpage) + timestamp = unified_timestamp(self._search_regex( + r']+\bclass=["\']date["\'][^>]*>([^<]+)', + webpage, 'timestamp', fatal=False)) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'formats': formats, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/peertube.py youtube-dl-2019.05.20/youtube_dl/extractor/peertube.py --- youtube-dl-2016.02.22/youtube_dl/extractor/peertube.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/peertube.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,250 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + parse_resolution, + try_get, + unified_timestamp, + url_or_none, + urljoin, +) + + +class PeerTubeIE(InfoExtractor): + _INSTANCES_RE = r'''(?: + # Taken from https://instances.joinpeertube.org/instances + tube\.openalgeria\.org| + peertube\.pointsecu\.fr| + peertube\.nogafa\.org| + peertube\.pl| + 
megatube\.lilomoino\.fr| + peertube\.tamanoir\.foucry\.net| + peertube\.inapurna\.org| + peertube\.netzspielplatz\.de| + video\.deadsuperhero\.com| + peertube\.devosi\.org| + peertube\.1312\.media| + tube\.worldofhauru\.xyz| + tube\.bootlicker\.party| + skeptikon\.fr| + peertube\.geekshell\.fr| + tube\.opportunis\.me| + peertube\.peshane\.net| + video\.blueline\.mg| + tube\.homecomputing\.fr| + videos\.cloudfrancois\.fr| + peertube\.viviers-fibre\.net| + tube\.ouahpiti\.info| + video\.tedomum\.net| + video\.g3l\.org| + fontube\.fr| + peertube\.gaialabs\.ch| + peertube\.extremely\.online| + peertube\.public-infrastructure\.eu| + tube\.kher\.nl| + peertube\.qtg\.fr| + tube\.22decembre\.eu| + facegirl\.me| + video\.migennes\.net| + janny\.moe| + tube\.p2p\.legal| + video\.atlanti\.se| + troll\.tv| + peertube\.geekael\.fr| + vid\.leotindall\.com| + video\.anormallostpod\.ovh| + p-tube\.h3z\.jp| + tube\.darfweb\.eu| + videos\.iut-orsay\.fr| + peertube\.solidev\.net| + videos\.symphonie-of-code\.fr| + testtube\.ortg\.de| + videos\.cemea\.org| + peertube\.gwendalavir\.eu| + video\.passageenseine\.fr| + videos\.festivalparminous\.org| + peertube\.touhoppai\.moe| + peertube\.duckdns\.org| + sikke\.fi| + peertube\.mastodon\.host| + firedragonvideos\.com| + vidz\.dou\.bet| + peertube\.koehn\.com| + peer\.hostux\.social| + share\.tube| + peertube\.walkingmountains\.fr| + medias\.libox\.fr| + peertube\.moe| + peertube\.xyz| + jp\.peertube\.network| + videos\.benpro\.fr| + tube\.otter\.sh| + peertube\.angristan\.xyz| + peertube\.parleur\.net| + peer\.ecutsa\.fr| + peertube\.heraut\.eu| + peertube\.tifox\.fr| + peertube\.maly\.io| + vod\.mochi\.academy| + exode\.me| + coste\.video| + tube\.aquilenet\.fr| + peertube\.gegeweb\.eu| + framatube\.org| + thinkerview\.video| + tube\.conferences-gesticulees\.net| + peertube\.datagueule\.tv| + video\.lqdn\.fr| + meilleurtube\.delire\.party| + tube\.mochi\.academy| + peertube\.dav\.li| + media\.zat\.im| + pytu\.be| + peertube\.valvin\.fr| + peertube\.nsa\.ovh| + video\.colibris-outilslibres\.org| + video\.hispagatos\.org| + tube\.svnet\.fr| + peertube\.video| + videos\.lecygnenoir\.info| + peertube3\.cpy\.re| + peertube2\.cpy\.re| + videos\.tcit\.fr| + peertube\.cpy\.re + )''' + _UUID_RE = r'[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}' + _VALID_URL = r'''(?x) + (?: + peertube:(?P[^:]+):| + https?://(?P%s)/(?:videos/(?:watch|embed)|api/v\d/videos)/ + ) + (?P%s) + ''' % (_INSTANCES_RE, _UUID_RE) + _TESTS = [{ + 'url': 'https://peertube.moe/videos/watch/2790feb0-8120-4e63-9af3-c943c69f5e6c', + 'md5': '80f24ff364cc9d333529506a263e7feb', + 'info_dict': { + 'id': '2790feb0-8120-4e63-9af3-c943c69f5e6c', + 'ext': 'mp4', + 'title': 'wow', + 'description': 'wow such video, so gif', + 'thumbnail': r're:https?://.*\.(?:jpg|png)', + 'timestamp': 1519297480, + 'upload_date': '20180222', + 'uploader': 'Luclu7', + 'uploader_id': '7fc42640-efdb-4505-a45d-a15b1a5496f1', + 'uploder_url': 'https://peertube.nsa.ovh/accounts/luclu7', + 'license': 'Unknown', + 'duration': 3, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'tags': list, + 'categories': list, + } + }, { + 'url': 'https://peertube.tamanoir.foucry.net/videos/watch/0b04f13d-1e18-4f1d-814e-4979aa7c9c44', + 'only_matching': True, + }, { + # nsfw + 'url': 'https://tube.22decembre.eu/videos/watch/9bb88cd3-9959-46d9-9ab9-33d2bb704c39', + 'only_matching': True, + }, { + 'url': 'https://tube.22decembre.eu/videos/embed/fed67262-6edb-4d1c-833b-daa9085c71d7', + 'only_matching': True, + }, 
{ + 'url': 'https://tube.openalgeria.org/api/v1/videos/c1875674-97d0-4c94-a058-3f7e64c962e8', + 'only_matching': True, + }, { + 'url': 'peertube:video.blender.org:b37a5b9f-e6b5-415c-b700-04a5cd6ec205', + 'only_matching': True, + }] + + @staticmethod + def _extract_peertube_url(webpage, source_url): + mobj = re.match( + r'https?://(?P[^/]+)/videos/watch/(?P%s)' + % PeerTubeIE._UUID_RE, source_url) + if mobj and any(p in webpage for p in ( + 'PeerTube<', + 'There will be other non JS-based clients to access PeerTube', + '>We are sorry but it seems that PeerTube is not compatible with your web browser.<')): + return 'peertube:%s:%s' % mobj.group('host', 'id') + + @staticmethod + def _extract_urls(webpage, source_url): + entries = re.findall( + r'''(?x)<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//%s/videos/embed/%s)''' + % (PeerTubeIE._INSTANCES_RE, PeerTubeIE._UUID_RE), webpage) + if not entries: + peertube_url = PeerTubeIE._extract_peertube_url(webpage, source_url) + if peertube_url: + entries = [peertube_url] + return entries + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') or mobj.group('host_2') + video_id = mobj.group('id') + + video = self._download_json( + 'https://%s/api/v1/videos/%s' % (host, video_id), video_id) + + title = video['name'] + + formats = [] + for file_ in video['files']: + if not isinstance(file_, dict): + continue + file_url = url_or_none(file_.get('fileUrl')) + if not file_url: + continue + file_size = int_or_none(file_.get('size')) + format_id = try_get( + file_, lambda x: x['resolution']['label'], compat_str) + f = parse_resolution(format_id) + f.update({ + 'url': file_url, + 'format_id': format_id, + 'filesize': file_size, + }) + formats.append(f) + self._sort_formats(formats) + + def account_data(field): + return try_get(video, lambda x: x['account'][field], compat_str) + + category = try_get(video, lambda x: x['category']['label'], compat_str) + categories = [category] if category else None + + nsfw = video.get('nsfw') + if nsfw is bool: + age_limit = 18 if nsfw else 0 + else: + age_limit = None + + return { + 'id': video_id, + 'title': title, + 'description': video.get('description'), + 'thumbnail': urljoin(url, video.get('thumbnailPath')), + 'timestamp': unified_timestamp(video.get('publishedAt')), + 'uploader': account_data('displayName'), + 'uploader_id': account_data('uuid'), + 'uploder_url': account_data('url'), + 'license': try_get( + video, lambda x: x['licence']['label'], compat_str), + 'duration': int_or_none(video.get('duration')), + 'view_count': int_or_none(video.get('views')), + 'like_count': int_or_none(video.get('likes')), + 'dislike_count': int_or_none(video.get('dislikes')), + 'age_limit': age_limit, + 'tags': try_get(video, lambda x: x['tags'], list), + 'categories': categories, + 'formats': formats, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/people.py youtube-dl-2019.05.20/youtube_dl/extractor/people.py --- youtube-dl-2016.02.22/youtube_dl/extractor/people.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/people.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,32 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class PeopleIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?people\.com/people/videos/0,,(?P<id>\d+),00\.html' + + _TEST = { + 'url': 'http://www.people.com/people/videos/0,,20995451,00.html', + 'info_dict': { + 'id': 'ref:20995451', + 'ext': 'mp4', + 'title': 'Astronaut 
Love Triangle Victim Speaks Out: “The Crime in 2007 Hasn’t Defined Us”', + 'description': 'Colleen Shipman speaks to PEOPLE for the first time about life after the attack', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 246.318, + 'timestamp': 1458720585, + 'upload_date': '20160323', + 'uploader_id': '416418724', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['BrightcoveNew'], + } + + def _real_extract(self, url): + return self.url_result( + 'http://players.brightcove.net/416418724/default_default/index.html?videoId=ref:%s' + % self._match_id(url), 'BrightcoveNew') diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/performgroup.py youtube-dl-2019.05.20/youtube_dl/extractor/performgroup.py --- youtube-dl-2016.02.22/youtube_dl/extractor/performgroup.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/performgroup.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,83 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import int_or_none + + +class PerformGroupIE(InfoExtractor): + _VALID_URL = r'https?://player\.performgroup\.com/eplayer(?:/eplayer\.html|\.js)#/?(?P<id>[0-9a-f]{26})\.(?P<auth_token>[0-9a-z]{26})' + _TESTS = [{ + # http://www.faz.net/aktuell/sport/fussball/wm-2018-playoffs-schweiz-besiegt-nordirland-1-0-15286104.html + 'url': 'http://player.performgroup.com/eplayer/eplayer.html#d478c41c5d192f56b9aa859de8.1w4crrej5w14e1ed4s1ce4ykab', + 'md5': '259cb03d142e2e52471e8837ecacb29f', + 'info_dict': { + 'id': 'xgrwobuzumes1lwjxtcdpwgxd', + 'ext': 'mp4', + 'title': 'Liga MX: Keine Einsicht nach Horrorfoul', + 'description': 'md5:7cd3b459c82725b021e046ab10bf1c5b', + 'timestamp': 1511533477, + 'upload_date': '20171124', + } + }] + + def _call_api(self, service, auth_token, content_id, referer_url): + return self._download_json( + 'http://ep3.performfeeds.com/ep%s/%s/%s/' % (service, auth_token, content_id), + content_id, headers={ + 'Referer': referer_url, + 'Origin': 'http://player.performgroup.com', + }, query={ + '_fmt': 'json', + }) + + def _real_extract(self, url): + player_id, auth_token = re.search(self._VALID_URL, url).groups() + bootstrap = self._call_api('bootstrap', auth_token, player_id, url) + video = bootstrap['config']['dataSource']['sourceItems'][0]['videos'][0] + video_id = video['uuid'] + vod = self._call_api('vod', auth_token, video_id, url) + media = vod['videos']['video'][0]['media'] + + formats = [] + hls_url = media.get('hls', {}).get('url') + if hls_url: + formats.extend(self._extract_m3u8_formats(hls_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + + hds_url = media.get('hds', {}).get('url') + if hds_url: + formats.extend(self._extract_f4m_formats(hds_url + '?hdcore', video_id, f4m_id='hds', fatal=False)) + + for c in media.get('content', []): + c_url = c.get('url') + if not c_url: + continue + tbr = int_or_none(c.get('bitrate'), 1000) + format_id = 'http' + if tbr: + format_id += '-%d' % tbr + formats.append({ + 'format_id': format_id, + 'url': c_url, + 'tbr': tbr, + 'width': int_or_none(c.get('width')), + 'height': int_or_none(c.get('height')), + 'filesize': int_or_none(c.get('fileSize')), + 'vcodec': c.get('type'), + 'fps': int_or_none(c.get('videoFrameRate')), + 'vbr': int_or_none(c.get('videoRate'), 1000), + 'abr': int_or_none(c.get('audioRate'), 1000), + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video['title'], + 'description': video.get('description'), + 'thumbnail': 
video.get('poster'), + 'duration': int_or_none(video.get('duration')), + 'timestamp': int_or_none(video.get('publishedTime'), 1000), + 'formats': formats, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/periscope.py youtube-dl-2019.05.20/youtube_dl/extractor/periscope.py --- youtube-dl-2016.02.22/youtube_dl/extractor/periscope.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/periscope.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,13 +1,27 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor -from ..utils import parse_iso8601 +from ..utils import ( + int_or_none, + parse_iso8601, + unescapeHTML, +) + + +class PeriscopeBaseIE(InfoExtractor): + def _call_api(self, method, query, item_id): + return self._download_json( + 'https://api.periscope.tv/api/v2/%s' % method, + item_id, query=query) -class PeriscopeIE(InfoExtractor): +class PeriscopeIE(PeriscopeBaseIE): IE_DESC = 'Periscope' - _VALID_URL = r'https?://(?:www\.)?periscope\.tv/[^/]+/(?P<id>[^/?#]+)' + IE_NAME = 'periscope' + _VALID_URL = r'https?://(?:www\.)?(?:periscope|pscp)\.tv/[^/]+/(?P<id>[^/?#]+)' # Alive example URLs can be found here http://onperiscope.com/ _TESTS = [{ 'url': 'https://www.periscope.tv/w/aJUQnjY3MjA3ODF8NTYxMDIyMDl2zCg2pECBgwTqRpQuQD352EMPTKQjT4uqlM3cgWFA-g==', @@ -28,23 +42,31 @@ }, { 'url': 'https://www.periscope.tv/bastaakanoggano/1OdKrlkZZjOJX', 'only_matching': True, + }, { + 'url': 'https://www.periscope.tv/w/1ZkKzPbMVggJv', + 'only_matching': True, }] - def _call_api(self, method, value): - return self._download_json( - 'https://api.periscope.tv/api/v2/%s?broadcast_id=%s' % (method, value), value) + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+src=([\'"])(?P<url>(?:https?:)?//(?:www\.)?(?:periscope|pscp)\.tv/(?:(?!\1).)+)\1', webpage) + if mobj: + return mobj.group('url') def _real_extract(self, url): token = self._match_id(url) - broadcast_data = self._call_api('getBroadcastPublic', token) - broadcast = broadcast_data['broadcast'] - status = broadcast['status'] + stream = self._call_api( + 'accessVideoPublic', {'broadcast_id': token}, token) + + broadcast = stream['broadcast'] + title = broadcast['status'] - uploader = broadcast.get('user_display_name') or broadcast_data.get('user', {}).get('display_name') - uploader_id = broadcast.get('user_id') or broadcast_data.get('user', {}).get('id') + uploader = broadcast.get('user_display_name') or broadcast.get('username') + uploader_id = (broadcast.get('user_id') or broadcast.get('username')) - title = '%s - %s' % (uploader, status) if uploader else status + title = '%s - %s' % (uploader, title) if uploader else title state = broadcast.get('state').lower() if state == 'running': title = self._live_title(title) @@ -54,20 +76,37 @@ 'url': broadcast[image], } for image in ('image_url', 'image_url_small') if broadcast.get(image)] - stream = self._call_api('getAccessPublic', token) + width = int_or_none(broadcast.get('width')) + height = int_or_none(broadcast.get('height')) + def add_width_and_height(f): + for key, val in (('width', width), ('height', height)): + if not f.get(key): + f[key] = val + + video_urls = set() formats = [] - for format_id in ('replay', 'rtmp', 'hls', 'https_hls'): + for format_id in ('replay', 'rtmp', 'hls', 'https_hls', 'lhls', 'lhlsweb'): video_url = stream.get(format_id + '_url') - if not video_url: + if not video_url or video_url in video_urls: continue - f = { + video_urls.add(video_url) + if format_id != 
'rtmp': + m3u8_formats = self._extract_m3u8_formats( + video_url, token, 'mp4', + entry_protocol='m3u8_native' + if state in ('ended', 'timed_out') else 'm3u8', + m3u8_id=format_id, fatal=False) + if len(m3u8_formats) == 1: + add_width_and_height(m3u8_formats[0]) + formats.extend(m3u8_formats) + continue + rtmp_format = { 'url': video_url, 'ext': 'flv' if format_id == 'rtmp' else 'mp4', } - if format_id != 'rtmp': - f['protocol'] = 'm3u8_native' if state == 'ended' else 'm3u8' - formats.append(f) + add_width_and_height(rtmp_format) + formats.append(rtmp_format) self._sort_formats(formats) return { @@ -79,3 +118,54 @@ 'thumbnails': thumbnails, 'formats': formats, } + + +class PeriscopeUserIE(PeriscopeBaseIE): + _VALID_URL = r'https?://(?:www\.)?(?:periscope|pscp)\.tv/(?P<id>[^/]+)/?$' + IE_DESC = 'Periscope user videos' + IE_NAME = 'periscope:user' + + _TEST = { + 'url': 'https://www.periscope.tv/LularoeHusbandMike/', + 'info_dict': { + 'id': 'LularoeHusbandMike', + 'title': 'LULAROE HUSBAND MIKE', + 'description': 'md5:6cf4ec8047768098da58e446e82c82f0', + }, + # Periscope only shows videos in the last 24 hours, so it's possible to + # get 0 videos + 'playlist_mincount': 0, + } + + def _real_extract(self, url): + user_name = self._match_id(url) + + webpage = self._download_webpage(url, user_name) + + data_store = self._parse_json( + unescapeHTML(self._search_regex( + r'data-store=(["\'])(?P<data>.+?)\1', + webpage, 'data store', default='{}', group='data')), + user_name) + + user = list(data_store['UserCache']['users'].values())[0]['user'] + user_id = user['id'] + session_id = data_store['SessionToken']['public']['broadcastHistory']['token']['session_id'] + + broadcasts = self._call_api( + 'getUserBroadcastsPublic', + {'user_id': user_id, 'session_id': session_id}, + user_name)['broadcasts'] + + broadcast_ids = [ + broadcast['id'] for broadcast in broadcasts if broadcast.get('id')] + + title = user.get('display_name') or user.get('username') or user_name + description = user.get('description') + + entries = [ + self.url_result( + 'https://www.periscope.tv/%s/%s' % (user_name, broadcast_id)) + for broadcast_id in broadcast_ids] + + return self.playlist_result(entries, user_id, title, description) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/philharmoniedeparis.py youtube-dl-2019.05.20/youtube_dl/extractor/philharmoniedeparis.py --- youtube-dl-2016.02.22/youtube_dl/extractor/philharmoniedeparis.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/philharmoniedeparis.py 2019-06-07 18:49:07.000000000 +0000 @@ -2,31 +2,38 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( - float_or_none, - int_or_none, - parse_iso8601, - xpath_text, + try_get, + urljoin, ) class PhilharmonieDeParisIE(InfoExtractor): IE_DESC = 'Philharmonie de Paris' - _VALID_URL = r'http://live\.philharmoniedeparis\.fr/(?:[Cc]oncert/|misc/Playlist\.ashx\?id=)(?P<id>\d+)' + _VALID_URL = r'''(?x) + https?:// + (?: + live\.philharmoniedeparis\.fr/(?:[Cc]oncert/|misc/Playlist\.ashx\?id=)| + pad\.philharmoniedeparis\.fr/doc/CIMU/ + ) + (?P<id>\d+) + ''' _TESTS = [{ + 'url': 'http://pad.philharmoniedeparis.fr/doc/CIMU/1086697/jazz-a-la-villette-knower', + 'md5': 'a0a4b195f544645073631cbec166a2c2', + 'info_dict': { + 'id': '1086697', + 'ext': 'mp4', + 'title': 'Jazz à la Villette : Knower', + }, + }, { 'url': 'http://live.philharmoniedeparis.fr/concert/1032066.html', 'info_dict': { 'id': '1032066', - 'ext': 'flv', 
- 'title': 'md5:d1f5585d87d041d07ce9434804bc8425', - 'timestamp': 1428179400, - 'upload_date': '20150404', - 'duration': 6592.278, + 'title': 'md5:0a031b81807b3593cffa3c9a87a167a0', }, - 'params': { - # rtmp download - 'skip_download': True, - } + 'playlist_mincount': 2, }, { 'url': 'http://live.philharmoniedeparis.fr/Concert/1030324.html', 'only_matching': True, @@ -34,45 +41,60 @@ 'url': 'http://live.philharmoniedeparis.fr/misc/Playlist.ashx?id=1030324&track=&lang=fr', 'only_matching': True, }] + _LIVE_URL = 'https://live.philharmoniedeparis.fr' def _real_extract(self, url): video_id = self._match_id(url) - concert = self._download_xml( - 'http://live.philharmoniedeparis.fr/misc/Playlist.ashx?id=%s' % video_id, - video_id).find('./concert') - - formats = [] - info_dict = { - 'id': video_id, - 'title': xpath_text(concert, './titre', 'title', fatal=True), - 'formats': formats, - } - - fichiers = concert.find('./fichiers') - stream = fichiers.attrib['serveurstream'] - for fichier in fichiers.findall('./fichier'): - info_dict['duration'] = float_or_none(fichier.get('timecodefin')) - for quality, (format_id, suffix) in enumerate([('lq', ''), ('hq', '_hd')]): - format_url = fichier.get('url%s' % suffix) - if not format_url: + config = self._download_json( + '%s/otoPlayer/config.ashx' % self._LIVE_URL, video_id, query={ + 'id': video_id, + 'lang': 'fr-FR', + }) + + def extract_entry(source): + if not isinstance(source, dict): + return + title = source.get('title') + if not title: + return + files = source.get('files') + if not isinstance(files, dict): + return + format_urls = set() + formats = [] + for format_id in ('mobile', 'desktop'): + format_url = try_get( + files, lambda x: x[format_id]['file'], compat_str) + if not format_url or format_url in format_urls: continue - formats.append({ - 'url': stream, - 'play_path': format_url, - 'ext': 'flv', - 'format_id': format_id, - 'width': int_or_none(concert.get('largeur%s' % suffix)), - 'height': int_or_none(concert.get('hauteur%s' % suffix)), - 'quality': quality, - }) - self._sort_formats(formats) - - date, hour = concert.get('date'), concert.get('heure') - if date and hour: - info_dict['timestamp'] = parse_iso8601( - '%s-%s-%sT%s:00' % (date[0:4], date[4:6], date[6:8], hour)) - elif date: - info_dict['upload_date'] = date + format_urls.add(format_url) + m3u8_url = urljoin(self._LIVE_URL, format_url) + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + if not formats: + return + self._sort_formats(formats) + return { + 'title': title, + 'formats': formats, + } + + thumbnail = urljoin(self._LIVE_URL, config.get('image')) + + info = extract_entry(config) + if info: + info.update({ + 'id': video_id, + 'thumbnail': thumbnail, + }) + return info + + entries = [] + for num, chapter in enumerate(config['chapters'], start=1): + entry = extract_entry(chapter) + entry['id'] = '%s-%d' % (video_id, num) + entries.append(entry) - return info_dict + return self.playlist_result(entries, video_id, config.get('title')) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/phoenix.py youtube-dl-2019.05.20/youtube_dl/extractor/phoenix.py --- youtube-dl-2016.02.22/youtube_dl/extractor/phoenix.py 2015-12-31 15:50:46.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/phoenix.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,9 +1,9 @@ from __future__ import unicode_literals -from .zdf import ZDFIE +from .dreisat import DreiSatIE -class PhoenixIE(ZDFIE): +class 
PhoenixIE(DreiSatIE): IE_NAME = 'phoenix.de' _VALID_URL = r'''(?x)https?://(?:www\.)?phoenix\.de/content/ (?: diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/photobucket.py youtube-dl-2019.05.20/youtube_dl/extractor/photobucket.py --- youtube-dl-2016.02.22/youtube_dl/extractor/photobucket.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/photobucket.py 2019-06-07 18:49:07.000000000 +0000 @@ -8,7 +8,7 @@ class PhotobucketIE(InfoExtractor): - _VALID_URL = r'http://(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))' + _VALID_URL = r'https?://(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))' _TEST = { 'url': 'http://media.photobucket.com/user/rachaneronas/media/TiredofLinkBuildingTryBacklinkMyDomaincom_zpsc0c3b9fa.mp4.html?filters[term]=search&filters[primary]=videos&filters[secondary]=images&sort=1&o=0', 'md5': '7dabfb92b0a31f6c16cebc0f8e60ff99', diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/picarto.py youtube-dl-2019.05.20/youtube_dl/extractor/picarto.py --- youtube-dl-2016.02.22/youtube_dl/extractor/picarto.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/picarto.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,153 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import time + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + js_to_json, + try_get, + update_url_query, + urlencode_postdata, +) + + +class PicartoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www.)?picarto\.tv/(?P<id>[a-zA-Z0-9]+)(?:/(?P<token>[a-zA-Z0-9]+))?' + _TEST = { + 'url': 'https://picarto.tv/Setz', + 'info_dict': { + 'id': 'Setz', + 'ext': 'mp4', + 'title': 're:^Setz [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'timestamp': int, + 'is_live': True + }, + 'skip': 'Stream is offline', + } + + @classmethod + def suitable(cls, url): + return False if PicartoVodIE.suitable(url) else super(PicartoIE, cls).suitable(url) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + channel_id = mobj.group('id') + + metadata = self._download_json( + 'https://api.picarto.tv/v1/channel/name/' + channel_id, + channel_id) + + if metadata.get('online') is False: + raise ExtractorError('Stream is offline', expected=True) + + cdn_data = self._download_json( + 'https://picarto.tv/process/channel', channel_id, + data=urlencode_postdata({'loadbalancinginfo': channel_id}), + note='Downloading load balancing info') + + token = mobj.group('token') or 'public' + params = { + 'con': int(time.time() * 1000), + 'token': token, + } + + prefered_edge = cdn_data.get('preferedEdge') + formats = [] + + for edge in cdn_data['edges']: + edge_ep = edge.get('ep') + if not edge_ep or not isinstance(edge_ep, compat_str): + continue + edge_id = edge.get('id') + for tech in cdn_data['techs']: + tech_label = tech.get('label') + tech_type = tech.get('type') + preference = 0 + if edge_id == prefered_edge: + preference += 1 + format_id = [] + if edge_id: + format_id.append(edge_id) + if tech_type == 'application/x-mpegurl' or tech_label == 'HLS': + format_id.append('hls') + formats.extend(self._extract_m3u8_formats( + update_url_query( + 'https://%s/hls/%s/index.m3u8' + % (edge_ep, channel_id), params), + channel_id, 'mp4', preference=preference, + m3u8_id='-'.join(format_id), fatal=False)) + continue + elif tech_type == 'video/mp4' or tech_label == 'MP4': + format_id.append('mp4') + 
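+ # Illustrative sketch of the direct MP4 URL assembled just below, using a
+ # hypothetical edge endpoint "edge1.picarto.tv" and the test channel Setz:
+ #   https://edge1.picarto.tv/mp4/Setz.mp4?con=<ms-timestamp>&token=public
+ # (the query string comes from the `params` dict built above)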
formats.append({ + 'url': update_url_query( + 'https://%s/mp4/%s.mp4' % (edge_ep, channel_id), + params), + 'format_id': '-'.join(format_id), + 'preference': preference, + }) + else: + # rtmp format does not seem to work + continue + self._sort_formats(formats) + + mature = metadata.get('adult') + if mature is None: + age_limit = None + else: + age_limit = 18 if mature is True else 0 + + return { + 'id': channel_id, + 'title': self._live_title(metadata.get('title') or channel_id), + 'is_live': True, + 'thumbnail': try_get(metadata, lambda x: x['thumbnails']['web']), + 'channel': channel_id, + 'channel_url': 'https://picarto.tv/%s' % channel_id, + 'age_limit': age_limit, + 'formats': formats, + } + + +class PicartoVodIE(InfoExtractor): + _VALID_URL = r'https?://(?:www.)?picarto\.tv/videopopout/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://picarto.tv/videopopout/ArtofZod_2017.12.12.00.13.23.flv', + 'md5': '3ab45ba4352c52ee841a28fb73f2d9ca', + 'info_dict': { + 'id': 'ArtofZod_2017.12.12.00.13.23.flv', + 'ext': 'mp4', + 'title': 'ArtofZod_2017.12.12.00.13.23.flv', + 'thumbnail': r're:^https?://.*\.jpg' + }, + }, { + 'url': 'https://picarto.tv/videopopout/Plague', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + vod_info = self._parse_json( + self._search_regex( + r'(?s)#vod-player["\']\s*,\s*(\{.+?\})\s*\)', webpage, + video_id), + video_id, transform_source=js_to_json) + + formats = self._extract_m3u8_formats( + vod_info['vod'], video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_id, + 'thumbnail': vod_info.get('vodThumb'), + 'formats': formats, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/piksel.py youtube-dl-2019.05.20/youtube_dl/extractor/piksel.py --- youtube-dl-2016.02.22/youtube_dl/extractor/piksel.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/piksel.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,123 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + dict_get, + int_or_none, + unescapeHTML, + parse_iso8601, +) + + +class PikselIE(InfoExtractor): + _VALID_URL = r'https?://player\.piksel\.com/v/(?P<id>[a-z0-9]+)' + _TESTS = [ + { + 'url': 'http://player.piksel.com/v/nv60p12f', + 'md5': 'd9c17bbe9c3386344f9cfd32fad8d235', + 'info_dict': { + 'id': 'nv60p12f', + 'ext': 'mp4', + 'title': 'فن الحياة - الحلقة 1', + 'description': 'احدث برامج الداعية الاسلامي " مصطفي حسني " فى رمضان 2016علي النهار نور', + 'timestamp': 1465231790, + 'upload_date': '20160606', + } + }, + { + # Original source: http://www.uscourts.gov/cameras-courts/state-washington-vs-donald-j-trump-et-al + 'url': 'https://player.piksel.com/v/v80kqp41', + 'md5': '753ddcd8cc8e4fa2dda4b7be0e77744d', + 'info_dict': { + 'id': 'v80kqp41', + 'ext': 'mp4', + 'title': 'WAW- State of Washington vs. Donald J. Trump, et al', + 'description': 'State of Washington vs. Donald J. Trump, et al, Case Number 17-CV-00141-JLR, TRO Hearing, Civil Rights Case, 02/3/2017, 1:00 PM (PST), Seattle Federal Courthouse, Seattle, WA, Judge James L. 
Robart presiding.', + 'timestamp': 1486171129, + 'upload_date': '20170204', + } + } + ] + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+src=["\'](?P<url>(?:https?:)?//player\.piksel\.com/v/[a-z0-9]+)', + webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + app_token = self._search_regex([ + r'clientAPI\s*:\s*"([^"]+)"', + r'data-de-api-key\s*=\s*"([^"]+)"' + ], webpage, 'app token') + response = self._download_json( + 'http://player.piksel.com/ws/ws_program/api/%s/mode/json/apiv/5' % app_token, + video_id, query={ + 'v': video_id + })['response'] + failure = response.get('failure') + if failure: + raise ExtractorError(response['failure']['reason'], expected=True) + video_data = response['WsProgramResponse']['program']['asset'] + title = video_data['title'] + + formats = [] + + m3u8_url = dict_get(video_data, [ + 'm3u8iPadURL', + 'ipadM3u8Url', + 'm3u8AndroidURL', + 'm3u8iPhoneURL', + 'iphoneM3u8Url']) + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + + asset_type = dict_get(video_data, ['assetType', 'asset_type']) + for asset_file in video_data.get('assetFiles', []): + # TODO: extract rtmp formats + http_url = asset_file.get('http_url') + if not http_url: + continue + tbr = None + vbr = int_or_none(asset_file.get('videoBitrate'), 1024) + abr = int_or_none(asset_file.get('audioBitrate'), 1024) + if asset_type == 'video': + tbr = vbr + abr + elif asset_type == 'audio': + tbr = abr + + format_id = ['http'] + if tbr: + format_id.append(compat_str(tbr)) + + formats.append({ + 'format_id': '-'.join(format_id), + 'url': unescapeHTML(http_url), + 'vbr': vbr, + 'abr': abr, + 'width': int_or_none(asset_file.get('videoWidth')), + 'height': int_or_none(asset_file.get('videoHeight')), + 'filesize': int_or_none(asset_file.get('filesize')), + 'tbr': tbr, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('description'), + 'thumbnail': video_data.get('thumbnailUrl'), + 'timestamp': parse_iso8601(video_data.get('dateadd')), + 'formats': formats, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/pinkbike.py youtube-dl-2019.05.20/youtube_dl/extractor/pinkbike.py --- youtube-dl-2016.02.22/youtube_dl/extractor/pinkbike.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/pinkbike.py 2019-06-07 18:49:07.000000000 +0000 @@ -23,7 +23,7 @@ 'ext': 'mp4', 'title': 'Brandon Semenuk - RAW 100', 'description': 'Official release: www.redbull.ca/rupertwalker', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 100, 'upload_date': '20150406', 'uploader': 'revelco', @@ -64,7 +64,8 @@ 'video:duration', webpage, 'duration')) uploader = self._search_regex( - r'un:\s*"([^"]+)"', webpage, 'uploader', fatal=False) + r'<a[^>]+\brel=["\']author[^>]+>([^<]+)', webpage, + 'uploader', fatal=False) upload_date = unified_strdate(self._search_regex( r'class="fullTime"[^>]+title="([^"]+)"', webpage, 'upload date', fatal=False)) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/pladform.py youtube-dl-2019.05.20/youtube_dl/extractor/pladform.py --- youtube-dl-2016.02.22/youtube_dl/extractor/pladform.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/pladform.py 2019-06-07 18:49:07.000000000 +0000 @@ -4,7 +4,9 @@ import re from 
.common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( + determine_ext, ExtractorError, int_or_none, xpath_text, @@ -26,17 +28,15 @@ (?P<id>\d+) ''' _TESTS = [{ - # http://muz-tv.ru/kinozal/view/7400/ - 'url': 'http://out.pladform.ru/player?pl=24822&videoid=100183293', - 'md5': '61f37b575dd27f1bb2e1854777fe31f4', + 'url': 'https://out.pladform.ru/player?pl=64471&videoid=3777899&vk_puid15=0&vk_puid34=0', + 'md5': '53362fac3a27352da20fa2803cc5cd6f', 'info_dict': { - 'id': '100183293', + 'id': '3777899', 'ext': 'mp4', - 'title': 'Тайны перевала Дятлова • 1 серия 2 часть', - 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века', - 'thumbnail': 're:^https?://.*\.jpg$', - 'duration': 694, - 'age_limit': 0, + 'title': 'СТУДИЯ СОЮЗ • Шоу Студия Союз, 24 выпуск (01.02.2018) Нурлан Сабуров и Слава Комиссаренко', + 'description': 'md5:05140e8bf1b7e2d46e7ba140be57fd95', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 3190, }, }, { 'url': 'http://static.pladform.ru/player.swf?pl=21469&videoid=100183293&vkcid=0', @@ -49,29 +49,55 @@ @staticmethod def _extract_url(webpage): mobj = re.search( - r'<iframe[^>]+src="(?P<url>(?:https?:)?//out\.pladform\.ru/player\?.+?)"', webpage) + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//out\.pladform\.ru/player\?.+?)\1', webpage) if mobj: return mobj.group('url') def _real_extract(self, url): video_id = self._match_id(url) + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + pl = qs.get('pl', ['1'])[0] + video = self._download_xml( - 'http://out.pladform.ru/getVideo?pl=1&videoid=%s' % video_id, - video_id) + 'http://out.pladform.ru/getVideo', video_id, query={ + 'pl': pl, + 'videoid': video_id, + }) - if video.tag == 'error': + def fail(text): raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, video.text), + '%s returned error: %s' % (self.IE_NAME, text), expected=True) + if video.tag == 'error': + fail(video.text) + quality = qualities(('ld', 'sd', 'hd')) - formats = [{ - 'url': src.text, - 'format_id': src.get('quality'), - 'quality': quality(src.get('quality')), - } for src in video.findall('./src')] + formats = [] + for src in video.findall('./src'): + if src is None: + continue + format_url = src.text + if not format_url: + continue + if src.get('type') == 'hls' or determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': src.text, + 'format_id': src.get('quality'), + 'quality': quality(src.get('quality')), + }) + + if not formats: + error = xpath_text(video, './cap', 'error', default=None) + if error: + fail(error) + self._sort_formats(formats) webpage = self._download_webpage( diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/planetaplay.py youtube-dl-2019.05.20/youtube_dl/extractor/planetaplay.py --- youtube-dl-2016.02.22/youtube_dl/extractor/planetaplay.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/planetaplay.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,61 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ExtractorError - - -class PlanetaPlayIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?planetaplay\.com/\?sng=(?P<id>[0-9]+)' - _API_URL = 'http://planetaplay.com/action/playlist/?sng={0:}' - _THUMBNAIL_URL = 'http://planetaplay.com/img/thumb/{thumb:}' - _TEST = { - 
'url': 'http://planetaplay.com/?sng=3586', - 'md5': '9d569dceb7251a4e01355d5aea60f9db', - 'info_dict': { - 'id': '3586', - 'ext': 'flv', - 'title': 'md5:e829428ee28b1deed00de90de49d1da1', - }, - 'skip': 'Not accessible from Travis CI server', - } - - _SONG_FORMATS = { - 'lq': (0, 'http://www.planetaplay.com/videoplayback/{med_hash:}'), - 'hq': (1, 'http://www.planetaplay.com/videoplayback/hi/{med_hash:}'), - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - response = self._download_json( - self._API_URL.format(video_id), video_id)['response'] - try: - data = response.get('data')[0] - except IndexError: - raise ExtractorError( - '%s: failed to get the playlist' % self.IE_NAME, expected=True) - - title = '{song_artists:} - {sng_name:}'.format(**data) - thumbnail = self._THUMBNAIL_URL.format(**data) - - formats = [] - for format_id, (quality, url_template) in self._SONG_FORMATS.items(): - formats.append({ - 'format_id': format_id, - 'url': url_template.format(**data), - 'quality': quality, - 'ext': 'flv', - }) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'thumbnail': thumbnail, - } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/platzi.py youtube-dl-2019.05.20/youtube_dl/extractor/platzi.py --- youtube-dl-2016.02.22/youtube_dl/extractor/platzi.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/platzi.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,217 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_b64decode, + compat_str, +) +from ..utils import ( + clean_html, + ExtractorError, + int_or_none, + str_or_none, + try_get, + url_or_none, + urlencode_postdata, + urljoin, +) + + +class PlatziIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?: + platzi\.com/clases| # es version + courses\.platzi\.com/classes # en version + )/[^/]+/(?P<id>\d+)-[^/?\#&]+ + ''' + _LOGIN_URL = 'https://platzi.com/login/' + _NETRC_MACHINE = 'platzi' + + _TESTS = [{ + 'url': 'https://platzi.com/clases/1311-next-js/12074-creando-nuestra-primera-pagina/', + 'md5': '8f56448241005b561c10f11a595b37e3', + 'info_dict': { + 'id': '12074', + 'ext': 'mp4', + 'title': 'Creando nuestra primera página', + 'description': 'md5:4c866e45034fc76412fbf6e60ae008bc', + 'duration': 420, + }, + 'skip': 'Requires platzi account credentials', + }, { + 'url': 'https://courses.platzi.com/classes/1367-communication-codestream/13430-background/', + 'info_dict': { + 'id': '13430', + 'ext': 'mp4', + 'title': 'Background', + 'description': 'md5:49c83c09404b15e6e71defaf87f6b305', + 'duration': 360, + }, + 'skip': 'Requires platzi account credentials', + 'params': { + 'skip_download': True, + }, + }] + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login page') + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'email': username, + 'password': password, + }) + + urlh = self._request_webpage( + self._LOGIN_URL, None, 'Logging in', + data=urlencode_postdata(login_form), + headers={'Referer': self._LOGIN_URL}) + + # login succeeded + if 'platzi.com/login' not in compat_str(urlh.geturl()): + return + + login_error = self._webpage_read_content( + urlh, self._LOGIN_URL, None, 'Downloading login error page') + + 
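+ # Hypothetical sketch (not captured from the site) of the JavaScript object
+ # the regex below extracts; the key names follow from the '%sError' lookups
+ # a few lines further down:
+ #   login = {"errorError": null, "passwordError": "Invalid password", "nonFieldsError": null};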
login = self._parse_json( + self._search_regex( + r'login\s*=\s*({.+?})(?:\s*;|\s*</script)', login_error, 'login'), + None) + + for kind in ('error', 'password', 'nonFields'): + error = str_or_none(login.get('%sError' % kind)) + if error: + raise ExtractorError( + 'Unable to login: %s' % error, expected=True) + raise ExtractorError('Unable to log in') + + def _real_extract(self, url): + lecture_id = self._match_id(url) + + webpage = self._download_webpage(url, lecture_id) + + data = self._parse_json( + self._search_regex( + r'client_data\s*=\s*({.+?})\s*;', webpage, 'client data'), + lecture_id) + + material = data['initialState']['material'] + desc = material['description'] + title = desc['title'] + + formats = [] + for server_id, server in material['videos'].items(): + if not isinstance(server, dict): + continue + for format_id in ('hls', 'dash'): + format_url = url_or_none(server.get(format_id)) + if not format_url: + continue + if format_id == 'hls': + formats.extend(self._extract_m3u8_formats( + format_url, lecture_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id=format_id, + note='Downloading %s m3u8 information' % server_id, + fatal=False)) + elif format_id == 'dash': + formats.extend(self._extract_mpd_formats( + format_url, lecture_id, mpd_id=format_id, + note='Downloading %s MPD manifest' % server_id, + fatal=False)) + self._sort_formats(formats) + + content = str_or_none(desc.get('content')) + description = (clean_html(compat_b64decode(content).decode('utf-8')) + if content else None) + duration = int_or_none(material.get('duration'), invscale=60) + + return { + 'id': lecture_id, + 'title': title, + 'description': description, + 'duration': duration, + 'formats': formats, + } + + +class PlatziCourseIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?: + platzi\.com/clases| # es version + courses\.platzi\.com/classes # en version + )/(?P<id>[^/?\#&]+) + ''' + _TESTS = [{ + 'url': 'https://platzi.com/clases/next-js/', + 'info_dict': { + 'id': '1311', + 'title': 'Curso de Next.js', + }, + 'playlist_count': 22, + }, { + 'url': 'https://courses.platzi.com/classes/communication-codestream/', + 'info_dict': { + 'id': '1367', + 'title': 'Codestream Course', + }, + 'playlist_count': 14, + }] + + @classmethod + def suitable(cls, url): + return False if PlatziIE.suitable(url) else super(PlatziCourseIE, cls).suitable(url) + + def _real_extract(self, url): + course_name = self._match_id(url) + + webpage = self._download_webpage(url, course_name) + + props = self._parse_json( + self._search_regex(r'data\s*=\s*({.+?})\s*;', webpage, 'data'), + course_name)['initialProps'] + + entries = [] + for chapter_num, chapter in enumerate(props['concepts'], 1): + if not isinstance(chapter, dict): + continue + materials = chapter.get('materials') + if not materials or not isinstance(materials, list): + continue + chapter_title = chapter.get('title') + chapter_id = str_or_none(chapter.get('id')) + for material in materials: + if not isinstance(material, dict): + continue + if material.get('material_type') != 'video': + continue + video_url = urljoin(url, material.get('url')) + if not video_url: + continue + entries.append({ + '_type': 'url_transparent', + 'url': video_url, + 'title': str_or_none(material.get('name')), + 'id': str_or_none(material.get('id')), + 'ie_key': PlatziIE.ie_key(), + 'chapter': chapter_title, + 'chapter_number': chapter_num, + 'chapter_id': chapter_id, + }) + + course_id = compat_str(try_get(props, lambda x: x['course']['id'])) + course_title = try_get(props, lambda x: 
x['course']['name'], compat_str) + + return self.playlist_result(entries, course_id, course_title) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/played.py youtube-dl-2019.05.20/youtube_dl/extractor/played.py --- youtube-dl-2016.02.22/youtube_dl/extractor/played.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/played.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,60 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re -import os.path - -from .common import InfoExtractor -from ..compat import compat_urllib_parse -from ..utils import ( - ExtractorError, - sanitized_Request, -) - - -class PlayedIE(InfoExtractor): - IE_NAME = 'played.to' - _VALID_URL = r'https?://(?:www\.)?played\.to/(?P<id>[a-zA-Z0-9_-]+)' - - _TEST = { - 'url': 'http://played.to/j2f2sfiiukgt', - 'md5': 'c2bd75a368e82980e7257bf500c00637', - 'info_dict': { - 'id': 'j2f2sfiiukgt', - 'ext': 'flv', - 'title': 'youtube-dl_test_video.mp4', - }, - 'skip': 'Removed for copyright infringement.', # oh wow - } - - def _real_extract(self, url): - video_id = self._match_id(url) - orig_webpage = self._download_webpage(url, video_id) - - m_error = re.search( - r'(?s)Reason for deletion:.*?<b class="err"[^>]*>(?P<msg>[^<]+)</b>', orig_webpage) - if m_error: - raise ExtractorError(m_error.group('msg'), expected=True) - - data = self._hidden_inputs(orig_webpage) - - self._sleep(2, video_id) - - post = compat_urllib_parse.urlencode(data) - headers = { - b'Content-Type': b'application/x-www-form-urlencoded', - } - req = sanitized_Request(url, post, headers) - webpage = self._download_webpage( - req, video_id, note='Downloading video page ...') - - title = os.path.splitext(data['fname'])[0] - - video_url = self._search_regex( - r'file: "?(.+?)",', webpage, 'video URL') - - return { - 'id': video_id, - 'title': title, - 'url': video_url, - } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/playplustv.py youtube-dl-2019.05.20/youtube_dl/extractor/playplustv.py --- youtube-dl-2016.02.22/youtube_dl/extractor/playplustv.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/playplustv.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,109 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( + clean_html, + ExtractorError, + int_or_none, + PUTRequest, +) + + +class PlayPlusTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?playplus\.(?:com|tv)/VOD/(?P<project_id>[0-9]+)/(?P<id>[0-9a-f]{32})' + _TEST = { + 'url': 'https://www.playplus.tv/VOD/7572/db8d274a5163424e967f35a30ddafb8e', + 'md5': 'd078cb89d7ab6b9df37ce23c647aef72', + 'info_dict': { + 'id': 'db8d274a5163424e967f35a30ddafb8e', + 'ext': 'mp4', + 'title': 'Capítulo 179 - Final', + 'description': 'md5:01085d62d8033a1e34121d3c3cabc838', + 'timestamp': 1529992740, + 'upload_date': '20180626', + }, + 'skip': 'Requires account credential', + } + _NETRC_MACHINE = 'playplustv' + _GEO_COUNTRIES = ['BR'] + _token = None + _profile_id = None + + def _call_api(self, resource, video_id=None, query=None): + return self._download_json('https://api.playplus.tv/api/media/v2/get' + resource, video_id, headers={ + 'Authorization': 'Bearer ' + self._token, + }, query=query) + + def _real_initialize(self): + email, password = self._get_login_info() + if email is None: + self.raise_login_required() + + req = PUTRequest( + 'https://api.playplus.tv/api/web/login', json.dumps({ + 
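+ # The login call sends the credentials as a JSON body over HTTP PUT (hence
+ # PUTRequest); on success the response JSON is expected to expose the bearer
+ # token that _call_api() later passes in the Authorization header.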
'email': email, + 'password': password, + }).encode(), { + 'Content-Type': 'application/json; charset=utf-8', + }) + + try: + self._token = self._download_json(req, None)['token'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + raise ExtractorError(self._parse_json( + e.cause.read(), None)['errorMessage'], expected=True) + raise + + self._profile = self._call_api('Profiles')['list'][0]['_id'] + + def _real_extract(self, url): + project_id, media_id = re.match(self._VALID_URL, url).groups() + media = self._call_api( + 'Media', media_id, { + 'profileId': self._profile, + 'projectId': project_id, + 'mediaId': media_id, + })['obj'] + title = media['title'] + + formats = [] + for f in media.get('files', []): + f_url = f.get('url') + if not f_url: + continue + file_info = f.get('fileInfo') or {} + formats.append({ + 'url': f_url, + 'width': int_or_none(file_info.get('width')), + 'height': int_or_none(file_info.get('height')), + }) + self._sort_formats(formats) + + thumbnails = [] + for thumb in media.get('thumbs', []): + thumb_url = thumb.get('url') + if not thumb_url: + continue + thumbnails.append({ + 'url': thumb_url, + 'width': int_or_none(thumb.get('width')), + 'height': int_or_none(thumb.get('height')), + }) + + return { + 'id': media_id, + 'title': title, + 'formats': formats, + 'thumbnails': thumbnails, + 'description': clean_html(media.get('description')) or media.get('shortDescription'), + 'timestamp': int_or_none(media.get('publishDate'), 1000), + 'view_count': int_or_none(media.get('numberOfViews')), + 'comment_count': int_or_none(media.get('numberOfComments')), + 'tags': media.get('tags'), + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/plays.py youtube-dl-2019.05.20/youtube_dl/extractor/plays.py --- youtube-dl-2016.02.22/youtube_dl/extractor/plays.py 2016-02-12 20:57:05.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/plays.py 2019-06-07 18:49:07.000000000 +0000 @@ -8,30 +8,31 @@ class PlaysTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?plays\.tv/video/(?P<id>[0-9a-f]{18})' - _TEST = { - 'url': 'http://plays.tv/video/56af17f56c95335490/when-you-outplay-the-azir-wall', + _VALID_URL = r'https?://(?:www\.)?plays\.tv/(?:video|embeds)/(?P<id>[0-9a-f]{18})' + _TESTS = [{ + 'url': 'https://plays.tv/video/56af17f56c95335490/when-you-outplay-the-azir-wall', 'md5': 'dfeac1198506652b5257a62762cec7bc', 'info_dict': { 'id': '56af17f56c95335490', 'ext': 'mp4', - 'title': 'When you outplay the Azir wall', + 'title': 'Bjergsen - When you outplay the Azir wall', 'description': 'Posted by Bjergsen', } - } + }, { + 'url': 'https://plays.tv/embeds/56af17f56c95335490', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage( + 'https://plays.tv/video/%s' % video_id, video_id) + + info = self._search_json_ld(webpage, video_id,) - title = self._og_search_title(webpage) - content = self._parse_json( - self._search_regex( - r'R\.bindContent\(({.+?})\);', webpage, - 'content'), video_id)['content'] mpd_url, sources = re.search( r'(?s)<video[^>]+data-mpd="([^"]+)"[^>]*>(.+?)</video>', - content).groups() + webpage).groups() formats = self._extract_mpd_formats( self._proto_relative_url(mpd_url), video_id, mpd_id='DASH') for format_id, height, format_url in re.findall(r'<source\s+res="((\d+)h?)"\s+src="([^"]+)"', sources): @@ -42,10 +43,11 @@ }) self._sort_formats(formats) - return { + info.update({ 'id': 
video_id, - 'title': title, 'description': self._og_search_description(webpage), - 'thumbnail': self._og_search_thumbnail(webpage), + 'thumbnail': info.get('thumbnail') or self._og_search_thumbnail(webpage), 'formats': formats, - } + }) + + return info diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/playtvak.py youtube-dl-2019.05.20/youtube_dl/extractor/playtvak.py --- youtube-dl-2016.02.22/youtube_dl/extractor/playtvak.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/playtvak.py 2019-06-07 18:49:07.000000000 +0000 @@ -4,7 +4,7 @@ from .common import InfoExtractor from ..compat import ( compat_urlparse, - compat_urllib_parse, + compat_urllib_parse_urlencode, ) from ..utils import ( ExtractorError, @@ -24,8 +24,8 @@ 'id': 'A150730_150323_hodinovy-manzel_kuko', 'ext': 'mp4', 'title': 'Vyžeňte vosy a sršně ze zahrady', - 'description': 'md5:f93d398691044d303bc4a3de62f3e976', - 'thumbnail': 're:(?i)^https?://.*\.(?:jpg|png)$', + 'description': 'md5:4436e61b7df227a093778efb7e373571', + 'thumbnail': r're:(?i)^https?://.*\.(?:jpg|png)$', 'duration': 279, 'timestamp': 1438732860, 'upload_date': '20150805', @@ -36,9 +36,19 @@ 'info_dict': { 'id': 'A150624_164934_planespotting_cat', 'ext': 'flv', - 'title': 're:^Přímý přenos iDNES.cz [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'title': 're:^Planespotting [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'description': 'Sledujte provoz na ranveji Letiště Václava Havla v Praze', - 'thumbnail': 're:(?i)^https?://.*\.(?:jpg|png)$', + 'is_live': True, + }, + 'params': { + 'skip_download': True, # requires rtmpdump + }, + }, { # another live stream, this one without Misc.videoFLV + 'url': 'https://slowtv.playtvak.cz/zive-sledujte-vlaky-v-primem-prenosu-dwi-/hlavni-nadrazi.aspx?c=A151218_145728_hlavni-nadrazi_plap', + 'info_dict': { + 'id': 'A151218_145728_hlavni-nadrazi_plap', + 'ext': 'flv', + 'title': 're:^Hlavní nádraží [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'is_live': True, }, 'params': { @@ -52,7 +62,7 @@ 'ext': 'mp4', 'title': 'Zavřeli jsme mraženou pizzu do auta. Upekla se', 'description': 'md5:01e73f02329e2e5760bd5eed4d42e3c2', - 'thumbnail': 're:(?i)^https?://.*\.(?:jpg|png)$', + 'thumbnail': r're:(?i)^https?://.*\.(?:jpg|png)$', 'duration': 39, 'timestamp': 1438969140, 'upload_date': '20150807', @@ -66,7 +76,7 @@ 'ext': 'mp4', 'title': 'Táhni! 
Demonstrace proti imigrantům budila emoce', 'description': 'md5:97c81d589a9491fbfa323c9fa3cca72c', - 'thumbnail': 're:(?i)^https?://.*\.(?:jpg|png)$', + 'thumbnail': r're:(?i)^https?://.*\.(?:jpg|png)$', 'timestamp': 1439052180, 'upload_date': '20150808', 'is_live': False, @@ -79,7 +89,7 @@ 'ext': 'mp4', 'title': 'Recesisté udělali z billboardu kolotoč', 'description': 'md5:7369926049588c3989a66c9c1a043c4c', - 'thumbnail': 're:(?i)^https?://.*\.(?:jpg|png)$', + 'thumbnail': r're:(?i)^https?://.*\.(?:jpg|png)$', 'timestamp': 1415725500, 'upload_date': '20141111', 'is_live': False, @@ -95,7 +105,7 @@ webpage = self._download_webpage(url, video_id) info_url = self._html_search_regex( - r'Misc\.videoFLV\(\s*{\s*data\s*:\s*"([^"]+)"', webpage, 'info url') + r'Misc\.video(?:FLV)?\(\s*{\s*data\s*:\s*"([^"]+)"', webpage, 'info url') parsed_url = compat_urlparse.urlparse(info_url) @@ -106,7 +116,7 @@ }) info_url = compat_urlparse.urlunparse( - parsed_url._replace(query=compat_urllib_parse.urlencode(qs, True))) + parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True))) json_info = self._download_json( info_url, video_id, @@ -160,7 +170,7 @@ if is_live: title = self._live_title(title) description = self._og_search_description(webpage, default=None) or self._html_search_meta( - 'description', webpage, 'description') + 'description', webpage, 'description', default=None) timestamp = None duration = None if not is_live: diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/playvid.py youtube-dl-2019.05.20/youtube_dl/extractor/playvid.py --- youtube-dl-2016.02.22/youtube_dl/extractor/playvid.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/playvid.py 2019-06-07 18:49:07.000000000 +0000 @@ -14,8 +14,8 @@ class PlayvidIE(InfoExtractor): - _VALID_URL = r'https?://www\.playvid\.com/watch(\?v=|/)(?P<id>.+?)(?:#|$)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?playvid\.com/watch(\?v=|/)(?P<id>.+?)(?:#|$)' + _TESTS = [{ 'url': 'http://www.playvid.com/watch/RnmBNgtrrJu', 'md5': 'ffa2f6b2119af359f544388d8c01eb6c', 'info_dict': { @@ -24,8 +24,19 @@ 'title': 'md5:9256d01c6317e3f703848b5906880dc8', 'duration': 82, 'age_limit': 18, - } - } + }, + 'skip': 'Video removed due to ToS', + }, { + 'url': 'http://www.playvid.com/watch/hwb0GpNkzgH', + 'md5': '39d49df503ad7b8f23a4432cbf046477', + 'info_dict': { + 'id': 'hwb0GpNkzgH', + 'ext': 'mp4', + 'title': 'Ellen Euro Cutie Blond Takes a Sexy Survey Get Facial in The Park', + 'age_limit': 18, + 'thumbnail': r're:^https?://.*\.jpg$', + }, + }] def _real_extract(self, url): video_id = self._match_id(url) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/playwire.py youtube-dl-2019.05.20/youtube_dl/extractor/playwire.py --- youtube-dl-2016.02.22/youtube_dl/extractor/playwire.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/playwire.py 2019-06-07 18:49:07.000000000 +0000 @@ -4,9 +4,8 @@ from .common import InfoExtractor from ..utils import ( - xpath_text, + dict_get, float_or_none, - int_or_none, ) @@ -19,10 +18,23 @@ 'id': '3353705', 'ext': 'mp4', 'title': 'S04_RM_UCL_Rus', - 'thumbnail': 're:^https?://.*\.png$', + 'thumbnail': r're:^https?://.*\.png$', 'duration': 145.94, }, }, { + # m3u8 in f4m + 'url': 'http://config.playwire.com/21772/videos/v2/4840492/zeus.json', + 'info_dict': { + 'id': '4840492', + 'ext': 'mp4', + 'title': 'ITV EL SHOW FULL', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # Multiple resolutions while bitrates missing 'url': 
'http://cdn.playwire.com/11625/embed/85228.html', 'only_matching': True, }, { @@ -48,25 +60,10 @@ thumbnail = content.get('poster') src = content['media']['f4m'] - f4m = self._download_xml(src, video_id) - base_url = xpath_text(f4m, './{http://ns.adobe.com/f4m/1.0}baseURL', 'base url', fatal=True) - formats = [] - for media in f4m.findall('./{http://ns.adobe.com/f4m/1.0}media'): - media_url = media.get('url') - if not media_url: - continue - tbr = int_or_none(media.get('bitrate')) - width = int_or_none(media.get('width')) - height = int_or_none(media.get('height')) - f = { - 'url': '%s/%s' % (base_url, media.attrib['url']), - 'tbr': tbr, - 'width': width, - 'height': height, - } - if not (tbr or width or height): - f['quality'] = 1 if '-hd.' in media_url else 0 - formats.append(f) + formats = self._extract_f4m_formats(src, video_id, m3u8_id='hls') + for a_format in formats: + if not dict_get(a_format, ['tbr', 'width', 'height']): + a_format['quality'] = 1 if '-hd.' in a_format['url'] else 0 self._sort_formats(formats) return { diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/pluralsight.py youtube-dl-2019.05.20/youtube_dl/extractor/pluralsight.py --- youtube-dl-2016.02.22/youtube_dl/extractor/pluralsight.py 2016-01-14 09:35:12.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/pluralsight.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,32 +1,118 @@ from __future__ import unicode_literals -import re +import collections import json +import os import random -import collections +import re from .common import InfoExtractor from ..compat import ( compat_str, - compat_urllib_parse, compat_urlparse, ) from ..utils import ( + dict_get, ExtractorError, + float_or_none, int_or_none, parse_duration, qualities, - sanitized_Request, + srt_subtitles_timecode, + try_get, + update_url_query, + urlencode_postdata, ) class PluralsightBaseIE(InfoExtractor): - _API_BASE = 'http://app.pluralsight.com' + _API_BASE = 'https://app.pluralsight.com' + + _GRAPHQL_EP = '%s/player/api/graphql' % _API_BASE + _GRAPHQL_HEADERS = { + 'Content-Type': 'application/json;charset=UTF-8', + } + _GRAPHQL_COURSE_TMPL = ''' +query BootstrapPlayer { + rpc { + bootstrapPlayer { + profile { + firstName + lastName + email + username + userHandle + authed + isAuthed + plan + } + course(courseId: "%s") { + name + title + courseHasCaptions + translationLanguages { + code + name + } + supportsWideScreenVideoFormats + timestamp + modules { + name + title + duration + formattedDuration + author + authorized + clips { + authorized + clipId + duration + formattedDuration + id + index + moduleIndex + moduleTitle + name + title + watched + } + } + } + } + } +}''' + + def _download_course(self, course_id, url, display_id): + try: + return self._download_course_rpc(course_id, url, display_id) + except ExtractorError: + # Old API fallback + return self._download_json( + 'https://app.pluralsight.com/player/user/api/v1/player/payload', + display_id, data=urlencode_postdata({'courseId': course_id}), + headers={'Referer': url}) + + def _download_course_rpc(self, course_id, url, display_id): + response = self._download_json( + self._GRAPHQL_EP, display_id, data=json.dumps({ + 'query': self._GRAPHQL_COURSE_TMPL % course_id, + 'variables': {} + }).encode('utf-8'), headers=self._GRAPHQL_HEADERS) + + course = try_get( + response, lambda x: x['data']['rpc']['bootstrapPlayer']['course'], + dict) + if course: + return course + + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, response['error']['message']), + expected=True) class 
PluralsightIE(PluralsightBaseIE): IE_NAME = 'pluralsight' - _VALID_URL = r'https?://(?:(?:www|app)\.)?pluralsight\.com/training/player\?' + _VALID_URL = r'https?://(?:(?:www|app)\.)?pluralsight\.com/(?:training/)?player\?' _LOGIN_URL = 'https://app.pluralsight.com/id/' _NETRC_MACHINE = 'pluralsight' @@ -37,7 +123,7 @@ 'info_dict': { 'id': 'hosting-sql-server-windows-azure-iaas-m7-mgmt-04', 'ext': 'mp4', - 'title': 'Management of SQL Server - Demo Monitoring', + 'title': 'Demo Monitoring', 'duration': 338, }, 'skip': 'Requires pluralsight account credentials', @@ -48,13 +134,38 @@ # available without pluralsight account 'url': 'http://app.pluralsight.com/training/player?author=scott-allen&name=angularjs-get-started-m1-introduction&mode=live&clip=0&course=angularjs-get-started', 'only_matching': True, + }, { + 'url': 'https://app.pluralsight.com/player?course=ccna-intro-networking&author=ross-bagurdes&name=ccna-intro-networking-m06&clip=0', + 'only_matching': True, }] + GRAPHQL_VIEWCLIP_TMPL = ''' +query viewClip { + viewClip(input: { + author: "%(author)s", + clipIndex: %(clipIndex)d, + courseName: "%(courseName)s", + includeCaptions: %(includeCaptions)s, + locale: "%(locale)s", + mediaType: "%(mediaType)s", + moduleName: "%(moduleName)s", + quality: "%(quality)s" + }) { + urls { + url + cdn + rank + source + }, + status + } +}''' + def _real_initialize(self): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return @@ -64,8 +175,8 @@ login_form = self._hidden_inputs(login_page) login_form.update({ - 'Username': username.encode('utf-8'), - 'Password': password.encode('utf-8'), + 'Username': username, + 'Password': password, }) post_url = self._search_regex( @@ -75,12 +186,10 @@ if not post_url.startswith('http'): post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) - request = sanitized_Request( - post_url, compat_urllib_parse.urlencode(login_form).encode('utf-8')) - request.add_header('Content-Type', 'application/x-www-form-urlencoded') - response = self._download_webpage( - request, None, 'Logging in as %s' % username) + post_url, None, 'Logging in', + data=urlencode_postdata(login_form), + headers={'Content-Type': 'application/x-www-form-urlencoded'}) error = self._search_regex( r'<span[^>]+class="field-validation-error"[^>]*>([^<]+)</span>', @@ -88,56 +197,118 @@ if error: raise ExtractorError('Unable to login: %s' % error, expected=True) - if all(p not in response for p in ('__INITIAL_STATE__', '"currentUser"')): + if all(not re.search(p, response) for p in ( + r'__INITIAL_STATE__', r'["\']currentUser["\']', + # new layout? + r'>\s*Sign out\s*<')): + BLOCKED = 'Your account has been blocked due to suspicious activity' + if BLOCKED in response: + raise ExtractorError( + 'Unable to login: %s' % BLOCKED, expected=True) + MUST_AGREE = 'To continue using Pluralsight, you must agree to' + if any(p in response for p in (MUST_AGREE, '>Disagree<', '>Agree<')): + raise ExtractorError( + 'Unable to login: %s some documents. Go to pluralsight.com, ' + 'log in and agree with what Pluralsight requires.' 
+ % MUST_AGREE, expected=True) + raise ExtractorError('Unable to log in') + def _get_subtitles(self, author, clip_idx, clip_id, lang, name, duration, video_id): + captions = None + if clip_id: + captions = self._download_json( + '%s/transcript/api/v1/caption/json/%s/%s' + % (self._API_BASE, clip_id, lang), video_id, + 'Downloading captions JSON', 'Unable to download captions JSON', + fatal=False) + if not captions: + captions_post = { + 'a': author, + 'cn': int(clip_idx), + 'lc': lang, + 'm': name, + } + captions = self._download_json( + '%s/player/retrieve-captions' % self._API_BASE, video_id, + 'Downloading captions JSON', 'Unable to download captions JSON', + fatal=False, data=json.dumps(captions_post).encode('utf-8'), + headers={'Content-Type': 'application/json;charset=utf-8'}) + if captions: + return { + lang: [{ + 'ext': 'json', + 'data': json.dumps(captions), + }, { + 'ext': 'srt', + 'data': self._convert_subtitles(duration, captions), + }] + } + + @staticmethod + def _convert_subtitles(duration, subs): + srt = '' + TIME_OFFSET_KEYS = ('displayTimeOffset', 'DisplayTimeOffset') + TEXT_KEYS = ('text', 'Text') + for num, current in enumerate(subs): + current = subs[num] + start, text = ( + float_or_none(dict_get(current, TIME_OFFSET_KEYS, skip_false_values=False)), + dict_get(current, TEXT_KEYS)) + if start is None or text is None: + continue + end = duration if num == len(subs) - 1 else float_or_none( + dict_get(subs[num + 1], TIME_OFFSET_KEYS, skip_false_values=False)) + if end is None: + continue + srt += os.linesep.join( + ( + '%d' % num, + '%s --> %s' % ( + srt_subtitles_timecode(start), + srt_subtitles_timecode(end)), + text, + os.linesep, + )) + return srt + def _real_extract(self, url): qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) author = qs.get('author', [None])[0] name = qs.get('name', [None])[0] - clip_id = qs.get('clip', [None])[0] - course = qs.get('course', [None])[0] + clip_idx = qs.get('clip', [None])[0] + course_name = qs.get('course', [None])[0] - if any(not f for f in (author, name, clip_id, course,)): + if any(not f for f in (author, name, clip_idx, course_name,)): raise ExtractorError('Invalid URL', expected=True) - display_id = '%s-%s' % (name, clip_id) - - webpage = self._download_webpage(url, display_id) + display_id = '%s-%s' % (name, clip_idx) - modules = self._search_regex( - r'moduleCollection\s*:\s*new\s+ModuleCollection\((\[.+?\])\s*,\s*\$rootScope\)', - webpage, 'modules', default=None) + course = self._download_course(course_name, url, display_id) - if modules: - collection = self._parse_json(modules, display_id) - else: - # Webpage may be served in different layout (see - # https://github.com/rg3/youtube-dl/issues/7607) - collection = self._parse_json( - self._search_regex( - r'var\s+initialState\s*=\s*({.+?});\n', webpage, 'initial state'), - display_id)['course']['modules'] + collection = course['modules'] - module, clip = None, None + clip = None for module_ in collection: if name in (module_.get('moduleName'), module_.get('name')): - module = module_ for clip_ in module_.get('clips', []): clip_index = clip_.get('clipIndex') if clip_index is None: clip_index = clip_.get('index') if clip_index is None: continue - if compat_str(clip_index) == clip_id: + if compat_str(clip_index) == clip_idx: clip = clip_ break if not clip: raise ExtractorError('Unable to resolve clip') + title = clip['title'] + clip_id = clip.get('clipName') or clip.get('name') or clip['clipId'] + QUALITIES = { 'low': {'width': 640, 'height': 480}, 'medium': 
{'width': 848, 'height': 640}, @@ -156,9 +327,8 @@ ) # Some courses also offer widescreen resolution for high quality (see - # https://github.com/rg3/youtube-dl/issues/7766) - widescreen = True if re.search( - r'courseSupportsWidescreenVideoFormats\s*:\s*true', webpage) else False + # https://github.com/ytdl-org/youtube-dl/issues/7766) + widescreen = course.get('supportsWideScreenVideoFormats') is True best_quality = 'high-widescreen' if widescreen else 'high' if widescreen: for allowed_quality in ALLOWED_QUALITIES: @@ -175,6 +345,7 @@ req_format_split = req_format.split('-', 1) if len(req_format_split) > 1: req_ext, req_quality = req_format_split + req_quality = '-'.join(req_quality.split('-')[:2]) for allowed_quality in ALLOWED_QUALITIES: if req_ext == allowed_quality.ext and req_quality in allowed_quality.qualities: return (AllowedQuality(req_ext, (req_quality, )), ) @@ -187,56 +358,82 @@ for quality in qualities_: f = QUALITIES[quality].copy() clip_post = { - 'a': author, - 'cap': 'false', - 'cn': clip_id, - 'course': course, - 'lc': 'en', - 'm': name, - 'mt': ext, - 'q': '%dx%d' % (f['width'], f['height']), + 'author': author, + 'includeCaptions': 'false', + 'clipIndex': int(clip_idx), + 'courseName': course_name, + 'locale': 'en', + 'moduleName': name, + 'mediaType': ext, + 'quality': '%dx%d' % (f['width'], f['height']), } - request = sanitized_Request( - '%s/training/Player/ViewClip' % self._API_BASE, - json.dumps(clip_post).encode('utf-8')) - request.add_header('Content-Type', 'application/json;charset=utf-8') format_id = '%s-%s' % (ext, quality) - clip_url = self._download_webpage( - request, display_id, 'Downloading %s URL' % format_id, fatal=False) + + try: + viewclip = self._download_json( + self._GRAPHQL_EP, display_id, + 'Downloading %s viewclip graphql' % format_id, + data=json.dumps({ + 'query': self.GRAPHQL_VIEWCLIP_TMPL % clip_post, + 'variables': {} + }).encode('utf-8'), + headers=self._GRAPHQL_HEADERS)['data']['viewClip'] + except ExtractorError: + # Still works but most likely will go soon + viewclip = self._download_json( + '%s/video/clips/viewclip' % self._API_BASE, display_id, + 'Downloading %s viewclip JSON' % format_id, fatal=False, + data=json.dumps(clip_post).encode('utf-8'), + headers={'Content-Type': 'application/json;charset=utf-8'}) # Pluralsight tracks multiple sequential calls to ViewClip API and start # to return 429 HTTP errors after some time (see - # https://github.com/rg3/youtube-dl/pull/6989). Moreover it may even lead - # to account ban (see https://github.com/rg3/youtube-dl/issues/6842). + # https://github.com/ytdl-org/youtube-dl/pull/6989). Moreover it may even lead + # to account ban (see https://github.com/ytdl-org/youtube-dl/issues/6842). # To somewhat reduce the probability of these consequences # we will sleep random amount of time before each call to ViewClip. 
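+ # Back-of-envelope: random.randint(2, 5) waits 3.5 seconds on average, so
+ # requesting N format variants of a clip adds roughly 3.5*N seconds of idle
+ # time before any download starts.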
self._sleep( random.randint(2, 5), display_id, '%(video_id)s: Waiting for %(timeout)s seconds to avoid throttling') - if not clip_url: + if not viewclip: continue - f.update({ - 'url': clip_url, - 'ext': ext, - 'format_id': format_id, - 'quality': quality_key(quality), - }) - formats.append(f) + + clip_urls = viewclip.get('urls') + if not isinstance(clip_urls, list): + continue + + for clip_url_data in clip_urls: + clip_url = clip_url_data.get('url') + if not clip_url: + continue + cdn = clip_url_data.get('cdn') + clip_f = f.copy() + clip_f.update({ + 'url': clip_url, + 'ext': ext, + 'format_id': '%s-%s' % (format_id, cdn) if cdn else format_id, + 'quality': quality_key(quality), + 'source_preference': int_or_none(clip_url_data.get('rank')), + }) + formats.append(clip_f) + self._sort_formats(formats) - # TODO: captions - # http://www.pluralsight.com/training/Player/ViewClip + cap = true - # or - # http://www.pluralsight.com/training/Player/Captions - # { a = author, cn = clip_id, lc = end, m = name } + duration = int_or_none( + clip.get('duration')) or parse_duration(clip.get('formattedDuration')) + + # TODO: other languages? + subtitles = self.extract_subtitles( + author, clip_idx, clip.get('clipId'), 'en', name, duration, display_id) return { - 'id': clip.get('clipName') or clip['name'], - 'title': '%s - %s' % (module['title'], clip['title']), - 'duration': int_or_none(clip.get('duration')) or parse_duration(clip.get('formattedDuration')), + 'id': clip_id, + 'title': title, + 'duration': duration, 'creator': author, - 'formats': formats + 'formats': formats, + 'subtitles': subtitles, } @@ -267,25 +464,38 @@ # TODO: PSM cookie - course = self._download_json( - '%s/data/course/%s' % (self._API_BASE, course_id), - course_id, 'Downloading course JSON') + course = self._download_course(course_id, url, course_id) title = course['title'] + course_name = course['name'] + course_data = course['modules'] description = course.get('description') or course.get('shortDescription') - course_data = self._download_json( - '%s/data/course/content/%s' % (self._API_BASE, course_id), - course_id, 'Downloading course data JSON') - entries = [] - for module in course_data: + for num, module in enumerate(course_data, 1): + author = module.get('author') + module_name = module.get('name') + if not author or not module_name: + continue for clip in module.get('clips', []): - player_parameters = clip.get('playerParameters') - if not player_parameters: + clip_index = int_or_none(clip.get('index')) + if clip_index is None: continue - entries.append(self.url_result( - '%s/training/player?%s' % (self._API_BASE, player_parameters), - 'Pluralsight')) + clip_url = update_url_query( + '%s/player' % self._API_BASE, query={ + 'mode': 'live', + 'course': course_name, + 'author': author, + 'name': module_name, + 'clip': clip_index, + }) + entries.append({ + '_type': 'url_transparent', + 'url': clip_url, + 'ie_key': PluralsightIE.ie_key(), + 'chapter': module.get('title'), + 'chapter_number': num, + 'chapter_id': module.get('moduleRef'), + }) return self.playlist_result(entries, course_id, title, description) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/podomatic.py youtube-dl-2019.05.20/youtube_dl/extractor/podomatic.py --- youtube-dl-2016.02.22/youtube_dl/extractor/podomatic.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/podomatic.py 2019-06-07 18:49:07.000000000 +0000 @@ -9,42 +9,49 @@ class PodomaticIE(InfoExtractor): IE_NAME = 'podomatic' - _VALID_URL = 
r'^(?P<proto>https?)://(?P<channel>[^.]+)\.podomatic\.com/entry/(?P<id>[^?]+)' + _VALID_URL = r'''(?x) + (?P<proto>https?):// + (?: + (?P<channel>[^.]+)\.podomatic\.com/entry| + (?:www\.)?podomatic\.com/podcasts/(?P<channel_2>[^/]+)/episodes + )/ + (?P<id>[^/?#&]+) + ''' - _TESTS = [ - { - 'url': 'http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00', - 'md5': '84bb855fcf3429e6bf72460e1eed782d', - 'info_dict': { - 'id': '2009-01-02T16_03_35-08_00', - 'ext': 'mp3', - 'uploader': 'Science Teaching Tips', - 'uploader_id': 'scienceteachingtips', - 'title': '64. When the Moon Hits Your Eye', - 'duration': 446, - } - }, - { - 'url': 'http://ostbahnhof.podomatic.com/entry/2013-11-15T16_31_21-08_00', - 'md5': 'd2cf443931b6148e27638650e2638297', - 'info_dict': { - 'id': '2013-11-15T16_31_21-08_00', - 'ext': 'mp3', - 'uploader': 'Ostbahnhof / Techno Mix', - 'uploader_id': 'ostbahnhof', - 'title': 'Einunddreizig', - 'duration': 3799, - } - }, - ] + _TESTS = [{ + 'url': 'http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00', + 'md5': '84bb855fcf3429e6bf72460e1eed782d', + 'info_dict': { + 'id': '2009-01-02T16_03_35-08_00', + 'ext': 'mp3', + 'uploader': 'Science Teaching Tips', + 'uploader_id': 'scienceteachingtips', + 'title': '64. When the Moon Hits Your Eye', + 'duration': 446, + } + }, { + 'url': 'http://ostbahnhof.podomatic.com/entry/2013-11-15T16_31_21-08_00', + 'md5': 'd2cf443931b6148e27638650e2638297', + 'info_dict': { + 'id': '2013-11-15T16_31_21-08_00', + 'ext': 'mp3', + 'uploader': 'Ostbahnhof / Techno Mix', + 'uploader_id': 'ostbahnhof', + 'title': 'Einunddreizig', + 'duration': 3799, + } + }, { + 'url': 'https://www.podomatic.com/podcasts/scienceteachingtips/episodes/2009-01-02T16_03_35-08_00', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - channel = mobj.group('channel') + channel = mobj.group('channel') or mobj.group('channel_2') - json_url = (('%s://%s.podomatic.com/entry/embed_params/%s' + - '?permalink=true&rtmp=0') % + json_url = (('%s://%s.podomatic.com/entry/embed_params/%s' + + '?permalink=true&rtmp=0') % (mobj.group('proto'), channel, video_id)) data_json = self._download_webpage( json_url, video_id, 'Downloading video info') diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/pokemon.py youtube-dl-2019.05.20/youtube_dl/extractor/pokemon.py --- youtube-dl-2016.02.22/youtube_dl/extractor/pokemon.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/pokemon.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,75 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + int_or_none, +) + + +class PokemonIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?pokemon\.com/[a-z]{2}(?:.*?play=(?P<id>[a-z0-9]{32})|/(?:[^/]+/)+(?P<display_id>[^/?#&]+))' + _TESTS = [{ + 'url': 'https://www.pokemon.com/us/pokemon-episodes/20_30-the-ol-raise-and-switch/', + 'md5': '2fe8eaec69768b25ef898cda9c43062e', + 'info_dict': { + 'id': 'afe22e30f01c41f49d4f1d9eab5cd9a4', + 'ext': 'mp4', + 'title': 'The Ol’ Raise and Switch!', + 'description': 'md5:7db77f7107f98ba88401d3adc80ff7af', + 'timestamp': 1511824728, + 'upload_date': '20171127', + }, + 'add_id': ['LimelightMedia'], + }, { + # no data-video-title + 'url': 'https://www.pokemon.com/us/pokemon-episodes/pokemon-movies/pokemon-the-rise-of-darkrai-2008', + 'info_dict': { + 'id': 
'99f3bae270bf4e5097274817239ce9c8', + 'ext': 'mp4', + 'title': 'Pokémon: The Rise of Darkrai', + 'description': 'md5:ea8fbbf942e1e497d54b19025dd57d9d', + 'timestamp': 1417778347, + 'upload_date': '20141205', + }, + 'add_id': ['LimelightMedia'], + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.pokemon.com/uk/pokemon-episodes/?play=2e8b5c761f1d4a9286165d7748c1ece2', + 'only_matching': True, + }, { + 'url': 'http://www.pokemon.com/fr/episodes-pokemon/18_09-un-hiver-inattendu/', + 'only_matching': True, + }, { + 'url': 'http://www.pokemon.com/de/pokemon-folgen/01_20-bye-bye-smettbo/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id, display_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, video_id or display_id) + video_data = extract_attributes(self._search_regex( + r'(<[^>]+data-video-id="%s"[^>]*>)' % (video_id if video_id else '[a-z0-9]{32}'), + webpage, 'video data element')) + video_id = video_data['data-video-id'] + title = video_data.get('data-video-title') or self._html_search_meta( + 'pkm-title', webpage, ' title', default=None) or self._search_regex( + r'<h1[^>]+\bclass=["\']us-title[^>]+>([^<]+)', webpage, 'title') + return { + '_type': 'url_transparent', + 'id': video_id, + 'url': 'limelight:media:%s' % video_id, + 'title': title, + 'description': video_data.get('data-video-summary'), + 'thumbnail': video_data.get('data-video-poster'), + 'series': 'Pokémon', + 'season_number': int_or_none(video_data.get('data-video-season')), + 'episode': title, + 'episode_number': int_or_none(video_data.get('data-video-episode')), + 'ie_key': 'LimelightMedia', + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/polskieradio.py youtube-dl-2019.05.20/youtube_dl/extractor/polskieradio.py --- youtube-dl-2016.02.22/youtube_dl/extractor/polskieradio.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/polskieradio.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,180 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools +import re + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urllib_parse_unquote, + compat_urlparse +) +from ..utils import ( + extract_attributes, + int_or_none, + strip_or_none, + unified_timestamp, +) + + +class PolskieRadioIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+/\d+/Artykul/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943,Prof-Andrzej-Nowak-o-historii-nie-da-sie-myslec-beznamietnie', + 'info_dict': { + 'id': '1587943', + 'title': 'Prof. Andrzej Nowak: o historii nie da się myśleć beznamiętnie', + 'description': 'md5:12f954edbf3120c5e7075e17bf9fc5c5', + }, + 'playlist': [{ + 'md5': '2984ee6ce9046d91fc233bc1a864a09a', + 'info_dict': { + 'id': '1540576', + 'ext': 'mp3', + 'title': 'md5:d4623290d4ac983bf924061c75c23a0d', + 'timestamp': 1456594200, + 'upload_date': '20160227', + 'duration': 2364, + 'thumbnail': r're:^https?://static\.prsa\.pl/images/.*\.jpg$' + }, + }], + }, { + 'url': 'http://www.polskieradio.pl/265/5217/Artykul/1635803,Euro-2016-nie-ma-miejsca-na-blad-Polacy-graja-ze-Szwajcaria-o-cwiercfinal', + 'info_dict': { + 'id': '1635803', + 'title': 'Euro 2016: nie ma miejsca na błąd. 
Polacy grają ze Szwajcarią o ćwierćfinał', + 'description': 'md5:01cb7d0cad58664095d72b51a1ebada2', + }, + 'playlist_mincount': 12, + }, { + 'url': 'http://polskieradio.pl/9/305/Artykul/1632955,Bardzo-popularne-slowo-remis', + 'only_matching': True, + }, { + 'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943', + 'only_matching': True, + }, { + # with mp4 video + 'url': 'http://www.polskieradio.pl/9/299/Artykul/1634903,Brexit-Leszek-Miller-swiat-sie-nie-zawali-Europa-bedzie-trwac-dalej', + 'only_matching': True, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + content = self._search_regex( + r'(?s)<div[^>]+class="\s*this-article\s*"[^>]*>(.+?)<div[^>]+class="tags"[^>]*>', + webpage, 'content') + + timestamp = unified_timestamp(self._html_search_regex( + r'(?s)<span[^>]+id="datetime2"[^>]*>(.+?)</span>', + webpage, 'timestamp', fatal=False)) + + thumbnail_url = self._og_search_thumbnail(webpage) + + entries = [] + + media_urls = set() + + for data_media in re.findall(r'<[^>]+data-media=({[^>]+})', content): + media = self._parse_json(data_media, playlist_id, fatal=False) + if not media.get('file') or not media.get('desc'): + continue + media_url = self._proto_relative_url(media['file'], 'http:') + if media_url in media_urls: + continue + media_urls.add(media_url) + entries.append({ + 'id': compat_str(media['id']), + 'url': media_url, + 'title': compat_urllib_parse_unquote(media['desc']), + 'duration': int_or_none(media.get('length')), + 'vcodec': 'none' if media.get('provider') == 'audio' else None, + 'timestamp': timestamp, + 'thumbnail': thumbnail_url + }) + + title = self._og_search_title(webpage).strip() + description = strip_or_none(self._og_search_description(webpage)) + + return self.playlist_result(entries, playlist_id, title, description) + + +class PolskieRadioCategoryIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+(?:,[^/]+)?/(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://www.polskieradio.pl/7/5102,HISTORIA-ZYWA', + 'info_dict': { + 'id': '5102', + 'title': 'HISTORIA ŻYWA', + }, + 'playlist_mincount': 38, + }, { + 'url': 'http://www.polskieradio.pl/7/4807', + 'info_dict': { + 'id': '4807', + 'title': 'Vademecum 1050. 
rocznicy Chrztu Polski' + }, + 'playlist_mincount': 5 + }, { + 'url': 'http://www.polskieradio.pl/7/129,Sygnaly-dnia?ref=source', + 'only_matching': True + }, { + 'url': 'http://www.polskieradio.pl/37,RedakcjaKatolicka/4143,Kierunek-Krakow', + 'info_dict': { + 'id': '4143', + 'title': 'Kierunek Kraków', + }, + 'playlist_mincount': 61 + }, { + 'url': 'http://www.polskieradio.pl/10,czworka/214,muzyka', + 'info_dict': { + 'id': '214', + 'title': 'Muzyka', + }, + 'playlist_mincount': 61 + }, { + 'url': 'http://www.polskieradio.pl/7,Jedynka/5102,HISTORIA-ZYWA', + 'only_matching': True, + }, { + 'url': 'http://www.polskieradio.pl/8,Dwojka/196,Publicystyka', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if PolskieRadioIE.suitable(url) else super(PolskieRadioCategoryIE, cls).suitable(url) + + def _entries(self, url, page, category_id): + content = page + for page_num in itertools.count(2): + for a_entry, entry_id in re.findall( + r'(?s)<article[^>]+>.*?(<a[^>]+href=["\']/\d+/\d+/Artykul/(\d+)[^>]+>).*?</article>', + content): + entry = extract_attributes(a_entry) + href = entry.get('href') + if not href: + continue + yield self.url_result( + compat_urlparse.urljoin(url, href), PolskieRadioIE.ie_key(), + entry_id, entry.get('title')) + mobj = re.search( + r'<div[^>]+class=["\']next["\'][^>]*>\s*<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1', + content) + if not mobj: + break + next_url = compat_urlparse.urljoin(url, mobj.group('url')) + content = self._download_webpage( + next_url, category_id, 'Downloading page %s' % page_num) + + def _real_extract(self, url): + category_id = self._match_id(url) + webpage = self._download_webpage(url, category_id) + title = self._html_search_regex( + r'<title>([^<]+) - [^<]+ - [^<]+</title>', + webpage, 'title', fatal=False) + return self.playlist_result( + self._entries(url, webpage, category_id), + category_id, title) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/popcorntv.py youtube-dl-2019.05.20/youtube_dl/extractor/popcorntv.py --- youtube-dl-2016.02.22/youtube_dl/extractor/popcorntv.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/popcorntv.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,76 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + int_or_none, + unified_timestamp, +) + + +class PopcornTVIE(InfoExtractor): + _VALID_URL = r'https?://[^/]+\.popcorntv\.it/guarda/(?P<display_id>[^/]+)/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://animemanga.popcorntv.it/guarda/food-wars-battaglie-culinarie-episodio-01/9183', + 'md5': '47d65a48d147caf692ab8562fe630b45', + 'info_dict': { + 'id': '9183', + 'display_id': 'food-wars-battaglie-culinarie-episodio-01', + 'ext': 'mp4', + 'title': 'Food Wars, Battaglie Culinarie | Episodio 01', + 'description': 'md5:b8bea378faae4651d3b34c6e112463d0', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1497610857, + 'upload_date': '20170616', + 'duration': 1440, + 'view_count': int, + }, + }, { + 'url': 'https://cinema.popcorntv.it/guarda/smash-cut/10433', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id, video_id = mobj.group('display_id', 'id') + + webpage = self._download_webpage(url, display_id) + + m3u8_url = extract_attributes( + self._search_regex( + r'(<link[^>]+itemprop=["\'](?:content|embed)Url[^>]*>)', + webpage, 'content' + ))['href'] + + formats = self._extract_m3u8_formats( + m3u8_url, display_id, 'mp4', 
entry_protocol='m3u8_native', + m3u8_id='hls') + + title = self._search_regex( + r'<h1[^>]+itemprop=["\']name[^>]*>([^<]+)</h1>', webpage, + 'title', default=None) or self._og_search_title(webpage) + + description = self._html_search_regex( + r'(?s)<article[^>]+itemprop=["\']description[^>]*>(.+?)</article>', + webpage, 'description', fatal=False) + thumbnail = self._og_search_thumbnail(webpage) + timestamp = unified_timestamp(self._html_search_meta( + 'uploadDate', webpage, 'timestamp')) + duration = int_or_none(self._html_search_meta( + 'duration', webpage), invscale=60) + view_count = int_or_none(self._html_search_meta( + 'interactionCount', webpage, 'view count')) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'view_count': view_count, + 'formats': formats, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/porn91.py youtube-dl-2019.05.20/youtube_dl/extractor/porn91.py --- youtube-dl-2016.02.22/youtube_dl/extractor/porn91.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/porn91.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,7 +1,6 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals -from ..compat import compat_urllib_parse from .common import InfoExtractor from ..utils import ( parse_duration, @@ -16,7 +15,7 @@ _TEST = { 'url': 'http://91porn.com/view_video.php?viewkey=7e42283b4f5ab36da134', - 'md5': '6df8f6d028bc8b14f5dbd73af742fb20', + 'md5': '7fcdb5349354f40d41689bd0fa8db05a', 'info_dict': { 'id': '7e42283b4f5ab36da134', 'title': '18岁大一漂亮学妹,水嫩性感,再爽一次!', @@ -28,9 +27,10 @@ def _real_extract(self, url): video_id = self._match_id(url) - url = 'http://91porn.com/view_video.php?viewkey=%s' % video_id self._set_cookie('91porn.com', 'language', 'cn_CN') - webpage = self._download_webpage(url, video_id, 'get HTML content') + + webpage = self._download_webpage( + 'http://91porn.com/view_video.php?viewkey=%s' % video_id, video_id) if '作为游客,你每天只可观看10个视频' in webpage: raise ExtractorError('91 Porn says: Daily limit 10 videos exceeded', expected=True) @@ -39,23 +39,7 @@ r'
<div id="viewvideo-title">([^<]+)</div>
', webpage, 'title') title = title.replace('\n', '') - # get real url - file_id = self._search_regex( - r'so.addVariable\(\'file\',\'(\d+)\'', webpage, 'file id') - sec_code = self._search_regex( - r'so.addVariable\(\'seccode\',\'([^\']+)\'', webpage, 'sec code') - max_vid = self._search_regex( - r'so.addVariable\(\'max_vid\',\'(\d+)\'', webpage, 'max vid') - url_params = compat_urllib_parse.urlencode({ - 'VID': file_id, - 'mp4': '1', - 'seccode': sec_code, - 'max_vid': max_vid, - }) - info_cn = self._download_webpage( - 'http://91porn.com/getfile.php?' + url_params, video_id, - 'get real video url') - video_url = self._search_regex(r'file=([^&]+)&', info_cn, 'url') + info_dict = self._parse_html5_media_entries(url, webpage, video_id)[0] duration = parse_duration(self._search_regex( r'时长:\s*</span>\s*(\d+:\d+)', webpage, 'duration', fatal=False)) @@ -63,11 +47,12 @@ comment_count = int_or_none(self._search_regex( r'留言:\s*</span>\s*(\d+)', webpage, 'comment count', fatal=False)) - return { + info_dict.update({ 'id': video_id, 'title': title, - 'url': video_url, 'duration': duration, 'comment_count': comment_count, 'age_limit': self._rta_search(webpage), - } + }) + + return info_dict diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/porncom.py youtube-dl-2019.05.20/youtube_dl/extractor/porncom.py --- youtube-dl-2016.02.22/youtube_dl/extractor/porncom.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/porncom.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,103 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + int_or_none, + js_to_json, + parse_filesize, + str_to_int, +) + + +class PornComIE(InfoExtractor): + _VALID_URL = r'https?://(?:[a-zA-Z]+\.)?porn\.com/videos/(?:(?P<display_id>[^/]+)-)?(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://www.porn.com/videos/teen-grabs-a-dildo-and-fucks-her-pussy-live-on-1hottie-i-rec-2603339', + 'md5': '3f30ce76267533cd12ba999263156de7', + 'info_dict': { + 'id': '2603339', + 'display_id': 'teen-grabs-a-dildo-and-fucks-her-pussy-live-on-1hottie-i-rec', + 'ext': 'mp4', + 'title': 'Teen grabs a dildo and fucks her pussy live on 1hottie, I rec', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 551, + 'view_count': int, + 'age_limit': 18, + 'categories': list, + 'tags': list, + }, + }, { + 'url': 'http://se.porn.com/videos/marsha-may-rides-seth-on-top-of-his-thick-cock-2658067', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') or video_id + + webpage = self._download_webpage(url, display_id) + + config = self._parse_json( + self._search_regex( + (r'=\s*({.+?})\s*;\s*v1ar\b', + r'=\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*='), + webpage, 'config', default='{}'), + display_id, transform_source=js_to_json, fatal=False) + + if config: + title = config['title'] + formats = [{ + 'url': stream['url'], + 'format_id': stream.get('id'), + 'height': int_or_none(self._search_regex( + r'^(\d+)[pP]', stream.get('id') or '', 'height', default=None)) + } for stream in config['streams'] if stream.get('url')] + thumbnail = (compat_urlparse.urljoin( + config['thumbCDN'], config['poster']) + if config.get('thumbCDN') and config.get('poster') else None) + duration = int_or_none(config.get('length')) + else: + title = self._search_regex( + (r'<title>([^<]+)</title>', r'<h1[^>]*>([^<]+)</h1>'), + webpage, 'title') + formats = [{ + 'url': compat_urlparse.urljoin(url, format_url), + 
'format_id': '%sp' % height, + 'height': int(height), + 'filesize_approx': parse_filesize(filesize), + } for format_url, height, filesize in re.findall( + r'<a[^>]+href="(/download/[^"]+)">[^<]*?(\d+)p<span[^>]*>(\d+\s*[a-zA-Z]+)<', + webpage)] + thumbnail = None + duration = None + + self._sort_formats(formats) + + view_count = str_to_int(self._search_regex( + (r'Views:\s*</span>\s*<span>\s*([\d,.]+)', + r'class=["\']views["\'][^>]*>
<p>
([\d,.]+)'), webpage, + 'view count', fatal=False)) + + def extract_list(kind): + s = self._search_regex( + (r'(?s)%s:\s*</span>\s*<span>(.+?)</span>' % kind.capitalize(), + r'(?s)<p[^>]*>%s:(.+?)
</p>
' % kind.capitalize()), + webpage, kind, fatal=False) + return re.findall(r'<a[^>]+>([^<]+)</a>', s or '') + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'view_count': view_count, + 'formats': formats, + 'age_limit': 18, + 'categories': extract_list('categories'), + 'tags': extract_list('tags'), + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/pornflip.py youtube-dl-2019.05.20/youtube_dl/extractor/pornflip.py --- youtube-dl-2016.02.22/youtube_dl/extractor/pornflip.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/pornflip.py 2019-06-07 18:47:18.000000000 +0000 @@ -0,0 +1,101 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_str, +) +from ..utils import ( + int_or_none, + try_get, + unified_timestamp, +) + + +class PornFlipIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?pornflip\.com/(?:v|embed)/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.pornflip.com/v/wz7DfNhMmep', + 'md5': '98c46639849145ae1fd77af532a9278c', + 'info_dict': { + 'id': 'wz7DfNhMmep', + 'ext': 'mp4', + 'title': '2 Amateurs swallow make his dream cumshots true', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 112, + 'timestamp': 1481655502, + 'upload_date': '20161213', + 'uploader_id': '106786', + 'uploader': 'figifoto', + 'view_count': int, + 'age_limit': 18, + } + }, { + 'url': 'https://www.pornflip.com/embed/wz7DfNhMmep', + 'only_matching': True, + }, { + 'url': 'https://www.pornflip.com/v/EkRD6-vS2-s', + 'only_matching': True, + }, { + 'url': 'https://www.pornflip.com/embed/EkRD6-vS2-s', + 'only_matching': True, + }, { + 'url': 'https://www.pornflip.com/v/NG9q6Pb_iK8', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'https://www.pornflip.com/v/%s' % video_id, video_id) + + flashvars = compat_parse_qs(self._search_regex( + r'<embed[^>]+flashvars=(["\'])(?P<flashvars>(?:(?!\1).)+)\1', + webpage, 'flashvars', group='flashvars')) + + title = flashvars['video_vars[title]'][0] + + def flashvar(kind): + return try_get( + flashvars, lambda x: x['video_vars[%s]' % kind][0], compat_str) + + formats = [] + for key, value in flashvars.items(): + if not (value and isinstance(value, list)): + continue + format_url = value[0] + if key == 'video_vars[hds_manifest]': + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False)) + continue + height = self._search_regex( + r'video_vars\[video_urls\]\[(\d+)', key, 'height', default=None) + if not height: + continue + formats.append({ + 'url': format_url, + 'format_id': 'http-%s' % height, + 'height': int_or_none(height), + }) + self._sort_formats(formats) + + uploader = self._html_search_regex( + (r'<span[^>]+class="name"[^>]*>\s*<a[^>]+>\s*<strong>(?P<uploader>[^<]+)', + r'<meta[^>]+content=(["\'])[^>]*\buploaded by (?P<uploader>.+?)\1'), + webpage, 'uploader', fatal=False, group='uploader') + + return { + 'id': video_id, + 'formats': formats, + 'title': title, + 'thumbnail': flashvar('big_thumb'), + 'duration': int_or_none(flashvar('duration')), + 'timestamp': unified_timestamp(self._html_search_meta( + 'uploadDate', webpage, 'timestamp')), + 'uploader_id': flashvar('author_id'), + 'uploader': uploader, + 'view_count': int_or_none(flashvar('views')), + 'age_limit': 18, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/pornhd.py youtube-dl-2019.05.20/youtube_dl/extractor/pornhd.py --- 
youtube-dl-2016.02.22/youtube_dl/extractor/pornhd.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/pornhd.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,19 +1,35 @@ from __future__ import unicode_literals import re -import json from .common import InfoExtractor from ..utils import ( + determine_ext, + ExtractorError, int_or_none, js_to_json, - qualities, + urljoin, ) class PornHdIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<id>\d+)(?:/(?P<display_id>.+))?' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<id>\d+)(?:/(?P<display_id>.+))?' + _TESTS = [{ + 'url': 'http://www.pornhd.com/videos/9864/selfie-restroom-masturbation-fun-with-chubby-cutie-hd-porn-video', + 'md5': '87f1540746c1d32ec7a2305c12b96b25', + 'info_dict': { + 'id': '9864', + 'display_id': 'selfie-restroom-masturbation-fun-with-chubby-cutie-hd-porn-video', + 'ext': 'mp4', + 'title': 'Restroom selfie masturbation', + 'description': 'md5:3748420395e03e31ac96857a8f125b2b', + 'thumbnail': r're:^https?://.*\.jpg', + 'view_count': int, + 'like_count': int, + 'age_limit': 18, + } + }, { + # removed video 'url': 'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video', 'md5': '956b8ca569f7f4d8ec563e2c41598441', 'info_dict': { @@ -22,11 +38,13 @@ 'ext': 'mp4', 'title': 'Sierra loves doing laundry', 'description': 'md5:8ff0523848ac2b8f9b065ba781ccf294', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'view_count': int, + 'like_count': int, 'age_limit': 18, - } - } + }, + 'skip': 'Not available anymore', + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -38,28 +56,46 @@ title = self._html_search_regex( [r'<span[^>]+class=["\']video-name["\'][^>]*>([^<]+)', r'<title>(.+?) - .*?[Pp]ornHD.*?</title>'], webpage, 'title') - description = self._html_search_regex( - r'
<div class="description">([^<]+)</div>
', webpage, 'description', fatal=False) - view_count = int_or_none(self._html_search_regex( - r'(\d+) views\s*</span>
', webpage, 'view count', fatal=False)) - thumbnail = self._search_regex( - r"'poster'\s*:\s*'([^']+)'", webpage, 'thumbnail', fatal=False) - quality = qualities(['sd', 'hd']) - sources = json.loads(js_to_json(self._search_regex( - r"(?s)'sources'\s*:\s*(\{.+?\})\s*\}[;,)]", - webpage, 'sources'))) + sources = self._parse_json(js_to_json(self._search_regex( + r"(?s)sources'?\s*[:=]\s*(\{.+?\})", + webpage, 'sources', default='{}')), video_id) + + if not sources: + message = self._html_search_regex( + r'(?s)<(div|p)[^>]+class="no-video"[^>]*>(?P.+?)]+class="description"[^>]*>(?P[^<]+)(?:(?!\1).)+)\1", webpage, + 'thumbnail', fatal=False, group='url') + + like_count = int_or_none(self._search_regex( + (r'(\d+)\s*]+>(?: |\s)*\blikes', + r'class=["\']save-count["\'][^>]*>\s*(\d+)'), + webpage, 'like count', fatal=False)) + return { 'id': video_id, 'display_id': display_id, @@ -67,6 +103,7 @@ 'description': description, 'thumbnail': thumbnail, 'view_count': view_count, + 'like_count': like_count, 'formats': formats, 'age_limit': 18, } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/pornhub.py youtube-dl-2019.05.20/youtube_dl/extractor/pornhub.py --- youtube-dl-2016.02.22/youtube_dl/extractor/pornhub.py 2016-02-20 13:02:05.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/pornhub.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,27 +1,61 @@ +# coding: utf-8 from __future__ import unicode_literals -import os +import functools +import itertools +import operator import re from .common import InfoExtractor from ..compat import ( - compat_urllib_parse_unquote, - compat_urllib_parse_unquote_plus, - compat_urllib_parse_urlparse, + compat_HTTPError, + compat_str, + compat_urllib_request, ) +from .openload import PhantomJSwrapper from ..utils import ( + determine_ext, ExtractorError, int_or_none, - sanitized_Request, + orderedSet, + remove_quotes, str_to_int, -) -from ..aes import ( - aes_decrypt_text + url_or_none, ) -class PornHubIE(InfoExtractor): - _VALID_URL = r'https?://(?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)(?P[0-9a-z]+)' +class PornHubBaseIE(InfoExtractor): + def _download_webpage_handle(self, *args, **kwargs): + def dl(*args, **kwargs): + return super(PornHubBaseIE, self)._download_webpage_handle(*args, **kwargs) + + webpage, urlh = dl(*args, **kwargs) + + if any(re.search(p, webpage) for p in ( + r']+\bonload=["\']go\(\)', + r'document\.cookie\s*=\s*["\']RNKEY=', + r'document\.location\.reload\(true\)')): + url_or_request = args[0] + url = (url_or_request.get_full_url() + if isinstance(url_or_request, compat_urllib_request.Request) + else url_or_request) + phantom = PhantomJSwrapper(self, required_version='2.0') + phantom.get(url, html=webpage) + webpage, urlh = dl(*args, **kwargs) + + return webpage, urlh + + +class PornHubIE(PornHubBaseIE): + IE_DESC = 'PornHub and Thumbzilla' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:[^/]+\.)?(?Ppornhub\.(?:com|net))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| + (?:www\.)?thumbzilla\.com/video/ + ) + (?P[\da-z]+) + ''' _TESTS = [{ 'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015', 'md5': '1e19b41231a02eba417839222ac9d58e', @@ -30,65 +64,238 @@ 'ext': 'mp4', 'title': 'Seductive Indian beauty strips down and fingers her pink pussy', 'uploader': 'Babes', + 'upload_date': '20130628', 'duration': 361, 'view_count': int, 'like_count': int, 'dislike_count': int, 'comment_count': int, 'age_limit': 18, - } + 'tags': list, + 'categories': list, + }, + }, { + # non-ASCII title + 'url': 
'http://www.pornhub.com/view_video.php?viewkey=1331683002', + 'info_dict': { + 'id': '1331683002', + 'ext': 'mp4', + 'title': '重庆婷婷女王足交', + 'uploader': 'Unknown', + 'upload_date': '20150213', + 'duration': 1753, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'age_limit': 18, + 'tags': list, + 'categories': list, + }, + 'params': { + 'skip_download': True, + }, + }, { + # subtitles + 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5af5fef7c2aa7', + 'info_dict': { + 'id': 'ph5af5fef7c2aa7', + 'ext': 'mp4', + 'title': 'BFFS - Cute Teen Girls Share Cock On the Floor', + 'uploader': 'BFFs', + 'duration': 622, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'age_limit': 18, + 'tags': list, + 'categories': list, + 'subtitles': { + 'en': [{ + "ext": 'srt' + }] + }, + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d', 'only_matching': True, }, { + # removed at the request of cam4.com 'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862', 'only_matching': True, + }, { + # removed at the request of the copyright owner + 'url': 'http://www.pornhub.com/view_video.php?viewkey=788152859', + 'only_matching': True, + }, { + # removed by uploader + 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph572716d15a111', + 'only_matching': True, + }, { + # private video + 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph56fd731fce6b7', + 'only_matching': True, + }, { + 'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex', + 'only_matching': True, + }, { + 'url': 'http://www.pornhub.com/video/show?viewkey=648719015', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.net/view_video.php?viewkey=203640933', + 'only_matching': True, }] - @classmethod - def _extract_url(cls, webpage): - mobj = re.search( - r']+?src=(["\'])(?P(?:https?:)?//(?:www\.)?pornhub\.com/embed/\d+)\1', webpage) - if mobj: - return mobj.group('url') + @staticmethod + def _extract_urls(webpage): + return re.findall( + r']+?src=["\'](?P(?:https?:)?//(?:www\.)?pornhub\.(?:com|net)/embed/[\da-z]+)', + webpage) def _extract_count(self, pattern, webpage, name): return str_to_int(self._search_regex( pattern, webpage, '%s count' % name, fatal=False)) def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') or 'pornhub.com' + video_id = mobj.group('id') + + self._set_cookie(host, 'age_verified', '1') + + def dl_webpage(platform): + self._set_cookie(host, 'platform', platform) + return self._download_webpage( + 'https://www.%s/view_video.php?viewkey=%s' % (host, video_id), + video_id, 'Downloading %s webpage' % platform) - req = sanitized_Request( - 'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id) - req.add_header('Cookie', 'age_verified=1') - webpage = self._download_webpage(req, video_id) + webpage = dl_webpage('pc') error_msg = self._html_search_regex( - r'(?s)
<div class="userMessageSection[^"]*".*?>(.*?)</div>
', - webpage, 'error message', default=None) + r'(?s)<div[^>]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</div>
', + webpage, 'error message', default=None, group='error') if error_msg: error_msg = re.sub(r'\s+', ' ', error_msg) raise ExtractorError( 'PornHub said: %s' % error_msg, expected=True, video_id=video_id) + # video_title from flashvars contains whitespace instead of non-ASCII (see + # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying + # on that anymore. + title = self._html_search_meta( + 'twitter:title', webpage, default=None) or self._search_regex( + (r'<h1[^>]+class=["\']title["\'][^>]*>(?P<title>[^<]+)', + r'<div[^>]+data-video-title=(["\'])(?P<title>.+?)\1', + r'shareTitle\s*=\s*(["\'])(?P<title>.+?)\1'), + webpage, 'title', group='title') + + video_urls = [] + video_urls_set = set() + subtitles = {} + flashvars = self._parse_json( self._search_regex( r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'), video_id) if flashvars: - video_title = flashvars.get('video_title') + subtitle_url = url_or_none(flashvars.get('closedCaptionsFile')) + if subtitle_url: + subtitles.setdefault('en', []).append({ + 'url': subtitle_url, + 'ext': 'srt', + }) thumbnail = flashvars.get('image_url') duration = int_or_none(flashvars.get('video_duration')) + media_definitions = flashvars.get('mediaDefinitions') + if isinstance(media_definitions, list): + for definition in media_definitions: + if not isinstance(definition, dict): + continue + video_url = definition.get('videoUrl') + if not video_url or not isinstance(video_url, compat_str): + continue + if video_url in video_urls_set: + continue + video_urls_set.add(video_url) + video_urls.append( + (video_url, int_or_none(definition.get('quality')))) else: - video_title, thumbnail, duration = [None] * 3 - if not video_title: - video_title = self._html_search_regex(r'<h1 [^>]+>([^<]+)', webpage, 'title') + thumbnail, duration = [None] * 2 + if not video_urls: + tv_webpage = dl_webpage('tv') + + assignments = self._search_regex( + r'(var.+?mediastring.+?)</script>', tv_webpage, + 'encoded url').split(';') + + js_vars = {} + + def parse_js_value(inp): + inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp) + if '+' in inp: + inps = inp.split('+') + return functools.reduce( + operator.concat, map(parse_js_value, inps)) + inp = inp.strip() + if inp in js_vars: + return js_vars[inp] + return remove_quotes(inp) + + for assn in assignments: + assn = assn.strip() + if not assn: + continue + assn = re.sub(r'var\s+', '', assn) + vname, value = assn.split('=', 1) + js_vars[vname] = parse_js_value(value) + + video_url = js_vars['mediastring'] + if video_url not in video_urls_set: + video_urls.append((video_url, None)) + video_urls_set.add(video_url) + + for mobj in re.finditer( + r'<a[^>]+\bclass=["\']downloadBtn\b[^>]+\bhref=(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage): + video_url = mobj.group('url') + if video_url not in video_urls_set: + video_urls.append((video_url, None)) + video_urls_set.add(video_url) + + upload_date = None + formats = [] + for video_url, height in video_urls: + if not upload_date: + upload_date = self._search_regex( + r'/(\d{6}/\d{2})/', video_url, 'upload data', default=None) + if upload_date: + upload_date = upload_date.replace('/', '') + if determine_ext(video_url) == 'mpd': + formats.extend(self._extract_mpd_formats( + video_url, video_id, mpd_id='dash', fatal=False)) + continue + tbr = None + mobj = re.search(r'(?P<height>\d+)[pP]?_(?P<tbr>\d+)[kK]', video_url) + if mobj: + if not height: + height = int(mobj.group('height')) + tbr = 
int(mobj.group('tbr')) + formats.append({ + 'url': video_url, + 'format_id': '%dp' % height if height else None, + 'height': height, + 'tbr': tbr, + }) + self._sort_formats(formats) video_uploader = self._html_search_regex( - r'(?s)From: .+?<(?:a href="/users/|a href="/channels/|span class="username)[^>]+>(.+?)<', + r'(?s)From: .+?<(?:a\b[^>]+\bhref=["\']/(?:(?:user|channel)s|model|pornstar)/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<', webpage, 'uploader', fatal=False) view_count = self._extract_count( @@ -100,41 +307,18 @@ comment_count = self._extract_count( r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment') - video_urls = list(map(compat_urllib_parse_unquote, re.findall(r"player_quality_[0-9]{3}p\s*=\s*'([^']+)'", webpage))) - if webpage.find('"encrypted":true') != -1: - password = compat_urllib_parse_unquote_plus( - self._search_regex(r'"video_title":"([^"]+)', webpage, 'password')) - video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls)) - - formats = [] - for video_url in video_urls: - path = compat_urllib_parse_urlparse(video_url).path - extension = os.path.splitext(path)[1][1:] - format = path.split('/')[5].split('_')[:2] - format = '-'.join(format) - - m = re.match(r'^(?P<height>[0-9]+)[pP]-(?P<tbr>[0-9]+)[kK]$', format) - if m is None: - height = None - tbr = None - else: - height = int(m.group('height')) - tbr = int(m.group('tbr')) - - formats.append({ - 'url': video_url, - 'ext': extension, - 'format': format, - 'format_id': format, - 'tbr': tbr, - 'height': height, - }) - self._sort_formats(formats) + def extract_list(meta_key): + div = self._search_regex( + r'(?s)<div[^>]+\bclass=["\'].*?\b%sWrapper[^>]*>(.+?)</div>' + % meta_key, webpage, meta_key, default=None) + if div: + return re.findall(r'<a[^>]+\bhref=[^>]+>([^<]+)', div) return { 'id': video_id, 'uploader': video_uploader, - 'title': video_title, + 'upload_date': upload_date, + 'title': title, 'thumbnail': thumbnail, 'duration': duration, 'view_count': view_count, @@ -143,58 +327,125 @@ 'comment_count': comment_count, 'formats': formats, 'age_limit': 18, + 'tags': extract_list('tags'), + 'categories': extract_list('categories'), + 'subtitles': subtitles, } -class PornHubPlaylistBaseIE(InfoExtractor): - def _extract_entries(self, webpage): +class PornHubPlaylistBaseIE(PornHubBaseIE): + def _extract_entries(self, webpage, host): + # Only process container div with main playlist content skipping + # drop-down menu that uses similar pattern for videos (see + # https://github.com/ytdl-org/youtube-dl/issues/11594). 
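# A standalone sketch of the container-first lookup used below, with an
# assumed `page` string and a hypothetical helper name; it is not part of
# the diff itself. Narrowing the search to the main content container keeps
# links from the navigation drop-down out of the playlist, and the seen-set
# mirrors what utils.orderedSet does: deduplicate while preserving order.
import re

def extract_view_keys(page):
    mobj = re.search(r'(?s)(<div[^>]+class=["\']container.+)', page)
    container = mobj.group(1) if mobj else page
    seen, keys = set(), []
    for key in re.findall(
            r'href="/?view_video\.php\?.*?\bviewkey=([\da-z]+)', container):
        if key not in seen:
            seen.add(key)
            keys.append(key)
    return keys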
+ container = self._search_regex( + r'(?s)(<div[^>]+class=["\']container.+)', webpage, + 'container', default=webpage) + return [ - self.url_result('http://www.pornhub.com/%s' % video_url, PornHubIE.ie_key()) - for video_url in set(re.findall( - r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"', webpage)) + self.url_result( + 'http://www.%s/%s' % (host, video_url), + PornHubIE.ie_key(), video_title=title) + for video_url, title in orderedSet(re.findall( + r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"', + container)) ] def _real_extract(self, url): - playlist_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') + playlist_id = mobj.group('id') webpage = self._download_webpage(url, playlist_id) - entries = self._extract_entries(webpage) + entries = self._extract_entries(webpage, host) playlist = self._parse_json( self._search_regex( - r'playlistObject\s*=\s*({.+?});', webpage, 'playlist'), - playlist_id) + r'(?:playlistObject|PLAYLIST_VIEW)\s*=\s*({.+?});', webpage, + 'playlist', default='{}'), + playlist_id, fatal=False) + title = playlist.get('title') or self._search_regex( + r'>Videos\s+in\s+(.+?)\s+[Pp]laylist<', webpage, 'title', fatal=False) return self.playlist_result( - entries, playlist_id, playlist.get('title'), playlist.get('description')) + entries, playlist_id, title, playlist.get('description')) class PornHubPlaylistIE(PornHubPlaylistBaseIE): - _VALID_URL = r'https?://(?:www\.)?pornhub\.com/playlist/(?P<id>\d+)' + _VALID_URL = r'https?://(?:[^/]+\.)?(?P<host>pornhub\.(?:com|net))/playlist/(?P<id>\d+)' _TESTS = [{ - 'url': 'http://www.pornhub.com/playlist/6201671', + 'url': 'http://www.pornhub.com/playlist/4667351', 'info_dict': { - 'id': '6201671', - 'title': 'P0p4', + 'id': '4667351', + 'title': 'Nataly Hot', }, - 'playlist_mincount': 35, + 'playlist_mincount': 2, + }, { + 'url': 'https://de.pornhub.com/playlist/4667351', + 'only_matching': True, }] class PornHubUserVideosIE(PornHubPlaylistBaseIE): - _VALID_URL = r'https?://(?:www\.)?pornhub\.com/users/(?P<id>[^/]+)/videos' + _VALID_URL = r'https?://(?:[^/]+\.)?(?P<host>pornhub\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos' _TESTS = [{ + 'url': 'http://www.pornhub.com/users/zoe_ph/videos/public', + 'info_dict': { + 'id': 'zoe_ph', + }, + 'playlist_mincount': 171, + }, { 'url': 'http://www.pornhub.com/users/rushandlia/videos', + 'only_matching': True, + }, { + # default sorting as Top Rated Videos + 'url': 'https://www.pornhub.com/channels/povd/videos', 'info_dict': { - 'id': 'rushandlia', + 'id': 'povd', }, - 'playlist_mincount': 13, + 'playlist_mincount': 293, + }, { + # Top Rated Videos + 'url': 'https://www.pornhub.com/channels/povd/videos?o=ra', + 'only_matching': True, + }, { + # Most Recent Videos + 'url': 'https://www.pornhub.com/channels/povd/videos?o=da', + 'only_matching': True, + }, { + # Most Viewed Videos + 'url': 'https://www.pornhub.com/channels/povd/videos?o=vi', + 'only_matching': True, + }, { + 'url': 'http://www.pornhub.com/users/zoe_ph/videos/public', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/model/jayndrea/videos/upload', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload', + 'only_matching': True, }] def _real_extract(self, url): - user_id = self._match_id(url) - - webpage = self._download_webpage(url, user_id) + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') + user_id = mobj.group('id') + + entries = [] + 
for page_num in itertools.count(1): + try: + webpage = self._download_webpage( + url, user_id, 'Downloading page %d' % page_num, + query={'page': page_num}) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: + break + raise + page_entries = self._extract_entries(webpage, host) + if not page_entries: + break + entries.extend(page_entries) - return self.playlist_result(self._extract_entries(webpage), user_id) + return self.playlist_result(entries, user_id) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/pornotube.py youtube-dl-2019.05.20/youtube_dl/extractor/pornotube.py --- youtube-dl-2016.02.22/youtube_dl/extractor/pornotube.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/pornotube.py 2019-06-07 18:49:07.000000000 +0000 @@ -3,10 +3,7 @@ import json from .common import InfoExtractor -from ..utils import ( - int_or_none, - sanitized_Request, -) +from ..utils import int_or_none class PornotubeIE(InfoExtractor): @@ -22,7 +19,7 @@ 'description': 'md5:a8304bef7ef06cb4ab476ca6029b01b0', 'categories': ['Adult Humor', 'Blondes'], 'uploader': 'Alpha Blue Archives', - 'thumbnail': 're:^https?://.*\\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'timestamp': 1417582800, 'age_limit': 18, } @@ -31,59 +28,55 @@ def _real_extract(self, url): video_id = self._match_id(url) - # Fetch origin token - js_config = self._download_webpage( - 'http://www.pornotube.com/assets/src/app/config.js', video_id, - note='Download JS config') - originAuthenticationSpaceKey = self._search_regex( - r"constant\('originAuthenticationSpaceKey',\s*'([^']+)'", - js_config, 'originAuthenticationSpaceKey') - - # Fetch actual token - token_req_data = { - 'authenticationSpaceKey': originAuthenticationSpaceKey, - 'credentials': 'Clip Application', - } - token_req = sanitized_Request( - 'https://api.aebn.net/auth/v1/token/primal', - data=json.dumps(token_req_data).encode('utf-8')) - token_req.add_header('Content-Type', 'application/json') - token_req.add_header('Origin', 'http://www.pornotube.com') - token_answer = self._download_json( - token_req, video_id, note='Requesting primal token') - token = token_answer['tokenKey'] - - # Get video URL - delivery_req = sanitized_Request( - 'https://api.aebn.net/delivery/v1/clips/%s/MP4' % video_id) - delivery_req.add_header('Authorization', token) - delivery_info = self._download_json( - delivery_req, video_id, note='Downloading delivery information') - video_url = delivery_info['mediaUrl'] - - # Get additional info (title etc.) 
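# The replacement below folds the old sanitized_Request boilerplate into
# _download_json calls. The same two-step flow, a token request followed by
# a delivery lookup that sends the token as an Authorization header, can be
# sketched with only the standard library (Python 3 here, unlike the
# py2/py3-compatible code in the diff; endpoints as shown below):
import json
from urllib.request import Request, urlopen

def fetch_media_url(video_id):
    token_req = Request(
        'https://api.aebn.net/auth/v2/origins/authenticate',
        data=json.dumps({'credentials': 'Clip Application'}).encode('utf-8'),
        headers={'Content-Type': 'application/json',
                 'Origin': 'http://www.pornotube.com'})
    token = json.load(urlopen(token_req))['tokenKey']
    delivery_req = Request(
        'https://api.aebn.net/delivery/v1/clips/%s/MP4' % video_id,
        headers={'Authorization': token})
    return json.load(urlopen(delivery_req))['mediaUrl']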
- info_req = sanitized_Request( - 'https://api.aebn.net/content/v1/clips/%s?expand=' - 'title,description,primaryImageNumber,startSecond,endSecond,' - 'movie.title,movie.MovieId,movie.boxCoverFront,movie.stars,' - 'movie.studios,stars.name,studios.name,categories.name,' - 'clipActive,movieActive,publishDate,orientations' % video_id) - info_req.add_header('Authorization', token) + token = self._download_json( + 'https://api.aebn.net/auth/v2/origins/authenticate', + video_id, note='Downloading token', + data=json.dumps({'credentials': 'Clip Application'}).encode('utf-8'), + headers={ + 'Content-Type': 'application/json', + 'Origin': 'http://www.pornotube.com', + })['tokenKey'] + + video_url = self._download_json( + 'https://api.aebn.net/delivery/v1/clips/%s/MP4' % video_id, + video_id, note='Downloading delivery information', + headers={'Authorization': token})['mediaUrl'] + + FIELDS = ( + 'title', 'description', 'startSecond', 'endSecond', 'publishDate', + 'studios{name}', 'categories{name}', 'movieId', 'primaryImageNumber' + ) + info = self._download_json( - info_req, video_id, note='Downloading metadata') + 'https://api.aebn.net/content/v2/clips/%s?fields=%s' + % (video_id, ','.join(FIELDS)), video_id, + note='Downloading metadata', + headers={'Authorization': token}) + + if isinstance(info, list): + info = info[0] + + title = info['title'] timestamp = int_or_none(info.get('publishDate'), scale=1000) uploader = info.get('studios', [{}])[0].get('name') - movie_id = info['movie']['movieId'] - thumbnail = 'http://pic.aebn.net/dis/t/%s/%s_%08d.jpg' % ( - movie_id, movie_id, info['primaryImageNumber']) - categories = [c['name'] for c in info.get('categories')] + movie_id = info.get('movieId') + primary_image_number = info.get('primaryImageNumber') + thumbnail = None + if movie_id and primary_image_number: + thumbnail = 'http://pic.aebn.net/dis/t/%s/%s_%08d.jpg' % ( + movie_id, movie_id, primary_image_number) + start = int_or_none(info.get('startSecond')) + end = int_or_none(info.get('endSecond')) + duration = end - start if start and end else None + categories = [c['name'] for c in info.get('categories', []) if c.get('name')] return { 'id': video_id, 'url': video_url, - 'title': info['title'], + 'title': title, 'description': info.get('description'), + 'duration': duration, 'timestamp': timestamp, 'uploader': uploader, 'thumbnail': thumbnail, diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/pornovoisines.py youtube-dl-2019.05.20/youtube_dl/extractor/pornovoisines.py --- youtube-dl-2016.02.22/youtube_dl/extractor/pornovoisines.py 2016-02-14 22:26:17.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/pornovoisines.py 2019-06-07 18:49:07.000000000 +0000 @@ -2,7 +2,6 @@ from __future__ import unicode_literals import re -import random from .common import InfoExtractor from ..utils import ( @@ -13,61 +12,69 @@ class PornoVoisinesIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?pornovoisines\.com/showvideo/(?P<id>\d+)/(?P<display_id>[^/]+)' - - _VIDEO_URL_TEMPLATE = 'http://stream%d.pornovoisines.com' \ - '/static/media/video/transcoded/%s-640x360-1000-trscded.mp4' - - _SERVER_NUMBERS = (1, 2) + _VALID_URL = r'https?://(?:www\.)?pornovoisines\.com/videos/show/(?P<id>\d+)/(?P<display_id>[^/.]+)' _TEST = { - 'url': 'http://www.pornovoisines.com/showvideo/1285/recherche-appartement/', - 'md5': '5ac670803bc12e9e7f9f662ce64cf1d1', + 'url': 'http://www.pornovoisines.com/videos/show/919/recherche-appartement.html', + 'md5': '6f8aca6a058592ab49fe701c8ba8317b', 'info_dict': { - 'id': 
'1285', + 'id': '919', 'display_id': 'recherche-appartement', 'ext': 'mp4', 'title': 'Recherche appartement', - 'description': 'md5:819ea0b785e2a04667a1a01cdc89594e', - 'thumbnail': 're:^https?://.*\.jpg$', + 'description': 'md5:fe10cb92ae2dd3ed94bb4080d11ff493', + 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20140925', 'duration': 120, 'view_count': int, 'average_rating': float, - 'categories': ['Débutantes', 'Scénario', 'Sodomie'], + 'categories': ['Débutante', 'Débutantes', 'Scénario', 'Sodomie'], 'age_limit': 18, + 'subtitles': { + 'fr': [{ + 'ext': 'vtt', + }] + }, } } - @classmethod - def build_video_url(cls, num): - return cls._VIDEO_URL_TEMPLATE % (random.choice(cls._SERVER_NUMBERS), num) - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') display_id = mobj.group('display_id') + settings_url = self._download_json( + 'http://www.pornovoisines.com/api/video/%s/getsettingsurl/' % video_id, + video_id, note='Getting settings URL')['video_settings_url'] + settings = self._download_json(settings_url, video_id)['data'] + + formats = [] + for kind, data in settings['variants'].items(): + if kind == 'HLS': + formats.extend(self._extract_m3u8_formats( + data, video_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls')) + elif kind == 'MP4': + for item in data: + formats.append({ + 'url': item['url'], + 'height': item.get('height'), + 'bitrate': item.get('bitrate'), + }) + self._sort_formats(formats) + webpage = self._download_webpage(url, video_id) - video_url = self.build_video_url(video_id) + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) - title = self._html_search_regex( - r'<h1>(.+?)</h1>', webpage, 'title', flags=re.DOTALL) - description = self._html_search_regex( - r'<article id="descriptif">(.+?)</article>', - webpage, 'description', fatal=False, flags=re.DOTALL) - - thumbnail = self._search_regex( - r'<div id="mediaspace%s">\s*<img src="/?([^"]+)"' % video_id, - webpage, 'thumbnail', fatal=False) - if thumbnail: - thumbnail = 'http://www.pornovoisines.com/%s' % thumbnail + # The webpage has a bug - there's no space between "thumb" and src= + thumbnail = self._html_search_regex( + r'<img[^>]+class=([\'"])thumb\1[^>]*src=([\'"])(?P<url>[^"]+)\2', + webpage, 'thumbnail', fatal=False, group='url') upload_date = unified_strdate(self._search_regex( - r'Publié le ([\d-]+)', webpage, 'upload date', fatal=False)) - duration = int_or_none(self._search_regex( - 'Durée (\d+)', webpage, 'duration', fatal=False)) + r'Le\s*<b>([\d/]+)', webpage, 'upload date', fatal=False)) + duration = settings.get('main', {}).get('duration') view_count = int_or_none(self._search_regex( r'(\d+) vues', webpage, 'view count', fatal=False)) average_rating = self._search_regex( @@ -75,15 +82,19 @@ if average_rating: average_rating = float_or_none(average_rating.replace(',', '.')) - categories = self._html_search_meta( - 'keywords', webpage, 'categories', fatal=False) + categories = self._html_search_regex( + r'(?s)Catégories\s*:\s*<b>(.+?)</b>', webpage, 'categories', fatal=False) if categories: categories = [category.strip() for category in categories.split(',')] + subtitles = {'fr': [{ + 'url': subtitle, + } for subtitle in settings.get('main', {}).get('vtt_tracks', {}).values()]} + return { 'id': video_id, 'display_id': display_id, - 'url': video_url, + 'formats': formats, 'title': title, 'description': description, 'thumbnail': thumbnail, @@ -93,4 +104,5 @@ 'average_rating': average_rating, 'categories': 
categories, 'age_limit': 18, + 'subtitles': subtitles, } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/pornoxo.py youtube-dl-2019.05.20/youtube_dl/extractor/pornoxo.py --- youtube-dl-2016.02.22/youtube_dl/extractor/pornoxo.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/pornoxo.py 2019-06-07 18:49:07.000000000 +0000 @@ -17,32 +17,24 @@ 'id': '7564', 'ext': 'flv', 'title': 'Striptease From Sexy Secretary!', - 'description': 'Striptease From Sexy Secretary!', + 'display_id': 'striptease-from-sexy-secretary', + 'description': 'md5:0ee35252b685b3883f4a1d38332f9980', 'categories': list, # NSFW - 'thumbnail': 're:https?://.*\.jpg$', + 'thumbnail': r're:https?://.*\.jpg$', 'age_limit': 18, } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id, display_id = mobj.groups() webpage = self._download_webpage(url, video_id) - - video_url = self._html_search_regex( - r'\'file\'\s*:\s*"([^"]+)"', webpage, 'video_url') + video_data = self._extract_jwplayer_data(webpage, video_id, require_title=False) title = self._html_search_regex( r'<title>([^<]+)\s*-\s*PornoXO', webpage, 'title') - description = self._html_search_regex( - r'<meta name="description" content="([^"]+)\s*featuring', - webpage, 'description', fatal=False) - - thumbnail = self._html_search_regex( - r'\'image\'\s*:\s*"([^"]+)"', webpage, 'thumbnail', fatal=False) - view_count = str_to_int(self._html_search_regex( r'[vV]iews:\s*([0-9,]+)', webpage, 'view count', fatal=False)) @@ -53,13 +45,14 @@ None if categories_str is None else categories_str.split(',')) - return { + video_data.update({ 'id': video_id, - 'url': video_url, 'title': title, - 'description': description, - 'thumbnail': thumbnail, + 'display_id': display_id, + 'description': self._html_search_meta('description', webpage), 'categories': categories, 'view_count': view_count, 'age_limit': 18, - } + }) + + return video_data diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/presstv.py youtube-dl-2019.05.20/youtube_dl/extractor/presstv.py --- youtube-dl-2016.02.22/youtube_dl/extractor/presstv.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/presstv.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,74 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import remove_start + + +class PressTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?presstv\.ir/[^/]+/(?P<y>\d+)/(?P<m>\d+)/(?P<d>\d+)/(?P<id>\d+)/(?P<display_id>[^/]+)?' 
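# PressTV URLs embed the publication date, so the extractor below can derive
# upload_date without reading the page. A minimal sketch of that mapping,
# reusing the named groups from the _VALID_URL above (the helper name is
# illustrative only):
import re

def presstv_upload_date(url):
    mobj = re.match(
        r'https?://(?:www\.)?presstv\.ir/[^/]+/(?P<y>\d+)/(?P<m>\d+)/(?P<d>\d+)/(?P<id>\d+)',
        url)
    # Zero-pad into the YYYYMMDD form youtube-dl uses for upload_date.
    return '%04d%02d%02d' % (
        int(mobj.group('y')), int(mobj.group('m')), int(mobj.group('d')))

# presstv_upload_date('http://www.presstv.ir/Detail/2016/04/09/459911/x')
# returns '20160409'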
+ + _TEST = { + 'url': 'http://www.presstv.ir/Detail/2016/04/09/459911/Australian-sewerage-treatment-facility-/', + 'md5': '5d7e3195a447cb13e9267e931d8dd5a5', + 'info_dict': { + 'id': '459911', + 'display_id': 'Australian-sewerage-treatment-facility-', + 'ext': 'mp4', + 'title': 'Organic mattresses used to clean waste water', + 'upload_date': '20160409', + 'thumbnail': r're:^https?://.*\.jpg', + 'description': 'md5:20002e654bbafb6908395a5c0cfcd125' + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') or video_id + + webpage = self._download_webpage(url, display_id) + + # extract video URL from webpage + video_url = self._hidden_inputs(webpage)['inpPlayback'] + + # build list of available formats + # specified in http://www.presstv.ir/Scripts/playback.js + base_url = 'http://192.99.219.222:82/presstv' + _formats = [ + (180, '_low200.mp4'), + (360, '_low400.mp4'), + (720, '_low800.mp4'), + (1080, '.mp4') + ] + + formats = [{ + 'url': base_url + video_url[:-4] + extension, + 'format_id': '%dp' % height, + 'height': height, + } for height, extension in _formats] + + # extract video metadata + title = remove_start( + self._html_search_meta('title', webpage, fatal=True), 'PressTV-') + + thumbnail = self._og_search_thumbnail(webpage) + description = self._og_search_description(webpage) + + upload_date = '%04d%02d%02d' % ( + int(mobj.group('y')), + int(mobj.group('m')), + int(mobj.group('d')), + ) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'description': description + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/primesharetv.py youtube-dl-2019.05.20/youtube_dl/extractor/primesharetv.py --- youtube-dl-2016.02.22/youtube_dl/extractor/primesharetv.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/primesharetv.py 2019-06-07 18:47:18.000000000 +0000 @@ -1,10 +1,10 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse from ..utils import ( ExtractorError, sanitized_Request, + urlencode_postdata, ) @@ -42,7 +42,7 @@ self._sleep(wait_time, video_id) req = sanitized_Request( - url, compat_urllib_parse.urlencode(fields), headers) + url, urlencode_postdata(fields), headers) video_page = self._download_webpage( req, video_id, 'Downloading video page') diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/promptfile.py youtube-dl-2019.05.20/youtube_dl/extractor/promptfile.py --- youtube-dl-2016.02.22/youtube_dl/extractor/promptfile.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/promptfile.py 2019-06-07 18:49:07.000000000 +0000 @@ -4,24 +4,23 @@ import re from .common import InfoExtractor -from ..compat import compat_urllib_parse from ..utils import ( determine_ext, ExtractorError, - sanitized_Request, + urlencode_postdata, ) class PromptFileIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?promptfile\.com/l/(?P<id>[0-9A-Z\-]+)' _TEST = { - 'url': 'http://www.promptfile.com/l/D21B4746E9-F01462F0FF', - 'md5': 'd1451b6302da7215485837aaea882c4c', + 'url': 'http://www.promptfile.com/l/86D1CE8462-576CAAE416', + 'md5': '5a7e285a26e0d66d9a263fae91bc92ce', 'info_dict': { - 'id': 'D21B4746E9-F01462F0FF', + 'id': '86D1CE8462-576CAAE416', 'ext': 'mp4', - 'title': 'Birds.mp4', - 'thumbnail': 're:^https?://.*\.jpg$', + 'title': 'oceans.mp4', + 'thumbnail': 
r're:^https?://.*\.jpg$', } } @@ -33,14 +32,23 @@ raise ExtractorError('Video %s does not exist' % video_id, expected=True) + chash = self._search_regex( + r'val\("([^"]*)"\s*\+\s*\$\("#chash"\)', webpage, 'chash') fields = self._hidden_inputs(webpage) - post = compat_urllib_parse.urlencode(fields) - req = sanitized_Request(url, post) - req.add_header('Content-type', 'application/x-www-form-urlencoded') - webpage = self._download_webpage( - req, video_id, 'Downloading video page') + keys = list(fields.keys()) + chash_key = keys[0] if len(keys) == 1 else next( + key for key in keys if key.startswith('cha')) + fields[chash_key] = chash + fields[chash_key] - url = self._html_search_regex(r'url:\s*\'([^\']+)\'', webpage, 'URL') + webpage = self._download_webpage( + url, video_id, 'Downloading video page', + data=urlencode_postdata(fields), + headers={'Content-type': 'application/x-www-form-urlencoded'}) + + video_url = self._search_regex( + (r'<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1[^>]*>\s*Download File', + r'<a[^>]+href=(["\'])(?P<url>https?://(?:www\.)?promptfile\.com/file/(?:(?!\1).)+)\1'), + webpage, 'video url', group='url') title = self._html_search_regex( r'<span.+title="([^"]+)">', webpage, 'title') thumbnail = self._html_search_regex( @@ -49,7 +57,7 @@ formats = [{ 'format_id': 'sd', - 'url': url, + 'url': video_url, 'ext': determine_ext(title), }] self._sort_formats(formats) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/prosiebensat1.py youtube-dl-2019.05.20/youtube_dl/extractor/prosiebensat1.py --- youtube-dl-2016.02.22/youtube_dl/extractor/prosiebensat1.py 2016-01-15 18:42:53.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/prosiebensat1.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,13 +1,11 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re from hashlib import sha1 from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse, -) +from ..compat import compat_str from ..utils import ( ExtractorError, determine_ext, @@ -17,31 +15,185 @@ ) -class ProSiebenSat1IE(InfoExtractor): +class ProSiebenSat1BaseIE(InfoExtractor): + _GEO_COUNTRIES = ['DE'] + _ACCESS_ID = None + _SUPPORTED_PROTOCOLS = 'dash:clear,hls:clear,progressive:clear' + _V4_BASE_URL = 'https://vas-v4.p7s1video.net/4.0/get' + + def _extract_video_info(self, url, clip_id): + client_location = url + + video = self._download_json( + 'http://vas.sim-technik.de/vas/live/v2/videos', + clip_id, 'Downloading videos JSON', query={ + 'access_token': self._TOKEN, + 'client_location': client_location, + 'client_name': self._CLIENT_NAME, + 'ids': clip_id, + })[0] + + if video.get('is_protected') is True: + raise ExtractorError('This video is DRM protected.', expected=True) + + formats = [] + if self._ACCESS_ID: + raw_ct = self._ENCRYPTION_KEY + clip_id + self._IV + self._ACCESS_ID + server_token = (self._download_json( + self._V4_BASE_URL + 'protocols', clip_id, + 'Downloading protocols JSON', + headers=self.geo_verification_headers(), query={ + 'access_id': self._ACCESS_ID, + 'client_token': sha1((raw_ct).encode()).hexdigest(), + 'video_id': clip_id, + }, fatal=False) or {}).get('server_token') + if server_token: + urls = (self._download_json( + self._V4_BASE_URL + 'urls', clip_id, 'Downloading urls JSON', query={ + 'access_id': self._ACCESS_ID, + 'client_token': sha1((raw_ct + server_token + self._SUPPORTED_PROTOCOLS).encode()).hexdigest(), + 'protocols': self._SUPPORTED_PROTOCOLS, + 'server_token': server_token, + 'video_id': clip_id, + }, fatal=False) 
or {}).get('urls') or {} + for protocol, variant in urls.items(): + source_url = variant.get('clear', {}).get('url') + if not source_url: + continue + if protocol == 'dash': + formats.extend(self._extract_mpd_formats( + source_url, clip_id, mpd_id=protocol, fatal=False)) + elif protocol == 'hls': + formats.extend(self._extract_m3u8_formats( + source_url, clip_id, 'mp4', 'm3u8_native', + m3u8_id=protocol, fatal=False)) + else: + formats.append({ + 'url': source_url, + 'format_id': protocol, + }) + if not formats: + source_ids = [compat_str(source['id']) for source in video['sources']] + + client_id = self._SALT[:2] + sha1(''.join([clip_id, self._SALT, self._TOKEN, client_location, self._SALT, self._CLIENT_NAME]).encode('utf-8')).hexdigest() + + sources = self._download_json( + 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources' % clip_id, + clip_id, 'Downloading sources JSON', query={ + 'access_token': self._TOKEN, + 'client_id': client_id, + 'client_location': client_location, + 'client_name': self._CLIENT_NAME, + }) + server_id = sources['server_id'] + + def fix_bitrate(bitrate): + bitrate = int_or_none(bitrate) + if not bitrate: + return None + return (bitrate // 1000) if bitrate % 1000 == 0 else bitrate + + for source_id in source_ids: + client_id = self._SALT[:2] + sha1(''.join([self._SALT, clip_id, self._TOKEN, server_id, client_location, source_id, self._SALT, self._CLIENT_NAME]).encode('utf-8')).hexdigest() + urls = self._download_json( + 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url' % clip_id, + clip_id, 'Downloading urls JSON', fatal=False, query={ + 'access_token': self._TOKEN, + 'client_id': client_id, + 'client_location': client_location, + 'client_name': self._CLIENT_NAME, + 'server_id': server_id, + 'source_ids': source_id, + }) + if not urls: + continue + if urls.get('status_code') != 0: + raise ExtractorError('This video is unavailable', expected=True) + urls_sources = urls['sources'] + if isinstance(urls_sources, dict): + urls_sources = urls_sources.values() + for source in urls_sources: + source_url = source.get('url') + if not source_url: + continue + protocol = source.get('protocol') + mimetype = source.get('mimetype') + if mimetype == 'application/f4m+xml' or 'f4mgenerator' in source_url or determine_ext(source_url) == 'f4m': + formats.extend(self._extract_f4m_formats( + source_url, clip_id, f4m_id='hds', fatal=False)) + elif mimetype == 'application/x-mpegURL': + formats.extend(self._extract_m3u8_formats( + source_url, clip_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif mimetype == 'application/dash+xml': + formats.extend(self._extract_mpd_formats( + source_url, clip_id, mpd_id='dash', fatal=False)) + else: + tbr = fix_bitrate(source['bitrate']) + if protocol in ('rtmp', 'rtmpe'): + mobj = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source_url) + if not mobj: + continue + path = mobj.group('path') + mp4colon_index = path.rfind('mp4:') + app = path[:mp4colon_index] + play_path = path[mp4colon_index:] + formats.append({ + 'url': '%s/%s' % (mobj.group('url'), app), + 'app': app, + 'play_path': play_path, + 'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf', + 'page_url': 'http://www.prosieben.de', + 'tbr': tbr, + 'ext': 'flv', + 'format_id': 'rtmp%s' % ('-%d' % tbr if tbr else ''), + }) + else: + formats.append({ + 'url': source_url, + 'tbr': tbr, + 'format_id': 'http%s' % ('-%d' % tbr if tbr else ''), + }) + self._sort_formats(formats) + + return { + 'duration': 
float_or_none(video.get('duration')), + 'formats': formats, + } + + +class ProSiebenSat1IE(ProSiebenSat1BaseIE): IE_NAME = 'prosiebensat1' IE_DESC = 'ProSiebenSat.1 Digital' - _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|the-voice-of-germany|7tv)\.(?:de|at|ch)|ran\.de|fem\.com)/(?P<id>.+)' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)? + (?: + (?:beta\.)? + (?: + prosieben(?:maxx)?|sixx|sat1(?:gold)?|kabeleins(?:doku)?|the-voice-of-germany|7tv|advopedia + )\.(?:de|at|ch)| + ran\.de|fem\.com|advopedia\.de|galileo\.tv/video + ) + /(?P<id>.+) + ''' _TESTS = [ { - # Tests changes introduced in https://github.com/rg3/youtube-dl/pull/6242 - # in response to fixing https://github.com/rg3/youtube-dl/issues/6215: + # Tests changes introduced in https://github.com/ytdl-org/youtube-dl/pull/6242 + # in response to fixing https://github.com/ytdl-org/youtube-dl/issues/6215: # - malformed f4m manifest support # - proper handling of URLs starting with `https?://` in 2.0 manifests # - recursive child f4m manifests extraction 'url': 'http://www.prosieben.de/tv/circus-halligalli/videos/218-staffel-2-episode-18-jahresrueckblick-ganze-folge', 'info_dict': { 'id': '2104602', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Episode 18 - Staffel 2', 'description': 'md5:8733c81b702ea472e069bc48bb658fc1', 'upload_date': '20131231', 'duration': 5845.04, }, - 'params': { - # rtmp download - 'skip_download': True, - }, }, { 'url': 'http://www.prosieben.de/videokatalog/Gesellschaft/Leben/Trends/video-Lady-Umstyling-f%C3%BCr-Audrina-Rebekka-Audrina-Fergen-billig-aussehen-Battal-Modica-700544.html', @@ -73,6 +225,7 @@ # rtmp download 'skip_download': True, }, + 'skip': 'This video is unavailable', }, { 'url': 'http://www.sixx.de/stars-style/video/sexy-laufen-in-ugg-boots-clip', @@ -88,6 +241,7 @@ # rtmp download 'skip_download': True, }, + 'skip': 'This video is unavailable', }, { 'url': 'http://www.sat1.de/film/der-ruecktritt/video/im-interview-kai-wiesinger-clip', @@ -103,6 +257,7 @@ # rtmp download 'skip_download': True, }, + 'skip': 'This video is unavailable', }, { 'url': 'http://www.kabeleins.de/tv/rosins-restaurants/videos/jagd-auf-fertigkost-im-elsthal-teil-2-ganze-folge', @@ -118,6 +273,7 @@ # rtmp download 'skip_download': True, }, + 'skip': 'This video is unavailable', }, { 'url': 'http://www.ran.de/fussball/bundesliga/video/schalke-toennies-moechte-raul-zurueck-ganze-folge', @@ -133,12 +289,13 @@ # rtmp download 'skip_download': True, }, + 'skip': 'This video is unavailable', }, { 'url': 'http://www.the-voice-of-germany.de/video/31-andreas-kuemmert-rocket-man-clip', 'info_dict': { 'id': '2572814', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Andreas Kümmert: Rocket Man', 'description': 'md5:6ddb02b0781c6adf778afea606652e38', 'upload_date': '20131017', @@ -152,7 +309,7 @@ 'url': 'http://www.fem.com/wellness/videos/wellness-video-clip-kurztripps-zum-valentinstag.html', 'info_dict': { 'id': '2156342', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Kurztrips zum Valentinstag', 'description': 'Romantischer Kurztrip zum Valentinstag? 
Nina Heinemann verrät, was sich hier wirklich lohnt.',
             'duration': 307.24,
@@ -169,12 +326,13 @@
                 'description': 'md5:63b8963e71f481782aeea877658dec84',
             },
             'playlist_count': 2,
+            'skip': 'This video is unavailable',
         },
         {
             'url': 'http://www.7tv.de/circus-halligalli/615-best-of-circus-halligalli-ganze-folge',
             'info_dict': {
                 'id': '4187506',
-                'ext': 'flv',
+                'ext': 'mp4',
                 'title': 'Best of Circus HalliGalli',
                 'description': 'md5:8849752efd90b9772c9db6fdf87fb9e9',
                 'upload_date': '20151229',
@@ -183,14 +341,62 @@
                 'skip_download': True,
             },
         },
+        {
+            # title in <h2 class="subtitle">
+            'url': 'http://www.prosieben.de/stars/oscar-award/videos/jetzt-erst-enthuellt-das-geheimnis-von-emma-stones-oscar-robe-clip',
+            'info_dict': {
+                'id': '4895826',
+                'ext': 'mp4',
+                'title': 'Jetzt erst enthüllt: Das Geheimnis von Emma Stones Oscar-Robe',
+                'description': 'md5:e5ace2bc43fadf7b63adc6187e9450b9',
+                'upload_date': '20170302',
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'skip': 'geo restricted to Germany',
+        },
+        {
+            # geo restricted to Germany
+            'url': 'http://www.kabeleinsdoku.de/tv/mayday-alarm-im-cockpit/video/102-notlandung-im-hudson-river-ganze-folge',
+            'only_matching': True,
+        },
+        {
+            # geo restricted to Germany
+            'url': 'http://www.sat1gold.de/tv/edel-starck/video/11-staffel-1-episode-1-partner-wider-willen-ganze-folge',
+            'only_matching': True,
+        },
+        {
+            # geo restricted to Germany
+            'url': 'https://www.galileo.tv/video/diese-emojis-werden-oft-missverstanden',
+            'only_matching': True,
+        },
+        {
+            'url': 'http://www.sat1gold.de/tv/edel-starck/playlist/die-gesamte-1-staffel',
+            'only_matching': True,
+        },
+        {
+            'url': 'http://www.advopedia.de/videos/lenssen-klaert-auf/lenssen-klaert-auf-folge-8-staffel-3-feiertage-und-freie-tage',
+            'only_matching': True,
+        },
     ]
 
+    _TOKEN = 'prosieben'
+    _SALT = '01!8d8F_)r9]4s[qeuXfP%'
+    _CLIENT_NAME = 'kolibri-2.0.19-splec4'
+
+    _ACCESS_ID = 'x_prosiebenmaxx-de'
+    _ENCRYPTION_KEY = 'Eeyeey9oquahthainoofashoyoikosag'
+    _IV = 'Aeluchoc6aevechuipiexeeboowedaok'
+
     _CLIPID_REGEXES = [
         r'"clip_id"\s*:\s+"(\d+)"',
         r'clipid: "(\d+)"',
         r'clip[iI]d=(\d+)',
-        r'clip[iI]d\s*=\s*["\'](\d+)',
+        r'clip[iI][dD]\s*=\s*["\'](\d+)',
         r"'itemImageUrl'\s*:\s*'/dynamic/thumbnails/full/\d+/(\d+)",
+        r'proMamsId&quot;\s*:\s*&quot;(\d+)',
+        r'proMamsId"\s*:\s*"(\d+)',
     ]
     _TITLE_REGEXES = [
         r'<h2 class="subtitle" itemprop="name">\s*(.+?)</h2>',
@@ -200,6 +406,7 @@
         r'<header class="module_header">\s*<h2>([^<]+)</h2>\s*</header>',
         r'<h2 class="video-title" itemprop="name">\s*(.+?)</h2>',
         r'<div[^>]+id="veeseoTitle"[^>]*>(.+?)</div>',
+        r'<h2[^>]+class="subtitle"[^>]*>([^<]+)</h2>',
     ]
     _DESCRIPTION_REGEXES = [
         r'<p itemprop="description">\s*(.+?)</p>',
@@ -229,134 +436,54 @@
     ]
 
     def _extract_clip(self, url, webpage):
-        clip_id = self._html_search_regex(self._CLIPID_REGEXES, webpage, 'clip id')
-
-        access_token = 'prosieben'
-        client_name = 'kolibri-2.0.19-splec4'
-        client_location = url
-
-        videos_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos?%s' % compat_urllib_parse.urlencode({
-            'access_token': access_token,
-            'client_location': client_location,
-            'client_name': client_name,
-            'ids': clip_id,
-        })
-
-        video = self._download_json(videos_api_url, clip_id, 'Downloading videos JSON')[0]
-
-        if video.get('is_protected') is True:
-            raise ExtractorError('This video is DRM protected.', expected=True)
-
-        duration = float_or_none(video.get('duration'))
-        source_ids = [source['id'] for source in video['sources']]
-        source_ids_str = ','.join(map(str, source_ids))
-
-        g = '01!8d8F_)r9]4s[qeuXfP%'
-
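Both the inline code removed here and the new ProSiebenSat1BaseIE._extract_video_info above derive client_id the same way: the first two characters of the salt, followed by the SHA-1 hexdigest of the salt-delimited request fields. A minimal sketch of the scheme, with illustrative values taken from the tests above:

    from hashlib import sha1

    # illustrative inputs; the extractor fills these in from the page and URL
    clip_id = '2104602'
    access_token = 'prosieben'
    client_name = 'kolibri-2.0.19-splec4'
    client_location = 'http://www.prosieben.de/tv/circus-halligalli/videos/218-staffel-2-episode-18-jahresrueckblick-ganze-folge'
    salt = '01!8d8F_)r9]4s[qeuXfP%'

    # first two salt characters + SHA-1 over the salt-delimited fields
    fields = [clip_id, salt, access_token, client_location, salt, client_name]
    client_id = salt[:2] + sha1(''.join(fields).encode('utf-8')).hexdigest()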
- client_id = g[:2] + sha1(''.join([clip_id, g, access_token, client_location, g, client_name]) - .encode('utf-8')).hexdigest() - - sources_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources?%s' % (clip_id, compat_urllib_parse.urlencode({ - 'access_token': access_token, - 'client_id': client_id, - 'client_location': client_location, - 'client_name': client_name, - })) - - sources = self._download_json(sources_api_url, clip_id, 'Downloading sources JSON') - server_id = sources['server_id'] - - client_id = g[:2] + sha1(''.join([g, clip_id, access_token, server_id, - client_location, source_ids_str, g, client_name]) - .encode('utf-8')).hexdigest() - - url_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url?%s' % (clip_id, compat_urllib_parse.urlencode({ - 'access_token': access_token, - 'client_id': client_id, - 'client_location': client_location, - 'client_name': client_name, - 'server_id': server_id, - 'source_ids': source_ids_str, - })) - - urls = self._download_json(url_api_url, clip_id, 'Downloading urls JSON') - - title = self._html_search_regex(self._TITLE_REGEXES, webpage, 'title') - description = self._html_search_regex(self._DESCRIPTION_REGEXES, webpage, 'description', fatal=False) + clip_id = self._html_search_regex( + self._CLIPID_REGEXES, webpage, 'clip id') + title = self._html_search_regex( + self._TITLE_REGEXES, webpage, 'title', + default=None) or self._og_search_title(webpage) + info = self._extract_video_info(url, clip_id) + description = self._html_search_regex( + self._DESCRIPTION_REGEXES, webpage, 'description', default=None) + if description is None: + description = self._og_search_description(webpage) thumbnail = self._og_search_thumbnail(webpage) - upload_date = unified_strdate(self._html_search_regex( self._UPLOAD_DATE_REGEXES, webpage, 'upload date', default=None)) - formats = [] - - urls_sources = urls['sources'] - if isinstance(urls_sources, dict): - urls_sources = urls_sources.values() - - def fix_bitrate(bitrate): - bitrate = int_or_none(bitrate) - if not bitrate: - return None - return (bitrate // 1000) if bitrate % 1000 == 0 else bitrate - - for source in urls_sources: - protocol = source['protocol'] - source_url = source['url'] - if protocol == 'rtmp' or protocol == 'rtmpe': - mobj = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source_url) - if not mobj: - continue - path = mobj.group('path') - mp4colon_index = path.rfind('mp4:') - app = path[:mp4colon_index] - play_path = path[mp4colon_index:] - formats.append({ - 'url': '%s/%s' % (mobj.group('url'), app), - 'app': app, - 'play_path': play_path, - 'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf', - 'page_url': 'http://www.prosieben.de', - 'vbr': fix_bitrate(source['bitrate']), - 'ext': 'mp4', - 'format_id': '%s_%s' % (source['cdn'], source['bitrate']), - }) - elif 'f4mgenerator' in source_url or determine_ext(source_url) == 'f4m': - formats.extend(self._extract_f4m_formats(source_url, clip_id)) - else: - formats.append({ - 'url': source_url, - 'vbr': fix_bitrate(source['bitrate']), - }) - - self._sort_formats(formats) - - return { + info.update({ 'id': clip_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'upload_date': upload_date, - 'duration': duration, - 'formats': formats, - } + }) + return info def _extract_playlist(self, url, webpage): playlist_id = self._html_search_regex( self._PLAYLIST_ID_REGEXES, webpage, 'playlist id') - for regex in self._PLAYLIST_CLIP_REGEXES: - playlist_clips = 
re.findall(regex, webpage) - if playlist_clips: - title = self._html_search_regex( - self._TITLE_REGEXES, webpage, 'title') - description = self._html_search_regex( - self._DESCRIPTION_REGEXES, webpage, 'description', fatal=False) - entries = [ - self.url_result( - re.match('(.+?//.+?)/', url).group(1) + clip_path, - 'ProSiebenSat1') - for clip_path in playlist_clips] - return self.playlist_result(entries, playlist_id, title, description) + playlist = self._parse_json( + self._search_regex( + r'var\s+contentResources\s*=\s*(\[.+?\]);\s*</script', + webpage, 'playlist'), + playlist_id) + entries = [] + for item in playlist: + clip_id = item.get('id') or item.get('upc') + if not clip_id: + continue + info = self._extract_video_info(url, clip_id) + info.update({ + 'id': clip_id, + 'title': item.get('title') or item.get('teaser', {}).get('headline'), + 'description': item.get('teaser', {}).get('description'), + 'thumbnail': item.get('poster'), + 'duration': float_or_none(item.get('duration')), + 'series': item.get('tvShowTitle'), + 'uploader': item.get('broadcastPublisher'), + }) + entries.append(info) + return self.playlist_result(entries, playlist_id) def _real_extract(self, url): video_id = self._match_id(url) @@ -368,3 +495,6 @@ return self._extract_clip(url, webpage) elif page_type == 'playlist': return self._extract_playlist(url, webpage) + else: + raise ExtractorError( + 'Unsupported page type %s' % page_type, expected=True) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/puhutv.py youtube-dl-2019.05.20/youtube_dl/extractor/puhutv.py --- youtube-dl-2016.02.22/youtube_dl/extractor/puhutv.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/puhutv.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,247 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_HTTPError, + compat_str, +) +from ..utils import ( + ExtractorError, + int_or_none, + float_or_none, + parse_resolution, + str_or_none, + try_get, + unified_timestamp, + url_or_none, + urljoin, +) + + +class PuhuTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?puhutv\.com/(?P<id>[^/?#&]+)-izle' + IE_NAME = 'puhutv' + _TESTS = [{ + # film + 'url': 'https://puhutv.com/sut-kardesler-izle', + 'md5': 'fbd8f2d8e7681f8bcd51b592475a6ae7', + 'info_dict': { + 'id': '5085', + 'display_id': 'sut-kardesler', + 'ext': 'mp4', + 'title': 'Süt Kardeşler', + 'description': 'md5:405fd024df916ca16731114eb18e511a', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 4832.44, + 'creator': 'Arzu Film', + 'timestamp': 1469778212, + 'upload_date': '20160729', + 'release_year': 1976, + 'view_count': int, + 'tags': ['Aile', 'Komedi', 'Klasikler'], + }, + }, { + # episode, geo restricted, bypassable with --geo-verification-proxy + 'url': 'https://puhutv.com/jet-sosyete-1-bolum-izle', + 'only_matching': True, + }, { + # 4k, with subtitles + 'url': 'https://puhutv.com/dip-1-bolum-izle', + 'only_matching': True, + }] + _SUBTITLE_LANGS = { + 'English': 'en', + 'Deutsch': 'de', + 'عربى': 'ar' + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + info = self._download_json( + urljoin(url, '/api/slug/%s-izle' % display_id), + display_id)['data'] + + video_id = compat_str(info['id']) + title = info.get('name') or info['title']['name'] + if info.get('display_name'): + title = '%s %s' % (title, info.get('display_name')) + + try: + videos = self._download_json( + 'https://puhutv.com/api/assets/%s/videos' % video_id, + display_id, 
'Downloading video JSON', + headers=self.geo_verification_headers()) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + self.raise_geo_restricted() + raise + + formats = [] + for video in videos['data']['videos']: + media_url = url_or_none(video.get('url')) + if not media_url: + continue + playlist = video.get('is_playlist') + if video.get('stream_type') == 'hls' and playlist is True: + formats.extend(self._extract_m3u8_formats( + media_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + continue + quality = int_or_none(video.get('quality')) + f = { + 'url': media_url, + 'ext': 'mp4', + 'height': quality + } + video_format = video.get('video_format') + if video_format == 'hls' and playlist is False: + format_id = 'hls' + f['protocol'] = 'm3u8_native' + elif video_format == 'mp4': + format_id = 'http' + + else: + continue + if quality: + format_id += '-%sp' % quality + f['format_id'] = format_id + formats.append(f) + self._sort_formats(formats) + + description = try_get( + info, lambda x: x['title']['description'], + compat_str) or info.get('description') + timestamp = unified_timestamp(info.get('created_at')) + creator = try_get( + info, lambda x: x['title']['producer']['name'], compat_str) + + duration = float_or_none( + try_get(info, lambda x: x['content']['duration_in_ms'], int), + scale=1000) + view_count = try_get(info, lambda x: x['content']['watch_count'], int) + + images = try_get( + info, lambda x: x['content']['images']['wide'], dict) or {} + thumbnails = [] + for image_id, image_url in images.items(): + if not isinstance(image_url, compat_str): + continue + if not image_url.startswith(('http', '//')): + image_url = 'https://%s' % image_url + t = parse_resolution(image_id) + t.update({ + 'id': image_id, + 'url': image_url + }) + thumbnails.append(t) + + release_year = try_get(info, lambda x: x['title']['released_at'], int) + + season_number = int_or_none(info.get('season_number')) + season_id = str_or_none(info.get('season_id')) + episode_number = int_or_none(info.get('episode_number')) + + tags = [] + for genre in try_get(info, lambda x: x['title']['genres'], list) or []: + if not isinstance(genre, dict): + continue + genre_name = genre.get('name') + if genre_name and isinstance(genre_name, compat_str): + tags.append(genre_name) + + subtitles = {} + for subtitle in try_get( + info, lambda x: x['content']['subtitles'], list) or []: + if not isinstance(subtitle, dict): + continue + lang = subtitle.get('language') + sub_url = url_or_none(subtitle.get('url')) + if not lang or not isinstance(lang, compat_str) or not sub_url: + continue + subtitles[self._SUBTITLE_LANGS.get(lang, lang)] = [{ + 'url': sub_url + }] + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'season_id': season_id, + 'season_number': season_number, + 'episode_number': episode_number, + 'release_year': release_year, + 'timestamp': timestamp, + 'creator': creator, + 'view_count': view_count, + 'duration': duration, + 'tags': tags, + 'subtitles': subtitles, + 'thumbnails': thumbnails, + 'formats': formats + } + + +class PuhuTVSerieIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?puhutv\.com/(?P<id>[^/?#&]+)-detay' + IE_NAME = 'puhutv:serie' + _TESTS = [{ + 'url': 'https://puhutv.com/deniz-yildizi-detay', + 'info_dict': { + 'title': 'Deniz Yıldızı', + 'id': 'deniz-yildizi', + }, + 'playlist_mincount': 205, + }, { + # a film detail page which is using same url with serie page 
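+        # (when such a detail page has no seasons, _real_extract below falls
+        # back to a PuhuTVIE url_result for the film instead of a playlist)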
+ 'url': 'https://puhutv.com/kaybedenler-kulubu-detay', + 'only_matching': True, + }] + + def _extract_entries(self, seasons): + for season in seasons: + season_id = season.get('id') + if not season_id: + continue + page = 1 + has_more = True + while has_more is True: + season = self._download_json( + 'https://galadriel.puhutv.com/seasons/%s' % season_id, + season_id, 'Downloading page %s' % page, query={ + 'page': page, + 'per': 40, + }) + episodes = season.get('episodes') + if isinstance(episodes, list): + for ep in episodes: + slug_path = str_or_none(ep.get('slugPath')) + if not slug_path: + continue + video_id = str_or_none(int_or_none(ep.get('id'))) + yield self.url_result( + 'https://puhutv.com/%s' % slug_path, + ie=PuhuTVIE.ie_key(), video_id=video_id, + video_title=ep.get('name') or ep.get('eventLabel')) + page += 1 + has_more = season.get('hasMore') + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + info = self._download_json( + urljoin(url, '/api/slug/%s-detay' % playlist_id), + playlist_id)['data'] + + seasons = info.get('seasons') + if seasons: + return self.playlist_result( + self._extract_entries(seasons), playlist_id, info.get('name')) + + # For films, these are using same url with series + video_id = info.get('slug') or info['assets'][0]['slug'] + return self.url_result( + 'https://puhutv.com/%s-izle' % video_id, + PuhuTVIE.ie_key(), video_id) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/puls4.py youtube-dl-2019.05.20/youtube_dl/extractor/puls4.py --- youtube-dl-2016.02.22/youtube_dl/extractor/puls4.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/puls4.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,88 +1,57 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals -from .common import InfoExtractor +from .prosiebensat1 import ProSiebenSat1BaseIE from ..utils import ( - ExtractorError, unified_strdate, - int_or_none, + parse_duration, + compat_str, ) -class Puls4IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?puls4\.com/video/[^/]+/play/(?P<id>[0-9]+)' +class Puls4IE(ProSiebenSat1BaseIE): + _VALID_URL = r'https?://(?:www\.)?puls4\.com/(?P<id>[^?#&]+)' _TESTS = [{ - 'url': 'http://www.puls4.com/video/pro-und-contra/play/2716816', - 'md5': '49f6a6629747eeec43cef6a46b5df81d', + 'url': 'http://www.puls4.com/2-minuten-2-millionen/staffel-3/videos/2min2miotalk/Tobias-Homberger-von-myclubs-im-2min2miotalk-118118', + 'md5': 'fd3c6b0903ac72c9d004f04bc6bb3e03', 'info_dict': { - 'id': '2716816', - 'ext': 'mp4', - 'title': 'Pro und Contra vom 23.02.2015', - 'description': 'md5:293e44634d9477a67122489994675db6', - 'duration': 2989, - 'upload_date': '20150224', + 'id': '118118', + 'ext': 'flv', + 'title': 'Tobias Homberger von myclubs im #2min2miotalk', + 'description': 'md5:f9def7c5e8745d6026d8885487d91955', + 'upload_date': '20160830', 'uploader': 'PULS_4', }, - 'skip': 'Only works from Germany', }, { - 'url': 'http://www.puls4.com/video/kult-spielfilme/play/1298106', - 'md5': '6a48316c8903ece8dab9b9a7bf7a59ec', - 'info_dict': { - 'id': '1298106', - 'ext': 'mp4', - 'title': 'Lucky Fritz', - }, - 'skip': 'Only works from Germany', + 'url': 'http://www.puls4.com/pro-und-contra/wer-wird-prasident/Ganze-Folgen/Wer-wird-Praesident.-Norbert-Hofer', + 'only_matching': True, + }, { + 'url': 'http://www.puls4.com/pro-und-contra/wer-wird-prasident/Ganze-Folgen/Wer-wird-Praesident-Analyse-des-Interviews-mit-Norbert-Hofer-416598', + 'only_matching': True, }] + _TOKEN = 'puls4' + _SALT = 
'01!kaNgaiNgah1Ie4AeSha' + _CLIENT_NAME = '' def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - error_message = self._html_search_regex( - r'<div class="message-error">(.+?)</div>', - webpage, 'error message', default=None) - if error_message: - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, error_message), expected=True) - - real_url = self._html_search_regex( - r'\"fsk-button\".+?href=\"([^"]+)', - webpage, 'fsk_button', default=None) - if real_url: - webpage = self._download_webpage(real_url, video_id) - - player = self._search_regex( - r'p4_video_player(?:_iframe)?\("video_\d+_container"\s*,(.+?)\);\s*\}', - webpage, 'player') - - player_json = self._parse_json( - '[%s]' % player, video_id, - transform_source=lambda s: s.replace('undefined,', '')) - - formats = None - result = None - - for v in player_json: - if isinstance(v, list) and not formats: - formats = [{ - 'url': f['url'], - 'format': 'hd' if f.get('hd') else 'sd', - 'width': int_or_none(f.get('size_x')), - 'height': int_or_none(f.get('size_y')), - 'tbr': int_or_none(f.get('bitrate')), - } for f in v] - self._sort_formats(formats) - elif isinstance(v, dict) and not result: - result = { - 'id': video_id, - 'title': v['videopartname'].strip(), - 'description': v.get('videotitle'), - 'duration': int_or_none(v.get('videoduration') or v.get('episodeduration')), - 'upload_date': unified_strdate(v.get('clipreleasetime')), - 'uploader': v.get('channel'), - } - - result['formats'] = formats - - return result + path = self._match_id(url) + content_path = self._download_json( + 'http://www.puls4.com/api/json-fe/page/' + path, path)['content'][0]['url'] + media = self._download_json( + 'http://www.puls4.com' + content_path, + content_path)['mediaCurrent'] + player_content = media['playerContent'] + info = self._extract_video_info(url, player_content['id']) + info.update({ + 'id': compat_str(media['objectId']), + 'title': player_content['title'], + 'description': media.get('description'), + 'thumbnail': media.get('previewLink'), + 'upload_date': unified_strdate(media.get('date')), + 'duration': parse_duration(player_content.get('duration')), + 'episode': player_content.get('episodePartName'), + 'show': media.get('channel'), + 'season_id': player_content.get('seasonId'), + 'uploader': player_content.get('sourceCompany'), + }) + return info diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/pyvideo.py youtube-dl-2019.05.20/youtube_dl/extractor/pyvideo.py --- youtube-dl-2016.02.22/youtube_dl/extractor/pyvideo.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/pyvideo.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,59 +1,72 @@ from __future__ import unicode_literals import re -import os from .common import InfoExtractor +from ..compat import compat_str +from ..utils import int_or_none class PyvideoIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?pyvideo\.org/video/(?P<id>\d+)/(.*)' + _VALID_URL = r'https?://(?:www\.)?pyvideo\.org/(?P<category>[^/]+)/(?P<id>[^/?#&.]+)' - _TESTS = [ - { - 'url': 'http://pyvideo.org/video/1737/become-a-logging-expert-in-30-minutes', - 'md5': 'de317418c8bc76b1fd8633e4f32acbc6', - 'info_dict': { - 'id': '24_4WWkSmNo', - 'ext': 'mp4', - 'title': 'Become a logging expert in 30 minutes', - 'description': 'md5:9665350d466c67fb5b1598de379021f7', - 'upload_date': '20130320', - 'uploader': 'NextDayVideo', - 'uploader_id': 'NextDayVideo', - }, - 'add_ie': ['Youtube'], + _TESTS = [{ + 'url': 
'http://pyvideo.org/pycon-us-2013/become-a-logging-expert-in-30-minutes.html', + 'info_dict': { + 'id': 'become-a-logging-expert-in-30-minutes', }, - { - 'url': 'http://pyvideo.org/video/2542/gloriajw-spotifywitherikbernhardsson182m4v', - 'md5': '5fe1c7e0a8aa5570330784c847ff6d12', - 'info_dict': { - 'id': '2542', - 'ext': 'm4v', - 'title': 'Gloriajw-SpotifyWithErikBernhardsson182', - }, + 'playlist_count': 2, + }, { + 'url': 'http://pyvideo.org/pygotham-2012/gloriajw-spotifywitherikbernhardsson182m4v.html', + 'md5': '5fe1c7e0a8aa5570330784c847ff6d12', + 'info_dict': { + 'id': '2542', + 'ext': 'm4v', + 'title': 'Gloriajw-SpotifyWithErikBernhardsson182.m4v', }, - ] + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) + category = mobj.group('category') video_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) + entries = [] - m_youtube = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', webpage) - if m_youtube is not None: - return self.url_result(m_youtube.group(1), 'Youtube') - - title = self._html_search_regex( - r'<div class="section">\s*<h3(?:\s+class="[^"]*"[^>]*)?>([^>]+?)</h3>', - webpage, 'title', flags=re.DOTALL) - video_url = self._search_regex( - [r'<source src="(.*?)"', r'<dt>Download</dt>.*?<a href="(.+?)"'], - webpage, 'video url', flags=re.DOTALL) - - return { - 'id': video_id, - 'title': os.path.splitext(title)[0], - 'url': video_url, - } + data = self._download_json( + 'https://raw.githubusercontent.com/pyvideo/data/master/%s/videos/%s.json' + % (category, video_id), video_id, fatal=False) + + if data: + for video in data['videos']: + video_url = video.get('url') + if video_url: + if video.get('type') == 'youtube': + entries.append(self.url_result(video_url, 'Youtube')) + else: + entries.append({ + 'id': compat_str(data.get('id') or video_id), + 'url': video_url, + 'title': data['title'], + 'description': data.get('description') or data.get('summary'), + 'thumbnail': data.get('thumbnail_url'), + 'duration': int_or_none(data.get('duration')), + }) + else: + webpage = self._download_webpage(url, video_id) + title = self._og_search_title(webpage) + media_urls = self._search_regex( + r'(?s)Media URL:(.+?)</li>', webpage, 'media urls') + for m in re.finditer( + r'<a[^>]+href=(["\'])(?P<url>http.+?)\1', media_urls): + media_url = m.group('url') + if re.match(r'https?://www\.youtube\.com/watch\?v=.*', media_url): + entries.append(self.url_result(media_url, 'Youtube')) + else: + entries.append({ + 'id': video_id, + 'url': media_url, + 'title': title, + }) + + return self.playlist_result(entries, video_id) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/qqmusic.py youtube-dl-2019.05.20/youtube_dl/extractor/qqmusic.py --- youtube-dl-2016.02.22/youtube_dl/extractor/qqmusic.py 2016-01-09 00:15:59.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/qqmusic.py 2019-06-07 18:49:07.000000000 +0000 @@ -2,38 +2,37 @@ from __future__ import unicode_literals import random -import time import re +import time from .common import InfoExtractor from ..utils import ( - sanitized_Request, - strip_jsonp, - unescapeHTML, clean_html, ExtractorError, + strip_jsonp, + unescapeHTML, ) class QQMusicIE(InfoExtractor): IE_NAME = 'qqmusic' IE_DESC = 'QQ音乐' - _VALID_URL = r'http://y.qq.com/#type=song&mid=(?P<id>[0-9A-Za-z]+)' + _VALID_URL = r'https?://y\.qq\.com/n/yqq/song/(?P<id>[0-9A-Za-z]+)\.html' _TESTS = [{ - 'url': 'http://y.qq.com/#type=song&mid=004295Et37taLD', - 'md5': '9ce1c1c8445f561506d2e3cfb0255705', + 'url': 
'https://y.qq.com/n/yqq/song/004295Et37taLD.html', + 'md5': '5f1e6cea39e182857da7ffc5ef5e6bb8', 'info_dict': { 'id': '004295Et37taLD', 'ext': 'mp3', 'title': '可惜没如果', 'release_date': '20141227', 'creator': '林俊杰', - 'description': 'md5:d327722d0361576fde558f1ac68a7065', - 'thumbnail': 're:^https?://.*\.jpg$', + 'description': 'md5:d85afb3051952ecc50a1ee8a286d1eac', + 'thumbnail': r're:^https?://.*\.jpg$', } }, { 'note': 'There is no mp3-320 version of this song.', - 'url': 'http://y.qq.com/#type=song&mid=004MsGEo3DdNxV', + 'url': 'https://y.qq.com/n/yqq/song/004MsGEo3DdNxV.html', 'md5': 'fa3926f0c585cda0af8fa4f796482e3e', 'info_dict': { 'id': '004MsGEo3DdNxV', @@ -42,19 +41,19 @@ 'release_date': '20050626', 'creator': '李季美', 'description': 'md5:46857d5ed62bc4ba84607a805dccf437', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', } }, { 'note': 'lyrics not in .lrc format', - 'url': 'http://y.qq.com/#type=song&mid=001JyApY11tIp6', + 'url': 'https://y.qq.com/n/yqq/song/001JyApY11tIp6.html', 'info_dict': { 'id': '001JyApY11tIp6', 'ext': 'mp3', 'title': 'Shadows Over Transylvania', 'release_date': '19970225', 'creator': 'Dark Funeral', - 'description': 'md5:ed14d5bd7ecec19609108052c25b2c11', - 'thumbnail': 're:^https?://.*\.jpg$', + 'description': 'md5:c9b20210587cbcd6836a1c597bab4525', + 'thumbnail': r're:^https?://.*\.jpg$', }, 'params': { 'skip_download': True, @@ -105,7 +104,7 @@ [r'albummid:\'([0-9a-zA-Z]+)\'', r'"albummid":"([0-9a-zA-Z]+)"'], detail_info_page, 'album mid', default=None) if albummid: - thumbnail_url = "http://i.gtimg.cn/music/photo/mid_album_500/%s/%s/%s.jpg" \ + thumbnail_url = 'http://i.gtimg.cn/music/photo/mid_album_500/%s/%s/%s.jpg' \ % (albummid[-2:-1], albummid[-1], albummid) guid = self.m_r_get_ruin() @@ -156,15 +155,39 @@ def qq_static_url(category, mid): return 'http://y.qq.com/y/static/%s/%s/%s/%s.html' % (category, mid[-2], mid[-1], mid) - @classmethod - def get_entries_from_page(cls, page): + def get_singer_all_songs(self, singmid, num): + return self._download_webpage( + r'https://c.y.qq.com/v8/fcg-bin/fcg_v8_singer_track_cp.fcg', singmid, + query={ + 'format': 'json', + 'inCharset': 'utf8', + 'outCharset': 'utf-8', + 'platform': 'yqq', + 'needNewCode': 0, + 'singermid': singmid, + 'order': 'listen', + 'begin': 0, + 'num': num, + 'songstatus': 1, + }) + + def get_entries_from_page(self, singmid): entries = [] - for item in re.findall(r'class="data"[^<>]*>([^<>]+)</', page): - song_mid = unescapeHTML(item).split('|')[-5] - entries.append(cls.url_result( - 'http://y.qq.com/#type=song&mid=' + song_mid, 'QQMusic', - song_mid)) + default_num = 1 + json_text = self.get_singer_all_songs(singmid, default_num) + json_obj_all_songs = self._parse_json(json_text, singmid) + + if json_obj_all_songs['code'] == 0: + total = json_obj_all_songs['data']['total'] + json_text = self.get_singer_all_songs(singmid, total) + json_obj_all_songs = self._parse_json(json_text, singmid) + + for item in json_obj_all_songs['data']['list']: + if item['musicData'].get('songmid') is not None: + songmid = item['musicData']['songmid'] + entries.append(self.url_result( + r'https://y.qq.com/n/yqq/song/%s.html' % songmid, 'QQMusic', songmid)) return entries @@ -172,42 +195,32 @@ class QQMusicSingerIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:singer' IE_DESC = 'QQ音乐 - 歌手' - _VALID_URL = r'http://y.qq.com/#type=singer&mid=(?P<id>[0-9A-Za-z]+)' + _VALID_URL = r'https?://y\.qq\.com/n/yqq/singer/(?P<id>[0-9A-Za-z]+)\.html' _TEST = { - 'url': 
'http://y.qq.com/#type=singer&mid=001BLpXF2DyJe2',
+        'url': 'https://y.qq.com/n/yqq/singer/001BLpXF2DyJe2.html',
         'info_dict': {
             'id': '001BLpXF2DyJe2',
             'title': '林俊杰',
             'description': 'md5:870ec08f7d8547c29c93010899103751',
         },
-        'playlist_count': 12,
+        'playlist_mincount': 12,
     }
 
     def _real_extract(self, url):
         mid = self._match_id(url)
 
-        singer_page = self._download_webpage(
-            self.qq_static_url('singer', mid), mid, 'Download singer page')
-
-        entries = self.get_entries_from_page(singer_page)
-
+        entries = self.get_entries_from_page(mid)
+        singer_page = self._download_webpage(url, mid, 'Download singer page')
         singer_name = self._html_search_regex(
-            r"singername\s*:\s*'([^']+)'", singer_page, 'singer name',
-            default=None)
-
-        singer_id = self._html_search_regex(
-            r"singerid\s*:\s*'([0-9]+)'", singer_page, 'singer id',
-            default=None)
-
+            r"singername\s*:\s*'(.*?)'", singer_page, 'singer name', default=None)
         singer_desc = None
 
-        if singer_id:
-            req = sanitized_Request(
-                'http://s.plcloud.music.qq.com/fcgi-bin/fcg_get_singer_desc.fcg?utf8=1&outCharset=utf-8&format=xml&singerid=%s' % singer_id)
-            req.add_header(
-                'Referer', 'http://s.plcloud.music.qq.com/xhr_proxy_utf8.html')
+        if mid:
             singer_desc_page = self._download_xml(
-                req, mid, 'Donwload singer description XML')
+                'http://s.plcloud.music.qq.com/fcgi-bin/fcg_get_singer_desc.fcg', mid,
+                'Download singer description XML',
+                query={'utf8': 1, 'outCharset': 'utf-8', 'format': 'xml', 'singermid': mid},
+                headers={'Referer': 'https://y.qq.com/n/yqq/singer/'})
 
             singer_desc = singer_desc_page.find('./data/info/desc').text
@@ -217,10 +230,10 @@
 class QQMusicAlbumIE(QQPlaylistBaseIE):
     IE_NAME = 'qqmusic:album'
     IE_DESC = 'QQ音乐 - 专辑'
-    _VALID_URL = r'http://y.qq.com/#type=album&mid=(?P<id>[0-9A-Za-z]+)'
+    _VALID_URL = r'https?://y\.qq\.com/n/yqq/album/(?P<id>[0-9A-Za-z]+)\.html'
 
     _TESTS = [{
-        'url': 'http://y.qq.com/#type=album&mid=000gXCTb2AhRR1',
+        'url': 'https://y.qq.com/n/yqq/album/000gXCTb2AhRR1.html',
         'info_dict': {
             'id': '000gXCTb2AhRR1',
             'title': '我们都是这样长大的',
@@ -228,7 +241,7 @@
         },
         'playlist_count': 4,
     }, {
-        'url': 'http://y.qq.com/#type=album&mid=002Y5a3b3AlCu3',
+        'url': 'https://y.qq.com/n/yqq/album/002Y5a3b3AlCu3.html',
         'info_dict': {
             'id': '002Y5a3b3AlCu3',
             'title': '그리고...',
@@ -246,7 +259,7 @@
 
         entries = [
             self.url_result(
-                'http://y.qq.com/#type=song&mid=' + song['songmid'], 'QQMusic', song['songmid']
+                'https://y.qq.com/n/yqq/song/' + song['songmid'] + '.html', 'QQMusic', song['songmid']
             ) for song in album['list']
         ]
         album_name = album.get('name')
@@ -260,31 +273,30 @@
 class QQMusicToplistIE(QQPlaylistBaseIE):
     IE_NAME = 'qqmusic:toplist'
     IE_DESC = 'QQ音乐 - 排行榜'
-    _VALID_URL = r'http://y\.qq\.com/#type=toplist&p=(?P<id>(top|global)_[0-9]+)'
+    _VALID_URL = r'https?://y\.qq\.com/n/yqq/toplist/(?P<id>[0-9]+)\.html'
 
     _TESTS = [{
-        'url': 'http://y.qq.com/#type=toplist&p=global_123',
+        'url': 'https://y.qq.com/n/yqq/toplist/123.html',
        'info_dict': {
-            'id': 'global_123',
+            'id': '123',
             'title': '美国iTunes榜',
+            'description': 'md5:89db2335fdbb10678dee2d43fe9aba08',
         },
-        'playlist_count': 10,
+        'playlist_count': 100,
     }, {
-        'url': 'http://y.qq.com/#type=toplist&p=top_3',
+        'url': 'https://y.qq.com/n/yqq/toplist/3.html',
         'info_dict': {
-            'id': 'top_3',
+            'id': '3',
             'title': '巅峰榜·欧美',
-            'description': 'QQ音乐巅峰榜·欧美根据用户收听行为自动生成,集结当下最流行的欧美新歌!:更新时间:每周四22点|统'
-                           '计周期:一周(上周四至本周三)|统计对象:三个月内发行的欧美歌曲|统计数量:100首|统计算法:根据'
-                           '歌曲在一周内的有效播放次数,由高到低取前100名(同一歌手最多允许5首歌曲同时上榜)|有效播放次数:'
-                           '登录用户完整播放一首歌曲,记为一次有效播放;同一用户收听同一首歌曲,每天记录为1次有效播放'
+            'description': 
'md5:5a600d42c01696b26b71f8c4d43407da', }, 'playlist_count': 100, }, { - 'url': 'http://y.qq.com/#type=toplist&p=global_106', + 'url': 'https://y.qq.com/n/yqq/toplist/106.html', 'info_dict': { - 'id': 'global_106', + 'id': '106', 'title': '韩国Mnet榜', + 'description': 'md5:cb84b325215e1d21708c615cac82a6e7', }, 'playlist_count': 50, }] @@ -292,18 +304,15 @@ def _real_extract(self, url): list_id = self._match_id(url) - list_type, num_id = list_id.split("_") - toplist_json = self._download_json( - 'http://i.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg?type=%s&topid=%s&format=json' - % (list_type, num_id), - list_id, 'Download toplist page') - - entries = [ - self.url_result( - 'http://y.qq.com/#type=song&mid=' + song['data']['songmid'], 'QQMusic', song['data']['songmid'] - ) for song in toplist_json['songlist'] - ] + 'http://i.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg', list_id, + note='Download toplist page', + query={'type': 'toplist', 'topid': list_id, 'format': 'json'}) + + entries = [self.url_result( + 'https://y.qq.com/n/yqq/song/' + song['data']['songmid'] + '.html', 'QQMusic', + song['data']['songmid']) + for song in toplist_json['songlist']] topinfo = toplist_json.get('topinfo', {}) list_name = topinfo.get('ListName') @@ -314,10 +323,10 @@ class QQMusicPlaylistIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:playlist' IE_DESC = 'QQ音乐 - 歌单' - _VALID_URL = r'http://y\.qq\.com/#type=taoge&id=(?P<id>[0-9]+)' + _VALID_URL = r'https?://y\.qq\.com/n/yqq/playlist/(?P<id>[0-9]+)\.html' _TESTS = [{ - 'url': 'http://y.qq.com/#type=taoge&id=3462654915', + 'url': 'http://y.qq.com/n/yqq/playlist/3462654915.html', 'info_dict': { 'id': '3462654915', 'title': '韩国5月新歌精选下旬', @@ -326,7 +335,7 @@ 'playlist_count': 40, 'skip': 'playlist gone', }, { - 'url': 'http://y.qq.com/#type=taoge&id=1374105607', + 'url': 'https://y.qq.com/n/yqq/playlist/1374105607.html', 'info_dict': { 'id': '1374105607', 'title': '易入人心的华语民谣', @@ -339,8 +348,9 @@ list_id = self._match_id(url) list_json = self._download_json( - 'http://i.y.qq.com/qzone-music/fcg-bin/fcg_ucc_getcdinfo_byids_cp.fcg?type=1&json=1&utf8=1&onlysong=0&disstid=%s' - % list_id, list_id, 'Download list page', + 'http://i.y.qq.com/qzone-music/fcg-bin/fcg_ucc_getcdinfo_byids_cp.fcg', + list_id, 'Download list page', + query={'type': 1, 'json': 1, 'utf8': 1, 'onlysong': 0, 'disstid': list_id}, transform_source=strip_jsonp) if not len(list_json.get('cdlist', [])): if list_json.get('code'): @@ -350,11 +360,9 @@ raise ExtractorError('Unable to get playlist info') cdlist = list_json['cdlist'][0] - entries = [ - self.url_result( - 'http://y.qq.com/#type=song&mid=' + song['songmid'], 'QQMusic', song['songmid'] - ) for song in cdlist['songlist'] - ] + entries = [self.url_result( + 'https://y.qq.com/n/yqq/song/' + song['songmid'] + '.html', 'QQMusic', song['songmid']) + for song in cdlist['songlist']] list_name = cdlist.get('dissname') list_description = clean_html(unescapeHTML(cdlist.get('desc'))) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/quickvid.py youtube-dl-2019.05.20/youtube_dl/extractor/quickvid.py --- youtube-dl-2016.02.22/youtube_dl/extractor/quickvid.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/quickvid.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,54 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_urlparse, -) -from ..utils import ( - determine_ext, - int_or_none, -) - - -class QuickVidIE(InfoExtractor): - _VALID_URL = 
r'https?://(www\.)?quickvid\.org/watch\.php\?v=(?P<id>[a-zA-Z_0-9-]+)' - _TEST = { - 'url': 'http://quickvid.org/watch.php?v=sUQT3RCG8dx', - 'md5': 'c0c72dd473f260c06c808a05d19acdc5', - 'info_dict': { - 'id': 'sUQT3RCG8dx', - 'ext': 'mp4', - 'title': 'Nick Offerman\'s Summer Reading Recap', - 'thumbnail': 're:^https?://.*\.(?:png|jpg|gif)$', - 'view_count': int, - }, - 'skip': 'Not accessible from Travis CI server', - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - title = self._html_search_regex(r'<h2>(.*?)</h2>', webpage, 'title') - view_count = int_or_none(self._html_search_regex( - r'(?s)<div id="views">(.*?)</div>', - webpage, 'view count', fatal=False)) - video_code = self._search_regex( - r'(?s)<video id="video"[^>]*>(.*?)</video>', webpage, 'video code') - formats = [ - { - 'url': compat_urlparse.urljoin(url, src), - 'format_id': determine_ext(src, None), - } for src in re.findall('<source\s+src="([^"]+)"', video_code) - ] - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'thumbnail': self._og_search_thumbnail(webpage), - 'view_count': view_count, - } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/r7.py youtube-dl-2019.05.20/youtube_dl/extractor/r7.py --- youtube-dl-2016.02.22/youtube_dl/extractor/r7.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/r7.py 2019-06-07 18:49:07.000000000 +0000 @@ -2,22 +2,19 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import ( - js_to_json, - unescapeHTML, - int_or_none, -) +from ..utils import int_or_none class R7IE(InfoExtractor): - _VALID_URL = r'''(?x)https?:// + _VALID_URL = r'''(?x) + https?:// (?: (?:[a-zA-Z]+)\.r7\.com(?:/[^/]+)+/idmedia/| noticias\.r7\.com(?:/[^/]+)+/[^/]+-| player\.r7\.com/video/i/ ) (?P<id>[\da-f]{24}) - ''' + ''' _TESTS = [{ 'url': 'http://videos.r7.com/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-/idmedia/54e7050b0cf2ff57e0279389.html', 'md5': '403c4e393617e8e8ddc748978ee8efde', @@ -25,7 +22,8 @@ 'id': '54e7050b0cf2ff57e0279389', 'ext': 'mp4', 'title': 'Policiais humilham suspeito à beira da morte: "Morre com dignidade"', - 'thumbnail': 're:^https?://.*\.jpg$', + 'description': 'md5:01812008664be76a6479aa58ec865b72', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 98, 'like_count': int, 'view_count': int, @@ -44,45 +42,71 @@ def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - 'http://player.r7.com/video/i/%s' % video_id, video_id) + video = self._download_json( + 'http://player-api.r7.com/video/i/%s' % video_id, video_id) - item = self._parse_json(js_to_json(self._search_regex( - r'(?s)var\s+item\s*=\s*({.+?});', webpage, 'player')), video_id) - - title = unescapeHTML(item['title']) - thumbnail = item.get('init', {}).get('thumbUri') - duration = None - - statistics = item.get('statistics', {}) - like_count = int_or_none(statistics.get('likes')) - view_count = int_or_none(statistics.get('views')) + title = video['title'] formats = [] - for format_key, format_dict in item['playlist'][0].items(): - src = format_dict.get('src') - if not src: - continue - format_id = format_dict.get('format') or format_key - if duration is None: - duration = format_dict.get('duration') - if '.f4m' in src: - formats.extend(self._extract_f4m_formats(src, video_id, preference=-1)) - elif src.endswith('.m3u8'): - formats.extend(self._extract_m3u8_formats(src, 
video_id, 'mp4', preference=-2)) - else: - formats.append({ - 'url': src, - 'format_id': format_id, - }) + media_url_hls = video.get('media_url_hls') + if media_url_hls: + formats.extend(self._extract_m3u8_formats( + media_url_hls, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + media_url = video.get('media_url') + if media_url: + f = { + 'url': media_url, + 'format_id': 'http', + } + # m3u8 format always matches the http format, let's copy metadata from + # one to another + m3u8_formats = list(filter( + lambda f: f.get('vcodec') != 'none', formats)) + if len(m3u8_formats) == 1: + f_copy = m3u8_formats[0].copy() + f_copy.update(f) + f_copy['protocol'] = 'http' + f = f_copy + formats.append(f) self._sort_formats(formats) + description = video.get('description') + thumbnail = video.get('thumb') + duration = int_or_none(video.get('media_duration')) + like_count = int_or_none(video.get('likes')) + view_count = int_or_none(video.get('views')) + return { 'id': video_id, 'title': title, + 'description': description, 'thumbnail': thumbnail, 'duration': duration, 'like_count': like_count, 'view_count': view_count, 'formats': formats, } + + +class R7ArticleIE(InfoExtractor): + _VALID_URL = r'https?://(?:[a-zA-Z]+)\.r7\.com/(?:[^/]+/)+[^/?#&]+-(?P<id>\d+)' + _TEST = { + 'url': 'http://tv.r7.com/record-play/balanco-geral/videos/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-16102015', + 'only_matching': True, + } + + @classmethod + def suitable(cls, url): + return False if R7IE.suitable(url) else super(R7ArticleIE, cls).suitable(url) + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + video_id = self._search_regex( + r'<div[^>]+(?:id=["\']player-|class=["\']embed["\'][^>]+id=["\'])([\da-f]{24})', + webpage, 'video id') + + return self.url_result('http://player.r7.com/video/i/%s' % video_id, R7IE.ie_key()) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/radiobremen.py youtube-dl-2019.05.20/youtube_dl/extractor/radiobremen.py --- youtube-dl-2016.02.22/youtube_dl/extractor/radiobremen.py 2016-02-14 22:26:17.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/radiobremen.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals @@ -13,15 +13,15 @@ IE_NAME = 'radiobremen' _TEST = { - 'url': 'http://www.radiobremen.de/mediathek/index.html?id=114720', + 'url': 'http://www.radiobremen.de/mediathek/?id=141876', 'info_dict': { - 'id': '114720', + 'id': '141876', 'ext': 'mp4', - 'duration': 1685, + 'duration': 178, 'width': 512, - 'title': 'buten un binnen vom 22. Dezember', - 'thumbnail': 're:https?://.*\.jpg$', - 'description': 'Unter anderem mit diesen Themen: 45 Flüchtlinge sind in Worpswede angekommen +++ Freies Internet für alle: Bremer arbeiten an einem flächendeckenden W-Lan-Netzwerk +++ Aktivisten kämpfen für das Unibad +++ So war das Wetter 2014 +++', + 'title': 'Druck auf Patrick Öztürk', + 'thumbnail': r're:https?://.*\.jpg$', + 'description': 'Gegen den SPD-Bürgerschaftsabgeordneten Patrick Öztürk wird wegen Beihilfe zum gewerbsmäßigen Betrug ermittelt. 
Am Donnerstagabend sollte er dem Vorstand des SPD-Unterbezirks Bremerhaven dazu Rede und Antwort stehen.', }, } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/radiocanada.py youtube-dl-2019.05.20/youtube_dl/extractor/radiocanada.py --- youtube-dl-2016.02.22/youtube_dl/extractor/radiocanada.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/radiocanada.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,171 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + unified_strdate, +) + + +class RadioCanadaIE(InfoExtractor): + IE_NAME = 'radiocanada' + _VALID_URL = r'(?:radiocanada:|https?://ici\.radio-canada\.ca/widgets/mediaconsole/)(?P<app_code>[^:/]+)[:/](?P<id>[0-9]+)' + _TESTS = [ + { + 'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7184272', + 'info_dict': { + 'id': '7184272', + 'ext': 'mp4', + 'title': 'Le parcours du tireur capté sur vidéo', + 'description': 'Images des caméras de surveillance fournies par la GRC montrant le parcours du tireur d\'Ottawa', + 'upload_date': '20141023', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + }, + { + # empty Title + 'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7754998/', + 'info_dict': { + 'id': '7754998', + 'ext': 'mp4', + 'title': 'letelejournal22h', + 'description': 'INTEGRALE WEB 22H-TJ', + 'upload_date': '20170720', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + { + # with protectionType but not actually DRM protected + 'url': 'radiocanada:toutv:140872', + 'info_dict': { + 'id': '140872', + 'title': 'Épisode 1', + 'series': 'District 31', + }, + 'only_matching': True, + } + ] + _GEO_COUNTRIES = ['CA'] + _access_token = None + _claims = None + + def _call_api(self, path, video_id=None, app_code=None, query=None): + if not query: + query = {} + query.update({ + 'client_key': '773aea60-0e80-41bb-9c7f-e6d7c3ad17fb', + 'output': 'json', + }) + if video_id: + query.update({ + 'appCode': app_code, + 'idMedia': video_id, + }) + if self._access_token: + query['access_token'] = self._access_token + try: + return self._download_json( + 'https://services.radio-canada.ca/media/' + path, video_id, query=query) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 422): + data = self._parse_json(e.cause.read().decode(), None) + error = data.get('error_description') or data['errorMessage']['text'] + raise ExtractorError(error, expected=True) + raise + + def _extract_info(self, app_code, video_id): + metas = self._call_api('meta/v1/index.ashx', video_id, app_code)['Metas'] + + def get_meta(name): + for meta in metas: + if meta.get('name') == name: + text = meta.get('text') + if text: + return text + + # protectionType does not necessarily mean the video is DRM protected (see + # https://github.com/ytdl-org/youtube-dl/pull/18609). 
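+        # A warning is emitted instead of aborting, so extraction still
+        # proceeds for such videos.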
+        if get_meta('protectionType'):
+            self.report_warning('This video is probably DRM protected.')
+
+        query = {
+            'connectionType': 'hd',
+            'deviceType': 'ipad',
+            'multibitrate': 'true',
+        }
+        if self._claims:
+            query['claims'] = self._claims
+        v_data = self._call_api('validation/v2/', video_id, app_code, query)
+        v_url = v_data.get('url')
+        if not v_url:
+            error = v_data['message']
+            if error == "Le contenu sélectionné n'est pas disponible dans votre pays":
+                raise self.raise_geo_restricted(error, self._GEO_COUNTRIES)
+            if error == 'Le contenu sélectionné est disponible seulement en premium':
+                self.raise_login_required(error)
+            raise ExtractorError(
+                '%s said: %s' % (self.IE_NAME, error), expected=True)
+        formats = self._extract_m3u8_formats(v_url, video_id, 'mp4')
+        self._sort_formats(formats)
+
+        subtitles = {}
+        closed_caption_url = get_meta('closedCaption') or get_meta('closedCaptionHTML5')
+        if closed_caption_url:
+            subtitles['fr'] = [{
+                'url': closed_caption_url,
+                'ext': determine_ext(closed_caption_url, 'vtt'),
+            }]
+
+        return {
+            'id': video_id,
+            'title': get_meta('Title') or get_meta('AV-nomEmission'),
+            'description': get_meta('Description') or get_meta('ShortDescription'),
+            'thumbnail': get_meta('imageHR') or get_meta('imageMR') or get_meta('imageBR'),
+            'duration': int_or_none(get_meta('length')),
+            'series': get_meta('Emission'),
+            'season_number': int_or_none(get_meta('SrcSaison')),
+            'episode_number': int_or_none(get_meta('SrcEpisode')),
+            'upload_date': unified_strdate(get_meta('Date')),
+            'subtitles': subtitles,
+            'formats': formats,
+        }
+
+    def _real_extract(self, url):
+        return self._extract_info(*re.match(self._VALID_URL, url).groups())
+
+
+class RadioCanadaAudioVideoIE(InfoExtractor):
+    IE_NAME = 'radiocanada:audiovideo'
+    _VALID_URL = r'https?://ici\.radio-canada\.ca/([^/]+/)*media-(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://ici.radio-canada.ca/audio-video/media-7527184/barack-obama-au-vietnam',
+        'info_dict': {
+            'id': '7527184',
+            'ext': 'mp4',
+            'title': 'Barack Obama au Vietnam',
+            'description': 'Les États-Unis lèvent l\'embargo sur la vente d\'armes qui datait de la guerre du Vietnam',
+            'upload_date': '20160523',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://ici.radio-canada.ca/info/videos/media-7527184/barack-obama-au-vietnam',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        return self.url_result('radiocanada:medianet:%s' % self._match_id(url))
diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/radiode.py youtube-dl-2019.05.20/youtube_dl/extractor/radiode.py
--- youtube-dl-2016.02.22/youtube_dl/extractor/radiode.py	2015-12-30 19:30:33.000000000 +0000
+++ youtube-dl-2019.05.20/youtube_dl/extractor/radiode.py	2019-06-07 18:49:07.000000000 +0000
@@ -13,7 +13,7 @@
             'ext': 'mp3',
             'title': 're:^NDR 2 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
             'description': 'md5:591c49c702db1a33751625ebfb67f273',
-            'thumbnail': 're:^https?://.*\.png',
+            'thumbnail': r're:^https?://.*\.png',
             'is_live': True,
         },
         'params': {
diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/radiojavan.py youtube-dl-2019.05.20/youtube_dl/extractor/radiojavan.py
--- youtube-dl-2016.02.22/youtube_dl/extractor/radiojavan.py	2015-12-30 19:30:33.000000000 +0000
+++ youtube-dl-2019.05.20/youtube_dl/extractor/radiojavan.py	2019-06-07 18:49:07.000000000 +0000
@@ -3,9 +3,12 @@
 import re
 
 from .common import InfoExtractor
-from ..utils import(
-    unified_strdate,
+from ..utils import (
+    parse_resolution,
     str_to_int,
+    unified_strdate,
+    
urlencode_postdata, + urljoin, ) @@ -18,7 +21,7 @@ 'id': 'chaartaar-ashoobam', 'ext': 'mp4', 'title': 'Chaartaar - Ashoobam', - 'thumbnail': 're:^https?://.*\.jpe?g$', + 'thumbnail': r're:^https?://.*\.jpe?g$', 'upload_date': '20150215', 'view_count': int, 'like_count': int, @@ -29,13 +32,26 @@ def _real_extract(self, url): video_id = self._match_id(url) + download_host = self._download_json( + 'https://www.radiojavan.com/videos/video_host', video_id, + data=urlencode_postdata({'id': video_id}), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': url, + }).get('host', 'https://host1.rjmusicmedia.com') + webpage = self._download_webpage(url, video_id) - formats = [{ - 'url': 'https://media.rdjavan.com/media/music_video/%s' % video_path, - 'format_id': '%sp' % height, - 'height': int(height), - } for height, video_path in re.findall(r"RJ\.video(\d+)p\s*=\s*'/?([^']+)'", webpage)] + formats = [] + for format_id, _, video_path in re.findall( + r'RJ\.video(?P<format_id>\d+[pPkK])\s*=\s*(["\'])(?P<url>(?:(?!\2).)+)\2', + webpage): + f = parse_resolution(format_id) + f.update({ + 'url': urljoin(download_host, video_path), + 'format_id': format_id, + }) + formats.append(f) self._sort_formats(formats) title = self._og_search_title(webpage) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/rai.py youtube-dl-2019.05.20/youtube_dl/extractor/rai.py --- youtube-dl-2016.02.22/youtube_dl/extractor/rai.py 2015-12-31 15:50:46.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/rai.py 2019-06-07 18:49:07.000000000 +0000 @@ -4,204 +4,499 @@ from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, compat_urlparse, + compat_str, ) from ..utils import ( ExtractorError, determine_ext, + find_xpath_attr, + fix_xml_ampersands, + GeoRestrictedError, + int_or_none, parse_duration, + strip_or_none, + try_get, + unescapeHTML, unified_strdate, - int_or_none, + unified_timestamp, + update_url_query, + urljoin, xpath_text, ) -class RaiTVIE(InfoExtractor): - _VALID_URL = r'http://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+media/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' - _TESTS = [ - { - 'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html', - 'md5': '96382709b61dd64a6b88e0f791e6df4c', - 'info_dict': { - 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', - 'ext': 'flv', - 'title': 'Report del 07/04/2014', - 'description': 'md5:f27c544694cacb46a078db84ec35d2d9', - 'upload_date': '20140407', - 'duration': 6160, - } +class RaiBaseIE(InfoExtractor): + _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' + _GEO_COUNTRIES = ['IT'] + _GEO_BYPASS = False + + def _extract_relinker_info(self, relinker_url, video_id): + if not re.match(r'https?://', relinker_url): + return {'formats': [{'url': relinker_url}]} + + formats = [] + geoprotection = None + is_live = None + duration = None + + for platform in ('mon', 'flash', 'native'): + relinker = self._download_xml( + relinker_url, video_id, + note='Downloading XML metadata for platform %s' % platform, + transform_source=fix_xml_ampersands, + query={'output': 45, 'pl': platform}, + headers=self.geo_verification_headers()) + + if not geoprotection: + geoprotection = xpath_text( + relinker, './geoprotection', default=None) == 'Y' + + if not is_live: + is_live = xpath_text( + relinker, './is_live', default=None) == 'Y' + if not duration: + duration = parse_duration(xpath_text( + relinker, './duration', 
default=None)) + + url_elem = find_xpath_attr(relinker, './url', 'type', 'content') + if url_elem is None: + continue + + media_url = url_elem.text + + # This does not imply geo restriction (e.g. + # http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html) + if media_url == 'http://download.rai.it/video_no_available.mp4': + continue + + ext = determine_ext(media_url) + if (ext == 'm3u8' and platform != 'mon') or (ext == 'f4m' and platform != 'flash'): + continue + + if ext == 'm3u8' or 'format=m3u8' in media_url or platform == 'mon': + formats.extend(self._extract_m3u8_formats( + media_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif ext == 'f4m' or platform == 'flash': + manifest_url = update_url_query( + media_url.replace('manifest#live_hds.f4m', 'manifest.f4m'), + {'hdcore': '3.7.0', 'plugin': 'aasp-3.7.0.39.44'}) + formats.extend(self._extract_f4m_formats( + manifest_url, video_id, f4m_id='hds', fatal=False)) + else: + bitrate = int_or_none(xpath_text(relinker, 'bitrate')) + formats.append({ + 'url': media_url, + 'tbr': bitrate if bitrate > 0 else None, + 'format_id': 'http-%d' % bitrate if bitrate > 0 else 'http', + }) + + if not formats and geoprotection is True: + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + + return dict((k, v) for k, v in { + 'is_live': is_live, + 'duration': duration, + 'formats': formats, + }.items() if v is not None) + + @staticmethod + def _extract_subtitles(url, subtitle_url): + subtitles = {} + if subtitle_url and isinstance(subtitle_url, compat_str): + subtitle_url = urljoin(url, subtitle_url) + STL_EXT = '.stl' + SRT_EXT = '.srt' + subtitles['it'] = [{ + 'ext': 'stl', + 'url': subtitle_url, + }] + if subtitle_url.endswith(STL_EXT): + srt_url = subtitle_url[:-len(STL_EXT)] + SRT_EXT + subtitles['it'].append({ + 'ext': 'srt', + 'url': srt_url, + }) + return subtitles + + +class RaiPlayIE(RaiBaseIE): + _VALID_URL = r'(?P<url>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>%s)\.html)' % RaiBaseIE._UUID_RE + _TESTS = [{ + 'url': 'http://www.raiplay.it/video/2016/10/La-Casa-Bianca-e06118bb-59a9-4636-b914-498e4cfd2c66.html?source=twitter', + 'md5': '340aa3b7afb54bfd14a8c11786450d76', + 'info_dict': { + 'id': 'e06118bb-59a9-4636-b914-498e4cfd2c66', + 'ext': 'mp4', + 'title': 'La Casa Bianca', + 'alt_title': 'S2016 - Puntata del 23/10/2016', + 'description': 'md5:a09d45890850458077d1f68bb036e0a5', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Rai 3', + 'creator': 'Rai 3', + 'duration': 3278, + 'timestamp': 1477764300, + 'upload_date': '20161029', + 'series': 'La Casa Bianca', + 'season': '2016', }, - { - 'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html', - 'md5': 'd9751b78eac9710d62c2447b224dea39', - 'info_dict': { - 'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9', - 'ext': 'flv', - 'title': 'TG PRIMO TEMPO', - 'upload_date': '20140612', - 'duration': 1758, - }, - }, - { - 'url': 'http://www.rainews.it/dl/rainews/media/state-of-the-net-Antonella-La-Carpia-regole-virali-7aafdea9-0e5d-49d5-88a6-7e65da67ae13.html', - 'md5': '35cf7c229f22eeef43e48b5cf923bef0', - 'info_dict': { - 'id': '7aafdea9-0e5d-49d5-88a6-7e65da67ae13', - 'ext': 'mp4', - 'title': 'State of the Net, Antonella La Carpia: regole virali', - 'description': 'md5:b0ba04a324126903e3da7763272ae63c', - 'upload_date': '20140613', - }, - 'skip': 'Error 404', - }, - { - 'url': 
'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-b4a49761-e0cc-4b14-8736-2729f6f73132-tg2.html', - 'info_dict': { - 'id': 'b4a49761-e0cc-4b14-8736-2729f6f73132', - 'ext': 'mp4', - 'title': 'Alluvione in Sardegna e dissesto idrogeologico', - 'description': 'Edizione delle ore 20:30 ', - }, - 'skip': 'invalid urls', - }, - { - 'url': 'http://www.ilcandidato.rai.it/dl/ray/media/Il-Candidato---Primo-episodio-Le-Primarie-28e5525a-b495-45e8-a7c3-bc48ba45d2b6.html', - 'md5': '496ab63e420574447f70d02578333437', - 'info_dict': { - 'id': '28e5525a-b495-45e8-a7c3-bc48ba45d2b6', - 'ext': 'flv', - 'title': 'Il Candidato - Primo episodio: "Le Primarie"', - 'description': 'md5:364b604f7db50594678f483353164fb8', - 'upload_date': '20140923', - 'duration': 386, - } + }, { + 'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html', + 'md5': '8970abf8caf8aef4696e7b1f2adfc696', + 'info_dict': { + 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', + 'ext': 'mp4', + 'title': 'Report del 07/04/2014', + 'alt_title': 'S2013/14 - Puntata del 07/04/2014', + 'description': 'md5:f27c544694cacb46a078db84ec35d2d9', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Rai 5', + 'creator': 'Rai 5', + 'duration': 6160, + 'series': 'Report', + 'season_number': 5, + 'season': '2013/14', + }, + 'params': { + 'skip_download': True, }, - ] + }, { + 'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?', + 'only_matching': True, + }] def _real_extract(self, url): - video_id = self._match_id(url) - media = self._download_json( - 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % video_id, - video_id, 'Downloading video JSON') + mobj = re.match(self._VALID_URL, url) + url, video_id = mobj.group('url', 'id') - thumbnails = [] - for image_type in ('image', 'image_medium', 'image_300'): - thumbnail_url = media.get(image_type) - if thumbnail_url: - thumbnails.append({ - 'url': thumbnail_url, - }) + media = self._download_json( + '%s?json' % url, video_id, 'Downloading video JSON') - subtitles = [] - formats = [] - media_type = media['type'] - if 'Audio' in media_type: - formats.append({ - 'format_id': media.get('formatoAudio'), - 'url': media['audioUrl'], - 'ext': media.get('formatoAudio'), - }) - elif 'Video' in media_type: - def fix_xml(xml): - return xml.replace(' tag elementi', '').replace('>/', '</') + title = media['name'] - relinker = self._download_xml( - media['mediaUri'] + '&output=43', - video_id, transform_source=fix_xml) + video = media['video'] - has_subtitle = False + relinker_info = self._extract_relinker_info(video['contentUrl'], video_id) + self._sort_formats(relinker_info['formats']) - for element in relinker.findall('element'): - media_url = xpath_text(element, 'url') - ext = determine_ext(media_url) - content_type = xpath_text(element, 'content-type') - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - media_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - media_url + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', - video_id, f4m_id='hds', fatal=False)) - elif ext == 'stl': - has_subtitle = True - elif content_type.startswith('video/'): - bitrate = int_or_none(xpath_text(element, 'bitrate')) - formats.append({ - 'url': media_url, - 'tbr': bitrate if bitrate > 0 else None, - 'format_id': 'http-%d' % bitrate if bitrate > 0 else 'http', - }) - elif content_type.startswith('image/'): + thumbnails = [] + 
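[Note on the RaiPlay flow above: the _VALID_URL keeps the whole page URL in a named group, the video id is the lower-case hex UUID embedded in it, and the metadata is simply the same page re-requested with a ?json suffix. Both conventions in miniature (URL taken from the test case above):

    import re

    UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}'
    url = 'http://www.raiplay.it/video/2016/10/La-Casa-Bianca-e06118bb-59a9-4636-b914-498e4cfd2c66.html'

    video_id = re.search(UUID_RE, url).group(0)
    metadata_url = '%s?json' % url  # same page, JSON representation

    assert video_id == 'e06118bb-59a9-4636-b914-498e4cfd2c66'
]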
if 'images' in media: + for _, value in media.get('images').items(): + if value: thumbnails.append({ - 'url': media_url, + 'url': value.replace('[RESOLUTION]', '600x400') }) - self._sort_formats(formats) + timestamp = unified_timestamp(try_get( + media, lambda x: x['availabilities'][0]['start'], compat_str)) - if has_subtitle: - webpage = self._download_webpage(url, video_id) - subtitles = self._get_subtitles(video_id, webpage) - else: - raise ExtractorError('not a media file') + subtitles = self._extract_subtitles(url, video.get('subtitles')) + + info = { + 'id': video_id, + 'title': self._live_title(title) if relinker_info.get( + 'is_live') else title, + 'alt_title': media.get('subtitle'), + 'description': media.get('description'), + 'uploader': strip_or_none(media.get('channel')), + 'creator': strip_or_none(media.get('editor')), + 'duration': parse_duration(video.get('duration')), + 'timestamp': timestamp, + 'thumbnails': thumbnails, + 'series': try_get( + media, lambda x: x['isPartOf']['name'], compat_str), + 'season_number': int_or_none(try_get( + media, lambda x: x['isPartOf']['numeroStagioni'])), + 'season': media.get('stagione') or None, + 'subtitles': subtitles, + } + + info.update(relinker_info) + return info + + +class RaiPlayLiveIE(RaiBaseIE): + _VALID_URL = r'https?://(?:www\.)?raiplay\.it/dirette/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'http://www.raiplay.it/dirette/rainews24', + 'info_dict': { + 'id': 'd784ad40-e0ae-4a69-aa76-37519d238a9c', + 'display_id': 'rainews24', + 'ext': 'mp4', + 'title': 're:^Diretta di Rai News 24 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': 'md5:6eca31500550f9376819f174e5644754', + 'uploader': 'Rai News 24', + 'creator': 'Rai News 24', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + video_id = self._search_regex( + r'data-uniquename=["\']ContentItem-(%s)' % RaiBaseIE._UUID_RE, + webpage, 'content id') return { + '_type': 'url_transparent', + 'ie_key': RaiPlayIE.ie_key(), + 'url': 'http://www.raiplay.it/dirette/ContentItem-%s.html' % video_id, 'id': video_id, - 'title': media['name'], - 'description': media.get('desc'), + 'display_id': display_id, + } + + +class RaiPlayPlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'http://www.raiplay.it/programmi/nondirloalmiocapo/', + 'info_dict': { + 'id': 'nondirloalmiocapo', + 'title': 'Non dirlo al mio capo', + 'description': 'md5:9f3d603b2947c1c7abb098f3b14fac86', + }, + 'playlist_mincount': 12, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + title = self._html_search_meta( + ('programma', 'nomeProgramma'), webpage, 'title') + description = unescapeHTML(self._html_search_meta( + ('description', 'og:description'), webpage, 'description')) + + entries = [] + for mobj in re.finditer( + r'<a\b[^>]+\bhref=(["\'])(?P<path>/raiplay/video/.+?)\1', + webpage): + video_url = urljoin(url, mobj.group('path')) + entries.append(self.url_result( + video_url, ie=RaiPlayIE.ie_key(), + video_id=RaiPlayIE._match_id(video_url))) + + return self.playlist_result(entries, playlist_id, title, description) + + +class RaiIE(RaiBaseIE): + _VALID_URL = r'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/.+?-(?P<id>%s)(?:-.+?)?\.html' % RaiBaseIE._UUID_RE + _TESTS = [{ + # var uniquename = "ContentItem-..." 
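[Note on the RaiPlay metadata handling above: image URLs arrive with a literal [RESOLUTION] placeholder, which the extractor pins to 600x400, and the timestamp comes from the first availability window. A condensed sketch of the thumbnail expansion (that other WxH values also work is an assumption about the CDN, not verified here):

    def expand_thumbnails(media, resolution='600x400'):
        # media['images'] maps arbitrary keys to templated URLs;
        # empty values are skipped, mirroring the loop above
        return [
            {'url': tmpl.replace('[RESOLUTION]', resolution)}
            for tmpl in (media.get('images') or {}).values()
            if tmpl
        ]
]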
+ # data-id="ContentItem-..." + 'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html', + 'info_dict': { + 'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9', + 'ext': 'mp4', + 'title': 'TG PRIMO TEMPO', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 1758, + 'upload_date': '20140612', + } + }, { + # with ContentItem in many metas + 'url': 'http://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html', + 'info_dict': { + 'id': '1632c009-c843-4836-bb65-80c33084a64b', + 'ext': 'mp4', + 'title': 'Weekend al cinema, da Hollywood arriva il thriller di Tate Taylor "La ragazza del treno"', + 'description': 'I film in uscita questa settimana.', + 'thumbnail': r're:^https?://.*\.png$', + 'duration': 833, + 'upload_date': '20161103', + } + }, { + # with ContentItem in og:url + 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html', + 'md5': '11959b4e44fa74de47011b5799490adf', + 'info_dict': { + 'id': 'efb17665-691c-45d5-a60c-5301333cbb0c', + 'ext': 'mp4', + 'title': 'TG1 ore 20:00 del 03/11/2016', + 'description': 'TG1 edizione integrale ore 20:00 del giorno 03/11/2016', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 2214, + 'upload_date': '20161103', + } + }, { + # drawMediaRaiTV(...) + 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', + 'md5': '2dd727e61114e1ee9c47f0da6914e178', + 'info_dict': { + 'id': '59d69d28-6bb6-409d-a4b5-ed44096560af', + 'ext': 'mp4', + 'title': 'Il pacco', + 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a', + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20141221', + }, + }, { + # initEdizione('ContentItem-...' 
+ 'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', + 'info_dict': { + 'id': 'c2187016-8484-4e3a-8ac8-35e475b07303', + 'ext': 'mp4', + 'title': r're:TG1 ore \d{2}:\d{2} del \d{2}/\d{2}/\d{4}', + 'duration': 2274, + 'upload_date': '20170401', + }, + 'skip': 'Changes daily', + }, { + # HDS live stream with only relinker URL + 'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews', + 'info_dict': { + 'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc', + 'ext': 'flv', + 'title': 'EuroNews', + }, + 'params': { + 'skip_download': True, + }, + }, { + # HLS live stream with ContentItem in og:url + 'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html', + 'info_dict': { + 'id': '3156f2f2-dc70-4953-8e2f-70d7489d4ce9', + 'ext': 'mp4', + 'title': 'La diretta di Rainews24', + }, + 'params': { + 'skip_download': True, + }, + }, { + # Direct MMS URL + 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-b63a4089-ac28-48cf-bca5-9f5b5bc46df5.html', + 'only_matching': True, + }, { + 'url': 'https://www.rainews.it/tgr/marche/notiziari/video/2019/02/ContentItem-6ba945a2-889c-4a80-bdeb-8489c70a8db9.html', + 'only_matching': True, + }] + + def _extract_from_content_id(self, content_id, url): + media = self._download_json( + 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % content_id, + content_id, 'Downloading video JSON') + + title = media['name'].strip() + + media_type = media['type'] + if 'Audio' in media_type: + relinker_info = { + 'formats': [{ + 'format_id': media.get('formatoAudio'), + 'url': media['audioUrl'], + 'ext': media.get('formatoAudio'), + }] + } + elif 'Video' in media_type: + relinker_info = self._extract_relinker_info(media['mediaUri'], content_id) + else: + raise ExtractorError('not a media file') + + self._sort_formats(relinker_info['formats']) + + thumbnails = [] + for image_type in ('image', 'image_medium', 'image_300'): + thumbnail_url = media.get(image_type) + if thumbnail_url: + thumbnails.append({ + 'url': compat_urlparse.urljoin(url, thumbnail_url), + }) + + subtitles = self._extract_subtitles(url, media.get('subtitlesUrl')) + + info = { + 'id': content_id, + 'title': title, + 'description': strip_or_none(media.get('desc')), 'thumbnails': thumbnails, 'uploader': media.get('author'), 'upload_date': unified_strdate(media.get('date')), 'duration': parse_duration(media.get('length')), - 'formats': formats, 'subtitles': subtitles, } - def _get_subtitles(self, video_id, webpage): - subtitles = {} - m = re.search(r'<meta name="closedcaption" content="(?P<captions>[^"]+)"', webpage) - if m: - captions = m.group('captions') - STL_EXT = '.stl' - SRT_EXT = '.srt' - if captions.endswith(STL_EXT): - captions = captions[:-len(STL_EXT)] + SRT_EXT - subtitles['it'] = [{ - 'ext': 'srt', - 'url': 'http://www.rai.tv%s' % compat_urllib_parse.quote(captions), - }] - return subtitles - + info.update(relinker_info) -class RaiIE(InfoExtractor): - _VALID_URL = r'http://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' - _TESTS = [ - { - 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', - 'md5': 'e0e7a8a131e249d1aa0ebf270d1d8db7', - 'info_dict': { - 'id': '59d69d28-6bb6-409d-a4b5-ed44096560af', - 'ext': 'flv', - 'title': 'Il pacco', - 'description': 
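[Note on _extract_from_content_id above: the media type decides the pipeline. Audio items already carry a final audioUrl, while video items must be resolved through the relinker; anything else is rejected. A stripped-down sketch of that dispatch (extract_relinker stands in for the _extract_relinker_info method shown earlier; the real code raises ExtractorError rather than ValueError):

    def relinker_info_for(media, content_id, extract_relinker):
        media_type = media['type']
        if 'Audio' in media_type:
            # direct file: one ready-made format, no relinker round trip
            return {'formats': [{
                'url': media['audioUrl'],
                'format_id': media.get('formatoAudio'),
                'ext': media.get('formatoAudio'),
            }]}
        if 'Video' in media_type:
            # video always goes through the relinker servlet
            return extract_relinker(media['mediaUri'], content_id)
        raise ValueError('not a media file')
]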
'md5:4b1afae1364115ce5d78ed83cd2e5b3a', - 'upload_date': '20141221', - }, - } - ] - - @classmethod - def suitable(cls, url): - return False if RaiTVIE.suitable(url) else super(RaiIE, cls).suitable(url) + return info def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) - iframe_url = self._search_regex( - [r'<iframe[^>]+src="([^"]*/dl/[^"]+\?iframe\b[^"]*)"', - r'drawMediaRaiTV\(["\'](.+?)["\']'], - webpage, 'iframe') - if not iframe_url.startswith('http'): - iframe_url = compat_urlparse.urljoin(url, iframe_url) - return self.url_result(iframe_url) + content_item_id = None + + content_item_url = self._html_search_meta( + ('og:url', 'og:video', 'og:video:secure_url', 'twitter:url', + 'twitter:player', 'jsonlink'), webpage, default=None) + if content_item_url: + content_item_id = self._search_regex( + r'ContentItem-(%s)' % self._UUID_RE, content_item_url, + 'content item id', default=None) + + if not content_item_id: + content_item_id = self._search_regex( + r'''(?x) + (?: + (?:initEdizione|drawMediaRaiTV)\(| + <(?:[^>]+\bdata-id|var\s+uniquename)= + ) + (["\']) + (?:(?!\1).)*\bContentItem-(?P<id>%s) + ''' % self._UUID_RE, + webpage, 'content item id', default=None, group='id') + + content_item_ids = set() + if content_item_id: + content_item_ids.add(content_item_id) + if video_id not in content_item_ids: + content_item_ids.add(video_id) + + for content_item_id in content_item_ids: + try: + return self._extract_from_content_id(content_item_id, url) + except GeoRestrictedError: + raise + except ExtractorError: + pass + + relinker_url = self._search_regex( + r'''(?x) + (?: + var\s+videoURL| + mediaInfo\.mediaUri + )\s*=\s* + ([\'"]) + (?P<url> + (?:https?:)? + //mediapolis(?:vod)?\.rai\.it/relinker/relinkerServlet\.htm\? 
+ (?:(?!\1).)*\bcont=(?:(?!\1).)+)\1 + ''', + webpage, 'relinker URL', group='url') + + relinker_info = self._extract_relinker_info( + urljoin(url, relinker_url), video_id) + self._sort_formats(relinker_info['formats']) + + title = self._search_regex( + r'var\s+videoTitolo\s*=\s*([\'"])(?P<title>[^\'"]+)\1', + webpage, 'title', group='title', + default=None) or self._og_search_title(webpage) + + info = { + 'id': video_id, + 'title': title, + } + + info.update(relinker_info) + + return info diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/raywenderlich.py youtube-dl-2019.05.20/youtube_dl/extractor/raywenderlich.py --- youtube-dl-2016.02.22/youtube_dl/extractor/raywenderlich.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/raywenderlich.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,179 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .vimeo import VimeoIE +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + merge_dicts, + try_get, + unescapeHTML, + unified_timestamp, + urljoin, +) + + +class RayWenderlichIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?: + videos\.raywenderlich\.com/courses| + (?:www\.)?raywenderlich\.com + )/ + (?P<course_id>[^/]+)/lessons/(?P<id>\d+) + ''' + + _TESTS = [{ + 'url': 'https://www.raywenderlich.com/3530-testing-in-ios/lessons/1', + 'info_dict': { + 'id': '248377018', + 'ext': 'mp4', + 'title': 'Introduction', + 'description': 'md5:804d031b3efa9fcb49777d512d74f722', + 'timestamp': 1513906277, + 'upload_date': '20171222', + 'duration': 133, + 'uploader': 'Ray Wenderlich', + 'uploader_id': 'user3304672', + }, + 'params': { + 'noplaylist': True, + 'skip_download': True, + }, + 'add_ie': [VimeoIE.ie_key()], + 'expected_warnings': ['HTTP Error 403: Forbidden'], + }, { + 'url': 'https://videos.raywenderlich.com/courses/105-testing-in-ios/lessons/1', + 'only_matching': True, + }] + + @staticmethod + def _extract_video_id(data, lesson_id): + if not data: + return + groups = try_get(data, lambda x: x['groups'], list) or [] + if not groups: + return + for group in groups: + if not isinstance(group, dict): + continue + contents = try_get(data, lambda x: x['contents'], list) or [] + for content in contents: + if not isinstance(content, dict): + continue + ordinal = int_or_none(content.get('ordinal')) + if ordinal != lesson_id: + continue + video_id = content.get('identifier') + if video_id: + return compat_str(video_id) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + course_id, lesson_id = mobj.group('course_id', 'id') + display_id = '%s/%s' % (course_id, lesson_id) + + webpage = self._download_webpage(url, display_id) + + thumbnail = self._og_search_thumbnail( + webpage, default=None) or self._html_search_meta( + 'twitter:image', webpage, 'thumbnail') + + if '>Subscribe to unlock' in webpage: + raise ExtractorError( + 'This content is only available for subscribers', + expected=True) + + info = { + 'thumbnail': thumbnail, + } + + vimeo_id = self._search_regex( + r'data-vimeo-id=["\'](\d+)', webpage, 'vimeo id', default=None) + + if not vimeo_id: + data = self._parse_json( + self._search_regex( + r'data-collection=(["\'])(?P<data>{.+?})\1', webpage, + 'data collection', default='{}', group='data'), + display_id, transform_source=unescapeHTML, fatal=False) + video_id = self._extract_video_id( + data, lesson_id) or self._search_regex( + r'/videos/(\d+)/', thumbnail, 'video id') + headers = { + 'Referer': 
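[Note on RaiIE._real_extract above: the page is resolved through a chain of fallbacks. ContentItem ids found in meta tags or inline JavaScript are tried first, with the page URL's own UUID as a final candidate, and only if every candidate fails does the code scrape a bare relinker URL from the page. The candidate loop deliberately re-raises geo errors instead of treating them as misses; condensed:

    from youtube_dl.utils import ExtractorError, GeoRestrictedError

    def try_content_items(candidates, extract):
        # 'extract' stands in for _extract_from_content_id above
        for content_item_id in candidates:
            try:
                return extract(content_item_id)
            except GeoRestrictedError:
                raise  # a geo block is a definitive answer, not a miss
            except ExtractorError:
                pass  # try the next candidate id
        return None  # caller falls back to the raw relinker URL
]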
url, + 'X-Requested-With': 'XMLHttpRequest', + } + csrf_token = self._html_search_meta( + 'csrf-token', webpage, 'csrf token', default=None) + if csrf_token: + headers['X-CSRF-Token'] = csrf_token + video = self._download_json( + 'https://videos.raywenderlich.com/api/v1/videos/%s.json' + % video_id, display_id, headers=headers)['video'] + vimeo_id = video['clips'][0]['provider_id'] + info.update({ + '_type': 'url_transparent', + 'title': video.get('name'), + 'description': video.get('description') or video.get( + 'meta_description'), + 'duration': int_or_none(video.get('duration')), + 'timestamp': unified_timestamp(video.get('created_at')), + }) + + return merge_dicts(info, self.url_result( + VimeoIE._smuggle_referrer( + 'https://player.vimeo.com/video/%s' % vimeo_id, url), + ie=VimeoIE.ie_key(), video_id=vimeo_id)) + + +class RayWenderlichCourseIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?: + videos\.raywenderlich\.com/courses| + (?:www\.)?raywenderlich\.com + )/ + (?P<id>[^/]+) + ''' + + _TEST = { + 'url': 'https://www.raywenderlich.com/3530-testing-in-ios', + 'info_dict': { + 'title': 'Testing in iOS', + 'id': '3530-testing-in-ios', + }, + 'params': { + 'noplaylist': False, + }, + 'playlist_count': 29, + } + + @classmethod + def suitable(cls, url): + return False if RayWenderlichIE.suitable(url) else super( + RayWenderlichCourseIE, cls).suitable(url) + + def _real_extract(self, url): + course_id = self._match_id(url) + + webpage = self._download_webpage(url, course_id) + + entries = [] + lesson_urls = set() + for lesson_url in re.findall( + r'<a[^>]+\bhref=["\'](/%s/lessons/\d+)' % course_id, webpage): + if lesson_url in lesson_urls: + continue + lesson_urls.add(lesson_url) + entries.append(self.url_result( + urljoin(url, lesson_url), ie=RayWenderlichIE.ie_key())) + + title = self._og_search_title( + webpage, default=None) or self._html_search_meta( + 'twitter:title', webpage, 'title', default=None) + + return self.playlist_result(entries, course_id, title) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/rbmaradio.py youtube-dl-2019.05.20/youtube_dl/extractor/rbmaradio.py --- youtube-dl-2016.02.22/youtube_dl/extractor/rbmaradio.py 2016-02-14 22:26:17.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/rbmaradio.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,55 +1,72 @@ -# encoding: utf-8 from __future__ import unicode_literals -import json import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( - ExtractorError, + clean_html, + int_or_none, + unified_timestamp, + update_url_query, ) class RBMARadioIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$' + _VALID_URL = r'https?://(?:www\.)?(?:rbmaradio|redbullradio)\.com/shows/(?P<show_id>[^/]+)/episodes/(?P<id>[^/?#&]+)' _TEST = { - 'url': 'http://www.rbmaradio.com/shows/ford-lopatin-live-at-primavera-sound-2011', + 'url': 'https://www.rbmaradio.com/shows/main-stage/episodes/ford-lopatin-live-at-primavera-sound-2011', 'md5': '6bc6f9bcb18994b4c983bc3bf4384d95', 'info_dict': { 'id': 'ford-lopatin-live-at-primavera-sound-2011', 'ext': 'mp3', - 'uploader_id': 'ford-lopatin', - 'location': 'Spain', - 'description': 'Joel Ford and Daniel ’Oneohtrix Point Never’ Lopatin fly their midified pop extravaganza to Spain. 
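[Note on RayWenderlichIE._extract_video_id above: the inner lookup reads contents from data rather than from the group being iterated, which makes the outer loop over groups a no-op; this looks like an oversight. A corrected sketch of the presumably intended traversal (groups -> each group's contents, matching the lesson's ordinal; the nesting is inferred from the code above, not from API documentation):

    def extract_video_id(data, lesson_id):
        for group in (data or {}).get('groups') or []:
            if not isinstance(group, dict):
                continue
            for content in group.get('contents') or []:  # group, not data
                if not isinstance(content, dict):
                    continue
                # the URL supplies lesson_id as a digit string; normalize
                # both sides before comparing
                if str(content.get('ordinal')) != str(int(lesson_id)):
                    continue
                video_id = content.get('identifier')
                if video_id:
                    return str(video_id)
        return None
]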
Live at Primavera Sound 2011.', - 'uploader': 'Ford & Lopatin', - 'title': 'Live at Primavera Sound 2011', + 'title': 'Main Stage - Ford & Lopatin at Primavera Sound', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 2452, + 'timestamp': 1307103164, + 'upload_date': '20110603', }, } def _real_extract(self, url): - m = re.match(self._VALID_URL, url) - video_id = m.group('videoID') - - webpage = self._download_webpage(url, video_id) - - json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$', - webpage, 'json data', flags=re.MULTILINE) - - try: - data = json.loads(json_data) - except ValueError as e: - raise ExtractorError('Invalid JSON: ' + str(e)) - - video_url = data['akamai_url'] + '&cbr=256' + mobj = re.match(self._VALID_URL, url) + show_id = mobj.group('show_id') + episode_id = mobj.group('id') + + webpage = self._download_webpage(url, episode_id) + + episode = self._parse_json( + self._search_regex( + r'__INITIAL_STATE__\s*=\s*({.+?})\s*</script>', + webpage, 'json data'), + episode_id)['episodes'][show_id][episode_id] + + title = episode['title'] + + show_title = episode.get('showTitle') + if show_title: + title = '%s - %s' % (show_title, title) + + formats = [{ + 'url': update_url_query(episode['audioURL'], query={'cbr': abr}), + 'format_id': compat_str(abr), + 'abr': abr, + 'vcodec': 'none', + } for abr in (96, 128, 192, 256)] + self._check_formats(formats, episode_id) + + description = clean_html(episode.get('longTeaser')) + thumbnail = self._proto_relative_url(episode.get('imageURL', {}).get('landscape')) + duration = int_or_none(episode.get('duration')) + timestamp = unified_timestamp(episode.get('publishedAt')) return { - 'id': video_id, - 'url': video_url, - 'title': data['title'], - 'description': data.get('teaser_text'), - 'location': data.get('country_of_origin'), - 'uploader': data.get('host', {}).get('name'), - 'uploader_id': data.get('host', {}).get('slug'), - 'thumbnail': data.get('image', {}).get('large_url_2x'), - 'duration': data.get('duration'), + 'id': episode_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': timestamp, + 'formats': formats, } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/rds.py youtube-dl-2019.05.20/youtube_dl/extractor/rds.py --- youtube-dl-2016.02.22/youtube_dl/extractor/rds.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/rds.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,25 +1,25 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( parse_duration, parse_iso8601, + js_to_json, ) +from ..compat import compat_str class RDSIE(InfoExtractor): IE_DESC = 'RDS.ca' - _VALID_URL = r'https?://(?:www\.)?rds\.ca/vid(?:[eé]|%C3%A9)os/(?:[^/]+/)*(?P<display_id>[^/]+)-(?P<id>\d+\.\d+)' + _VALID_URL = r'https?://(?:www\.)?rds\.ca/vid(?:[eé]|%C3%A9)os/(?:[^/]+/)*(?P<id>[^/]+)-\d+\.\d+' _TESTS = [{ 'url': 'http://www.rds.ca/videos/football/nfl/fowler-jr-prend-la-direction-de-jacksonville-3.1132799', 'info_dict': { - 'id': '3.1132799', + 'id': '604333', 'display_id': 'fowler-jr-prend-la-direction-de-jacksonville', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Fowler Jr. prend la direction de Jacksonville', 'description': 'Dante Fowler Jr. est le troisième choix du repêchage 2015 de la NFL. 
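[Note on the RBMA Radio rewrite above: instead of trusting a single akamai_url, it fans one audioURL out into four constant-bitrate renditions by rewriting the cbr query parameter, then lets _check_formats drop any rendition the server refuses. A simplified sketch (update_url_query, used above, replaces an existing cbr parameter correctly; the naive join below assumes none is present):

    def cbr_formats(audio_url):
        sep = '&' if '?' in audio_url else '?'
        return [{
            'url': '%s%scbr=%d' % (audio_url, sep, abr),
            'format_id': str(abr),
            'abr': abr,          # audio bitrate in kbit/s
            'vcodec': 'none',    # audio-only stream
        } for abr in (96, 128, 192, 256)]
]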
', 'timestamp': 1430397346, @@ -33,22 +33,17 @@ }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') + display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - # TODO: extract f4m from 9c9media.com - video_url = self._search_regex( - r'<span[^>]+itemprop="contentURL"[^>]+content="([^"]+)"', - webpage, 'video url') - - title = self._og_search_title(webpage) or self._html_search_meta( + item = self._parse_json(self._search_regex(r'(?s)itemToPush\s*=\s*({.+?});', webpage, 'item'), display_id, js_to_json) + video_id = compat_str(item['id']) + title = item.get('title') or self._og_search_title(webpage) or self._html_search_meta( 'title', webpage, 'title', fatal=True) description = self._og_search_description(webpage) or self._html_search_meta( 'description', webpage, 'description') - thumbnail = self._og_search_thumbnail(webpage) or self._search_regex( + thumbnail = item.get('urlImageBig') or self._og_search_thumbnail(webpage) or self._search_regex( [r'<link[^>]+itemprop="thumbnailUrl"[^>]+href="([^"]+)"', r'<span[^>]+itemprop="thumbnailUrl"[^>]+content="([^"]+)"'], webpage, 'thumbnail', fatal=False) @@ -61,13 +56,15 @@ age_limit = self._family_friendly_search(webpage) return { + '_type': 'url_transparent', 'id': video_id, 'display_id': display_id, - 'url': video_url, + 'url': '9c9media:rds_web:%s' % video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'timestamp': timestamp, 'duration': duration, 'age_limit': age_limit, + 'ie_key': 'NineCNineMedia', } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/redbulltv.py youtube-dl-2019.05.20/youtube_dl/extractor/redbulltv.py --- youtube-dl-2016.02.22/youtube_dl/extractor/redbulltv.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/redbulltv.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,128 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( + float_or_none, + ExtractorError, +) + + +class RedBullTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?redbull(?:\.tv|\.com(?:/[^/]+)?(?:/tv)?)(?:/events/[^/]+)?/(?:videos?|live)/(?P<id>AP-\w+)' + _TESTS = [{ + # film + 'url': 'https://www.redbull.tv/video/AP-1Q6XCDTAN1W11', + 'md5': 'fb0445b98aa4394e504b413d98031d1f', + 'info_dict': { + 'id': 'AP-1Q6XCDTAN1W11', + 'ext': 'mp4', + 'title': 'ABC of... WRC - ABC of... 
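[Note on the RDS change above: the page embeds the clip record as a JavaScript object literal (itemToPush), so the new code runs it through js_to_json before parsing and then delegates playback to the NineCNineMedia extractor with a url_transparent result. An illustration of the js_to_json step (the literal below is invented for the example):

    import json
    from youtube_dl.utils import js_to_json

    raw = "{id: 604333, title: 'Fowler Jr. prend la direction de Jacksonville'}"
    item = json.loads(js_to_json(raw))  # bare keys and single quotes normalized
    assert item['id'] == 604333

    video_url = '9c9media:rds_web:%s' % item['id']  # handed to NineCNineMedia
]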
S1E6', + 'description': 'md5:5c7ed8f4015c8492ecf64b6ab31e7d31', + 'duration': 1582.04, + }, + }, { + # episode + 'url': 'https://www.redbull.tv/video/AP-1PMHKJFCW1W11', + 'info_dict': { + 'id': 'AP-1PMHKJFCW1W11', + 'ext': 'mp4', + 'title': 'Grime - Hashtags S2E4', + 'description': 'md5:b5f522b89b72e1e23216e5018810bb25', + 'duration': 904.6, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.redbull.com/int-en/tv/video/AP-1UWHCAR9S1W11/rob-meets-sam-gaze?playlist=playlists::3f81040a-2f31-4832-8e2e-545b1d39d173', + 'only_matching': True, + }, { + 'url': 'https://www.redbull.com/us-en/videos/AP-1YM9QCYE52111', + 'only_matching': True, + }, { + 'url': 'https://www.redbull.com/us-en/events/AP-1XV2K61Q51W11/live/AP-1XUJ86FDH1W11', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + session = self._download_json( + 'https://api.redbull.tv/v3/session', video_id, + note='Downloading access token', query={ + 'category': 'personal_computer', + 'os_family': 'http', + }) + if session.get('code') == 'error': + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, session['message'])) + token = session['token'] + + try: + video = self._download_json( + 'https://api.redbull.tv/v3/products/' + video_id, + video_id, note='Downloading video information', + headers={'Authorization': token} + ) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: + error_message = self._parse_json( + e.cause.read().decode(), video_id)['error'] + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, error_message), expected=True) + raise + + title = video['title'].strip() + + formats = self._extract_m3u8_formats( + 'https://dms.redbull.tv/v3/%s/%s/playlist.m3u8' % (video_id, token), + video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') + self._sort_formats(formats) + + subtitles = {} + for resource in video.get('resources', []): + if resource.startswith('closed_caption_'): + splitted_resource = resource.split('_') + if splitted_resource[2]: + subtitles.setdefault('en', []).append({ + 'url': 'https://resources.redbull.tv/%s/%s' % (video_id, resource), + 'ext': splitted_resource[2], + }) + + subheading = video.get('subheading') + if subheading: + title += ' - %s' % subheading + + return { + 'id': video_id, + 'title': title, + 'description': video.get('long_description') or video.get( + 'short_description'), + 'duration': float_or_none(video.get('duration'), scale=1000), + 'formats': formats, + 'subtitles': subtitles, + } + + +class RedBullTVRrnContentIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?redbull(?:\.tv|\.com(?:/[^/]+)?(?:/tv)?)/(?:video|live)/rrn:content:[^:]+:(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _TESTS = [{ + 'url': 'https://www.redbull.com/int-en/tv/video/rrn:content:live-videos:e3e6feb4-e95f-50b7-962a-c70f8fd13c73/mens-dh-finals-fort-william', + 'only_matching': True, + }, { + 'url': 'https://www.redbull.com/int-en/tv/video/rrn:content:videos:a36a0f36-ff1b-5db8-a69d-ee11a14bf48b/tn-ts-style?playlist=rrn:content:event-profiles:83f05926-5de8-5389-b5e4-9bb312d715e8:extras', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + video_url = self._og_search_url(webpage) + + return self.url_result( + video_url, ie=RedBullTVIE.ie_key(), + video_id=RedBullTVIE._match_id(video_url)) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/reddit.py 
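[Note on the Red Bull TV flow above: playback now requires a session token. An anonymous call to /v3/session returns a token that must be sent as the Authorization header when fetching metadata and is also spliced into the HLS playlist URL. A stdlib-only sketch of that handshake, with error handling omitted (the endpoints are the ones used above):

    import json
    try:
        from urllib.parse import urlencode
        from urllib.request import Request, urlopen
    except ImportError:  # Python 2, matching youtube-dl's support range
        from urllib import urlencode
        from urllib2 import Request, urlopen

    def redbull_urls(video_id):
        qs = urlencode({'category': 'personal_computer', 'os_family': 'http'})
        session = json.loads(urlopen(
            'https://api.redbull.tv/v3/session?' + qs).read().decode())
        token = session['token']
        metadata_req = Request(
            'https://api.redbull.tv/v3/products/' + video_id,
            headers={'Authorization': token})
        playlist_url = ('https://dms.redbull.tv/v3/%s/%s/playlist.m3u8'
                        % (video_id, token))
        return metadata_req, playlist_url
]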
youtube-dl-2019.05.20/youtube_dl/extractor/reddit.py --- youtube-dl-2016.02.22/youtube_dl/extractor/reddit.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/reddit.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,130 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + float_or_none, + url_or_none, +) + + +class RedditIE(InfoExtractor): + _VALID_URL = r'https?://v\.redd\.it/(?P<id>[^/?#&]+)' + _TEST = { + # from https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/ + 'url': 'https://v.redd.it/zv89llsvexdz', + 'md5': '0a070c53eba7ec4534d95a5a1259e253', + 'info_dict': { + 'id': 'zv89llsvexdz', + 'ext': 'mp4', + 'title': 'zv89llsvexdz', + }, + 'params': { + 'format': 'bestvideo', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + formats = self._extract_m3u8_formats( + 'https://v.redd.it/%s/HLSPlaylist.m3u8' % video_id, video_id, + 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) + + formats.extend(self._extract_mpd_formats( + 'https://v.redd.it/%s/DASHPlaylist.mpd' % video_id, video_id, + mpd_id='dash', fatal=False)) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_id, + 'formats': formats, + } + + +class RedditRIE(InfoExtractor): + _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?reddit\.com/r/[^/]+/comments/(?P<id>[^/?#&]+))' + _TESTS = [{ + 'url': 'https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/', + 'info_dict': { + 'id': 'zv89llsvexdz', + 'ext': 'mp4', + 'title': 'That small heart attack.', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1501941939, + 'upload_date': '20170805', + 'uploader': 'Antw87', + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'age_limit': 0, + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, + }, { + 'url': 'https://www.reddit.com/r/videos/comments/6rrwyj', + 'only_matching': True, + }, { + # imgur + 'url': 'https://www.reddit.com/r/MadeMeSmile/comments/6t7wi5/wait_for_it/', + 'only_matching': True, + }, { + # imgur @ old reddit + 'url': 'https://old.reddit.com/r/MadeMeSmile/comments/6t7wi5/wait_for_it/', + 'only_matching': True, + }, { + # streamable + 'url': 'https://www.reddit.com/r/videos/comments/6t7sg9/comedians_hilarious_joke_about_the_guam_flag/', + 'only_matching': True, + }, { + # youtube + 'url': 'https://www.reddit.com/r/videos/comments/6t75wq/southern_man_tries_to_speak_without_an_accent/', + 'only_matching': True, + }, { + # reddit video @ nm reddit + 'url': 'https://nm.reddit.com/r/Cricket/comments/8idvby/lousy_cameraman_finds_himself_in_cairns_line_of/', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + url, video_id = mobj.group('url', 'id') + + video_id = self._match_id(url) + + data = self._download_json( + url + '/.json', video_id)[0]['data']['children'][0]['data'] + + video_url = data['url'] + + # Avoid recursing into the same reddit URL + if 'reddit.com/' in video_url and '/%s/' % video_id in video_url: + raise ExtractorError('No media found', expected=True) + + over_18 = data.get('over_18') + if over_18 is True: + age_limit = 18 + elif over_18 is False: + age_limit = 0 + else: + age_limit = None + + return { + '_type': 'url_transparent', + 'url': video_url, + 'title': data.get('title'), + 'thumbnail': url_or_none(data.get('thumbnail')), + 'timestamp': 
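[Note on the two Reddit extractors above: RedditRIE fetches the post by appending /.json to the comments URL and hands the media URL on as url_transparent, while RedditIE builds formats from v.redd.it's fixed HLS and DASH manifest paths. Both URL conventions in miniature:

    def vreddit_manifests(video_id):
        # both manifests live at well-known paths under the clip id
        base = 'https://v.redd.it/' + video_id
        return base + '/HLSPlaylist.m3u8', base + '/DASHPlaylist.mpd'

    def post_json_url(comments_url):
        # any reddit comments page has a JSON twin at the same path
        return comments_url.rstrip('/') + '/.json'
]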
float_or_none(data.get('created_utc')), + 'uploader': data.get('author'), + 'like_count': int_or_none(data.get('ups')), + 'dislike_count': int_or_none(data.get('downs')), + 'comment_count': int_or_none(data.get('num_comments')), + 'age_limit': age_limit, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/redtube.py youtube-dl-2019.05.20/youtube_dl/extractor/redtube.py --- youtube-dl-2016.02.22/youtube_dl/extractor/redtube.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/redtube.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,35 +1,102 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import ( + ExtractorError, + int_or_none, + str_to_int, + unified_strdate, + url_or_none, +) class RedTubeIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?redtube\.com/(?P<id>[0-9]+)' - _TEST = { + _VALID_URL = r'https?://(?:(?:www\.)?redtube\.com/|embed\.redtube\.com/\?.*?\bid=)(?P<id>[0-9]+)' + _TESTS = [{ 'url': 'http://www.redtube.com/66418', - 'md5': '7b8c22b5e7098a3e1c09709df1126d2d', + 'md5': 'fc08071233725f26b8f014dba9590005', 'info_dict': { 'id': '66418', 'ext': 'mp4', 'title': 'Sucked on a toilet', + 'upload_date': '20110811', + 'duration': 596, + 'view_count': int, 'age_limit': 18, } - } + }, { + 'url': 'http://embed.redtube.com/?bgcolor=000000&id=1443286', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//embed\.redtube\.com/\?.*?\bid=\d+)', + webpage) def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage( + 'http://www.redtube.com/%s' % video_id, video_id) if any(s in webpage for s in ['video-deleted-info', '>This video has been removed']): raise ExtractorError('Video %s has been removed' % video_id, expected=True) - video_url = self._html_search_regex( - r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL') - video_title = self._html_search_regex( - r'<h1 class="videoTitle[^"]*">(.+?)</h1>', - webpage, 'title') - video_thumbnail = self._og_search_thumbnail(webpage) + title = self._html_search_regex( + (r'<h(\d)[^>]+class="(?:video_title_text|videoTitle)[^"]*">(?P<title>(?:(?!\1).)+)</h\1>', + r'(?:videoTitle|title)\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1',), + webpage, 'title', group='title', + default=None) or self._og_search_title(webpage) + + formats = [] + sources = self._parse_json( + self._search_regex( + r'sources\s*:\s*({.+?})', webpage, 'source', default='{}'), + video_id, fatal=False) + if sources and isinstance(sources, dict): + for format_id, format_url in sources.items(): + if format_url: + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'height': int_or_none(format_id), + }) + medias = self._parse_json( + self._search_regex( + r'mediaDefinition\s*:\s*(\[.+?\])', webpage, + 'media definitions', default='{}'), + video_id, fatal=False) + if medias and isinstance(medias, list): + for media in medias: + format_url = url_or_none(media.get('videoUrl')) + if not format_url: + continue + format_id = media.get('quality') + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'height': int_or_none(format_id), + }) + if not formats: + video_url = self._html_search_regex( + r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL') + formats.append({'url': video_url}) + self._sort_formats(formats) + + thumbnail = 
self._og_search_thumbnail(webpage) + upload_date = unified_strdate(self._search_regex( + r'<span[^>]+>ADDED ([^<]+)<', + webpage, 'upload date', fatal=False)) + duration = int_or_none(self._og_search_property( + 'video:duration', webpage, default=None) or self._search_regex( + r'videoDuration\s*:\s*(\d+)', webpage, 'duration', default=None)) + view_count = str_to_int(self._search_regex( + (r'<div[^>]*>Views</div>\s*<div[^>]*>\s*([\d,.]+)', + r'<span[^>]*>VIEWS</span>\s*</td>\s*<td>\s*([\d,.]+)'), + webpage, 'view count', fatal=False)) # No self-labeling, but they describe themselves as # "Home of Videos Porno" @@ -37,9 +104,12 @@ return { 'id': video_id, - 'url': video_url, 'ext': 'mp4', - 'title': video_title, - 'thumbnail': video_thumbnail, + 'title': title, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'duration': duration, + 'view_count': view_count, 'age_limit': age_limit, + 'formats': formats, } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/rentv.py youtube-dl-2019.05.20/youtube_dl/extractor/rentv.py --- youtube-dl-2016.02.22/youtube_dl/extractor/rentv.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/rentv.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,106 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + int_or_none, + url_or_none, +) + + +class RENTVIE(InfoExtractor): + _VALID_URL = r'(?:rentv:|https?://(?:www\.)?ren\.tv/(?:player|video/epizod)/)(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://ren.tv/video/epizod/118577', + 'md5': 'd91851bf9af73c0ad9b2cdf76c127fbb', + 'info_dict': { + 'id': '118577', + 'ext': 'mp4', + 'title': 'Документальный спецпроект: "Промывка мозгов. 
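[Note on the RedTube format parsing above: streams can appear in two shapes, a sources object mapping a quality label to a URL and a mediaDefinition array of videoUrl/quality pairs; in both, the numeric label doubles as the format height. A merged sketch of that normalization (int_or_none and url_or_none are the youtube-dl utilities imported above):

    from youtube_dl.utils import int_or_none, url_or_none

    def redtube_formats(sources, medias):
        formats = []
        for format_id, format_url in (sources or {}).items():
            if format_url:
                formats.append({
                    'url': format_url,
                    'format_id': format_id,
                    'height': int_or_none(format_id),  # e.g. '480' -> 480
                })
        for media in medias or []:
            format_url = url_or_none(media.get('videoUrl'))
            if not format_url:
                continue
            format_id = media.get('quality')
            formats.append({
                'url': format_url,
                'format_id': format_id,
                'height': int_or_none(format_id),
            })
        return formats
]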
Технологии XXI века"', + 'timestamp': 1472230800, + 'upload_date': '20160826', + } + }, { + 'url': 'http://ren.tv/player/118577', + 'only_matching': True, + }, { + 'url': 'rentv:118577', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage('http://ren.tv/player/' + video_id, video_id) + config = self._parse_json(self._search_regex( + r'config\s*=\s*({.+})\s*;', webpage, 'config'), video_id) + title = config['title'] + formats = [] + for video in config['src']: + src = url_or_none(video.get('src')) + if not src: + continue + ext = determine_ext(src) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + src, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': src, + }) + self._sort_formats(formats) + return { + 'id': video_id, + 'title': title, + 'description': config.get('description'), + 'thumbnail': config.get('image'), + 'duration': int_or_none(config.get('duration')), + 'timestamp': int_or_none(config.get('date')), + 'formats': formats, + } + + +class RENTVArticleIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ren\.tv/novosti/\d{4}-\d{2}-\d{2}/(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'http://ren.tv/novosti/2016-10-26/video-mikroavtobus-popavshiy-v-dtp-s-gruzovikami-v-podmoskove-prevratilsya-v', + 'md5': 'ebd63c4680b167693745ab91343df1d6', + 'info_dict': { + 'id': '136472', + 'ext': 'mp4', + 'title': 'Видео: микроавтобус, попавший в ДТП с грузовиками в Подмосковье, превратился в груду металла', + 'description': 'Жертвами столкновения двух фур и микроавтобуса, по последним данным, стали семь человек.', + } + }, { + # TODO: invalid m3u8 + 'url': 'http://ren.tv/novosti/2015-09-25/sluchaynyy-prohozhiy-poymal-avtougonshchika-v-murmanske-video', + 'info_dict': { + 'id': 'playlist', + 'ext': 'mp4', + 'title': 'Случайный прохожий поймал автоугонщика в Мурманске. 
ВИДЕО | РЕН ТВ', + 'uploader': 'ren.tv', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + }, + 'skip': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + drupal_settings = self._parse_json(self._search_regex( + r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', + webpage, 'drupal settings'), display_id) + + entries = [] + for config_profile in drupal_settings.get('ren_jwplayer', {}).values(): + media_id = config_profile.get('mediaid') + if not media_id: + continue + media_id = compat_str(media_id) + entries.append(self.url_result('rentv:' + media_id, 'RENTV', media_id)) + return self.playlist_result(entries, display_id) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/restudy.py youtube-dl-2019.05.20/youtube_dl/extractor/restudy.py --- youtube-dl-2016.02.22/youtube_dl/extractor/restudy.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/restudy.py 2019-06-07 18:49:07.000000000 +0000 @@ -5,8 +5,8 @@ class RestudyIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?restudy\.dk/video/play/id/(?P<id>[0-9]+)' - _TEST = { + _VALID_URL = r'https?://(?:(?:www|portal)\.)?restudy\.dk/video/[^/]+/id/(?P<id>[0-9]+)' + _TESTS = [{ 'url': 'https://www.restudy.dk/video/play/id/1637', 'info_dict': { 'id': '1637', @@ -18,7 +18,10 @@ # rtmp download 'skip_download': True, } - } + }, { + 'url': 'https://portal.restudy.dk/video/leiden-frosteffekt/id/1637', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -29,8 +32,9 @@ description = self._og_search_description(webpage).strip() formats = self._extract_smil_formats( - 'https://www.restudy.dk/awsmedia/SmilDirectory/video_%s.xml' % video_id, + 'https://cdn.portal.restudy.dk/dynamic/themes/front/awsmedia/SmilDirectory/video_%s.xml' % video_id, video_id) + self._sort_formats(formats) return { 'id': video_id, diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/reuters.py youtube-dl-2019.05.20/youtube_dl/extractor/reuters.py --- youtube-dl-2016.02.22/youtube_dl/extractor/reuters.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/reuters.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,69 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + js_to_json, + int_or_none, + unescapeHTML, +) + + +class ReutersIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?reuters\.com/.*?\?.*?videoId=(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.reuters.com/video/2016/05/20/san-francisco-police-chief-resigns?videoId=368575562', + 'md5': '8015113643a0b12838f160b0b81cc2ee', + 'info_dict': { + 'id': '368575562', + 'ext': 'mp4', + 'title': 'San Francisco police chief resigns', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage( + 'http://www.reuters.com/assets/iframe/yovideo?videoId=%s' % video_id, video_id) + video_data = js_to_json(self._search_regex( + r'(?s)Reuters\.yovideo\.drawPlayer\(({.*?})\);', + webpage, 'video data')) + + def get_json_value(key, fatal=False): + return self._search_regex(r'"%s"\s*:\s*"([^"]+)"' % key, video_data, key, fatal=fatal) + + title = unescapeHTML(get_json_value('title', fatal=True)) + mmid, fid = re.search(r',/(\d+)\?f=(\d+)', get_json_value('flv', fatal=True)).groups() + + mas_data = self._download_json( + 'http://mas-e.cds1.yospace.com/mas/%s/%s?trans=json' % (mmid, fid), + video_id, 
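[Note on RENTVArticleIE above: an article page can embed several players, so the extractor pulls every jwplayer profile out of the page's Drupal settings and turns each mediaid into an internal rentv:<id> URL that RENTVIE then resolves. Condensed:

    def article_media_ids(drupal_settings):
        # one entry per embedded player; each id becomes a 'rentv:<id>' URL
        ids = []
        for profile in (drupal_settings.get('ren_jwplayer') or {}).values():
            media_id = profile.get('mediaid')
            if media_id:
                ids.append('rentv:%s' % media_id)
        return ids
]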
transform_source=js_to_json) + formats = [] + for f in mas_data: + f_url = f.get('url') + if not f_url: + continue + method = f.get('method') + if method == 'hls': + formats.extend(self._extract_m3u8_formats( + f_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + else: + container = f.get('container') + ext = '3gp' if method == 'mobile' else container + formats.append({ + 'format_id': ext, + 'url': f_url, + 'ext': ext, + 'container': container if method != 'mobile' else None, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': get_json_value('thumb'), + 'duration': int_or_none(get_json_value('seconds')), + 'formats': formats, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/reverbnation.py youtube-dl-2019.05.20/youtube_dl/extractor/reverbnation.py --- youtube-dl-2016.02.22/youtube_dl/extractor/reverbnation.py 2016-02-14 22:26:17.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/reverbnation.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,29 +1,29 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import str_or_none +from ..utils import ( + qualities, + str_or_none, +) class ReverbNationIE(InfoExtractor): _VALID_URL = r'^https?://(?:www\.)?reverbnation\.com/.*?/song/(?P<id>\d+).*?$' _TESTS = [{ 'url': 'http://www.reverbnation.com/alkilados/song/16965047-mona-lisa', - 'md5': '3da12ebca28c67c111a7f8b262d3f7a7', + 'md5': 'c0aaf339bcee189495fdf5a8c8ba8645', 'info_dict': { 'id': '16965047', 'ext': 'mp3', 'title': 'MONA LISA', 'uploader': 'ALKILADOS', 'uploader_id': '216429', - 'thumbnail': 're:^https://gp1\.wac\.edgecastcdn\.net/.*?\.jpg$' + 'thumbnail': r're:^https?://.*\.jpg', }, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - song_id = mobj.group('id') + song_id = self._match_id(url) api_res = self._download_json( 'https://api.reverbnation.com/song/%s' % song_id, @@ -31,14 +31,23 @@ note='Downloading information of song %s' % song_id ) + THUMBNAILS = ('thumbnail', 'image') + quality = qualities(THUMBNAILS) + thumbnails = [] + for thumb_key in THUMBNAILS: + if api_res.get(thumb_key): + thumbnails.append({ + 'url': api_res[thumb_key], + 'preference': quality(thumb_key) + }) + return { 'id': song_id, - 'title': api_res.get('name'), - 'url': api_res.get('url'), + 'title': api_res['name'], + 'url': api_res['url'], 'uploader': api_res.get('artist', {}).get('name'), 'uploader_id': str_or_none(api_res.get('artist', {}).get('id')), - 'thumbnail': self._proto_relative_url( - api_res.get('image', api_res.get('thumbnail'))), + 'thumbnails': thumbnails, 'ext': 'mp3', 'vcodec': 'none', } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/revision3.py youtube-dl-2019.05.20/youtube_dl/extractor/revision3.py --- youtube-dl-2016.02.22/youtube_dl/extractor/revision3.py 2016-01-09 00:15:59.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/revision3.py 2019-06-07 18:49:07.000000000 +0000 @@ -13,13 +13,69 @@ ) +class Revision3EmbedIE(InfoExtractor): + IE_NAME = 'revision3:embed' + _VALID_URL = r'(?:revision3:(?:(?P<playlist_type>[^:]+):)?|https?://(?:(?:(?:www|embed)\.)?(?:revision3|animalist)|(?:(?:api|embed)\.)?seekernetwork)\.com/player/embed\?videoId=)(?P<playlist_id>\d+)' + _TEST = { + 'url': 'http://api.seekernetwork.com/player/embed?videoId=67558', + 'md5': '83bcd157cab89ad7318dd7b8c9cf1306', + 'info_dict': { + 'id': '67558', + 'ext': 'mp4', + 'title': 'The Pros & Cons Of Zoos', + 'description': 'Zoos are often depicted as a terrible 
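[Note on the ReverbNation change above: both thumbnail variants are now returned and ranked with youtube-dl's qualities helper, which simply turns position in a tuple into a numeric preference (the same helper reappears in revision3.py below). A short illustration:

    from youtube_dl.utils import qualities

    THUMBNAILS = ('thumbnail', 'image')   # ordered worst to best
    quality = qualities(THUMBNAILS)

    assert quality('thumbnail') == 0      # lower preference
    assert quality('image') == 1          # higher preference
    assert quality('unknown') == -1       # unlisted keys sort below everything
]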
place for animals to live, but is there any truth to this?', + 'uploader_id': 'dnews', + 'uploader': 'DNews', + } + } + _API_KEY = 'ba9c741bce1b9d8e3defcc22193f3651b8867e62' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + playlist_id = mobj.group('playlist_id') + playlist_type = mobj.group('playlist_type') or 'video_id' + video_data = self._download_json( + 'http://revision3.com/api/getPlaylist.json', playlist_id, query={ + 'api_key': self._API_KEY, + 'codecs': 'h264,vp8,theora', + playlist_type: playlist_id, + })['items'][0] + + formats = [] + for vcodec, media in video_data['media'].items(): + for quality_id, quality in media.items(): + if quality_id == 'hls': + formats.extend(self._extract_m3u8_formats( + quality['url'], playlist_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': quality['url'], + 'format_id': '%s-%s' % (vcodec, quality_id), + 'tbr': int_or_none(quality.get('bitrate')), + 'vcodec': vcodec, + }) + self._sort_formats(formats) + + return { + 'id': playlist_id, + 'title': unescapeHTML(video_data['title']), + 'description': unescapeHTML(video_data.get('summary')), + 'uploader': video_data.get('show', {}).get('name'), + 'uploader_id': video_data.get('show', {}).get('slug'), + 'duration': int_or_none(video_data.get('duration')), + 'formats': formats, + } + + class Revision3IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:revision3|testtube|animalist)\.com)/(?P<id>[^/]+(?:/[^/?#]+)?)' + IE_NAME = 'revision' + _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:revision3|animalist)\.com)/(?P<id>[^/]+(?:/[^/?#]+)?)' _TESTS = [{ 'url': 'http://www.revision3.com/technobuffalo/5-google-predictions-for-2016', 'md5': 'd94a72d85d0a829766de4deb8daaf7df', 'info_dict': { - 'id': '73034', + 'id': '71089', 'display_id': 'technobuffalo/5-google-predictions-for-2016', 'ext': 'webm', 'title': '5 Google Predictions for 2016', @@ -31,89 +87,76 @@ 'uploader_id': 'technobuffalo', } }, { - 'url': 'http://testtube.com/brainstuff', - 'info_dict': { - 'id': '251', - 'title': 'BrainStuff', - 'description': 'Whether the topic is popcorn or particle physics, you can count on the HowStuffWorks team to explore-and explain-the everyday science in the world around us on BrainStuff.', - }, - 'playlist_mincount': 93, + # Show + 'url': 'http://revision3.com/variant', + 'only_matching': True, }, { - 'url': 'https://testtube.com/dnews/5-weird-ways-plants-can-eat-animals?utm_source=FB&utm_medium=DNews&utm_campaign=DNewsSocial', - 'info_dict': { - 'id': '60163', - 'display_id': 'dnews/5-weird-ways-plants-can-eat-animals', - 'duration': 275, - 'ext': 'webm', - 'title': '5 Weird Ways Plants Can Eat Animals', - 'description': 'Why have some plants evolved to eat meat?', - 'upload_date': '20150120', - 'timestamp': 1421763300, - 'uploader': 'DNews', - 'uploader_id': 'dnews', - }, + # Tag + 'url': 'http://revision3.com/vr', + 'only_matching': True, }] _PAGE_DATA_TEMPLATE = 'http://www.%s/apiProxy/ddn/%s?domain=%s' - _API_KEY = 'ba9c741bce1b9d8e3defcc22193f3651b8867e62' def _real_extract(self, url): domain, display_id = re.match(self._VALID_URL, url).groups() + site = domain.split('.')[0] page_info = self._download_json( self._PAGE_DATA_TEMPLATE % (domain, display_id, domain), display_id) - if page_info['data']['type'] == 'episode': - episode_data = page_info['data'] - video_id = compat_str(episode_data['video']['data']['id']) - video_data = self._download_json( - 
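[Note on Revision3EmbedIE above: the getPlaylist response nests formats two levels deep, media[vcodec][quality_id]. HLS entries expand into variant formats, everything else becomes one progressive download labelled vcodec-quality. A condensed sketch of that flattening (the HLS expansion, done via _extract_m3u8_formats above, is reduced to a single marker entry here):

    def flatten_media(media):
        formats = []
        for vcodec, by_quality in (media or {}).items():
            for quality_id, quality in by_quality.items():
                if quality_id == 'hls':
                    # real code expands the master playlist into variants
                    formats.append({'url': quality['url'],
                                    'protocol': 'm3u8_native'})
                else:
                    formats.append({
                        'url': quality['url'],
                        'format_id': '%s-%s' % (vcodec, quality_id),
                        'tbr': quality.get('bitrate'),
                        'vcodec': vcodec,
                    })
        return formats
]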
'http://revision3.com/api/getPlaylist.json?api_key=%s&codecs=h264,vp8,theora&video_id=%s' % (self._API_KEY, video_id), - video_id)['items'][0] - - formats = [] - for vcodec, media in video_data['media'].items(): - for quality_id, quality in media.items(): - if quality_id == 'hls': - formats.extend(self._extract_m3u8_formats( - quality['url'], video_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False)) - else: - formats.append({ - 'url': quality['url'], - 'format_id': '%s-%s' % (vcodec, quality_id), - 'tbr': int_or_none(quality.get('bitrate')), - 'vcodec': vcodec, - }) - self._sort_formats(formats) + page_data = page_info['data'] + page_type = page_data['type'] + if page_type in ('episode', 'embed'): + show_data = page_data['show']['data'] + page_id = compat_str(page_data['id']) + video_id = compat_str(page_data['video']['data']['id']) preference = qualities(['mini', 'small', 'medium', 'large']) thumbnails = [{ 'url': image_url, 'id': image_id, 'preference': preference(image_id) - } for image_id, image_url in video_data.get('images', {}).items()] + } for image_id, image_url in page_data.get('images', {}).items()] - return { - 'id': video_id, + info = { + 'id': page_id, 'display_id': display_id, - 'title': unescapeHTML(video_data['title']), - 'description': unescapeHTML(video_data.get('summary')), - 'timestamp': parse_iso8601(episode_data.get('publishTime'), ' '), - 'author': episode_data.get('author'), - 'uploader': video_data.get('show', {}).get('name'), - 'uploader_id': video_data.get('show', {}).get('slug'), - 'duration': int_or_none(video_data.get('duration')), + 'title': unescapeHTML(page_data['name']), + 'description': unescapeHTML(page_data.get('summary')), + 'timestamp': parse_iso8601(page_data.get('publishTime'), ' '), + 'author': page_data.get('author'), + 'uploader': show_data.get('name'), + 'uploader_id': show_data.get('slug'), 'thumbnails': thumbnails, - 'formats': formats, + 'extractor_key': site, } + + if page_type == 'embed': + info.update({ + '_type': 'url_transparent', + 'url': page_data['video']['data']['embed'], + }) + return info + + info.update({ + '_type': 'url_transparent', + 'url': 'revision3:%s' % video_id, + }) + return info else: - show_data = page_info['show']['data'] + list_data = page_info[page_type]['data'] episodes_data = page_info['episodes']['data'] num_episodes = page_info['meta']['totalEpisodes'] processed_episodes = 0 entries = [] page_num = 1 while True: - entries.extend([self.url_result( - 'http://%s/%s/%s' % (domain, display_id, episode['slug'])) for episode in episodes_data]) + entries.extend([{ + '_type': 'url', + 'url': 'http://%s%s' % (domain, episode['path']), + 'id': compat_str(episode['id']), + 'ie_key': 'Revision3', + 'extractor_key': site, + } for episode in episodes_data]) processed_episodes += len(episodes_data) if processed_episodes == num_episodes: break @@ -123,5 +166,5 @@ display_id)['episodes']['data'] return self.playlist_result( - entries, compat_str(show_data['id']), - show_data.get('name'), show_data.get('summary')) + entries, compat_str(list_data['id']), + list_data.get('name'), list_data.get('summary')) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/rice.py youtube-dl-2019.05.20/youtube_dl/extractor/rice.py --- youtube-dl-2016.02.22/youtube_dl/extractor/rice.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/rice.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,116 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from 
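[Note on the Revision3IE show/tag branch above: episode lists are paged, and the loop keeps requesting pages until the count advertised in meta.totalEpisodes is reached. The loop in miniature (fetch_page stands in for the paged apiProxy request; the empty-page check is a defensive addition, not in the original):

    def collect_episodes(fetch_page, total_episodes):
        entries, page_num = [], 1
        while True:
            page = fetch_page(page_num)  # one page of episode entries
            entries.extend(page)
            if not page or len(entries) >= total_episodes:
                return entries
            page_num += 1
]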
..compat import compat_parse_qs +from ..utils import ( + xpath_text, + xpath_element, + int_or_none, + parse_iso8601, + ExtractorError, +) + + +class RICEIE(InfoExtractor): + _VALID_URL = r'https?://mediahub\.rice\.edu/app/[Pp]ortal/video\.aspx\?(?P<query>.+)' + _TEST = { + 'url': 'https://mediahub.rice.edu/app/Portal/video.aspx?PortalID=25ffd62c-3d01-4b29-8c70-7c94270efb3e&DestinationID=66bc9434-03bd-4725-b47e-c659d8d809db&ContentID=YEWIvbhb40aqdjMD1ALSqw', + 'md5': '9b83b4a2eead4912dc3b7fac7c449b6a', + 'info_dict': { + 'id': 'YEWIvbhb40aqdjMD1ALSqw', + 'ext': 'mp4', + 'title': 'Active Learning in Archeology', + 'upload_date': '20140616', + 'timestamp': 1402926346, + } + } + _NS = 'http://schemas.datacontract.org/2004/07/ensembleVideo.Data.Service.Contracts.Models.Player.Config' + + def _real_extract(self, url): + qs = compat_parse_qs(re.match(self._VALID_URL, url).group('query')) + if not qs.get('PortalID') or not qs.get('DestinationID') or not qs.get('ContentID'): + raise ExtractorError('Invalid URL', expected=True) + + portal_id = qs['PortalID'][0] + playlist_id = qs['DestinationID'][0] + content_id = qs['ContentID'][0] + + content_data = self._download_xml('https://mediahub.rice.edu/api/portal/GetContentTitle', content_id, query={ + 'portalId': portal_id, + 'playlistId': playlist_id, + 'contentId': content_id + }) + metadata = xpath_element(content_data, './/metaData', fatal=True) + title = xpath_text(metadata, 'primaryTitle', fatal=True) + encodings = xpath_element(content_data, './/encodings', fatal=True) + player_data = self._download_xml('https://mediahub.rice.edu/api/player/GetPlayerConfig', content_id, query={ + 'temporaryLinkId': xpath_text(encodings, 'temporaryLinkId', fatal=True), + 'contentId': content_id, + }) + + common_fmt = {} + dimensions = xpath_text(encodings, 'dimensions') + if dimensions: + wh = dimensions.split('x') + if len(wh) == 2: + common_fmt.update({ + 'width': int_or_none(wh[0]), + 'height': int_or_none(wh[1]), + }) + + formats = [] + rtsp_path = xpath_text(player_data, self._xpath_ns('RtspPath', self._NS)) + if rtsp_path: + fmt = { + 'url': rtsp_path, + 'format_id': 'rtsp', + } + fmt.update(common_fmt) + formats.append(fmt) + for source in player_data.findall(self._xpath_ns('.//Source', self._NS)): + video_url = xpath_text(source, self._xpath_ns('File', self._NS)) + if not video_url: + continue + if '.m3u8' in video_url: + formats.extend(self._extract_m3u8_formats(video_url, content_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + else: + fmt = { + 'url': video_url, + 'format_id': video_url.split(':')[0], + } + fmt.update(common_fmt) + rtmp = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>.+))/(?P<playpath>mp4:.+)$', video_url) + if rtmp: + fmt.update({ + 'url': rtmp.group('url'), + 'play_path': rtmp.group('playpath'), + 'app': rtmp.group('app'), + 'ext': 'flv', + }) + formats.append(fmt) + self._sort_formats(formats) + + thumbnails = [] + for content_asset in content_data.findall('.//contentAssets'): + asset_type = xpath_text(content_asset, 'type') + if asset_type == 'image': + image_url = xpath_text(content_asset, 'httpPath') + if not image_url: + continue + thumbnails.append({ + 'id': xpath_text(content_asset, 'ID'), + 'url': image_url, + }) + + return { + 'id': content_id, + 'title': title, + 'description': xpath_text(metadata, 'abstract'), + 'duration': int_or_none(xpath_text(metadata, 'duration')), + 'timestamp': parse_iso8601(xpath_text(metadata, 'dateUpdated')), + 'thumbnails': thumbnails, + 'formats': formats, + } diff -Nru 
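[Note on the RICEIE format assembly above: the Ensemble player can return rtsp://, rtmp:// or HLS sources, and rtmp URLs are decomposed into connection URL, app and play path so the rtmpdump-based downloader can be driven. The splitting rule in isolation (the asserted URL is synthetic):

    import re

    RTMP_RE = r'^(?P<url>rtmp://[^/]+/(?P<app>.+))/(?P<playpath>mp4:.+)$'

    def split_rtmp(video_url):
        m = re.search(RTMP_RE, video_url)
        if not m:
            return None
        return {
            'url': m.group('url'),             # rtmp://host/app
            'app': m.group('app'),             # application name
            'play_path': m.group('playpath'),  # mp4:... stream path
            'ext': 'flv',
        }

    assert split_rtmp('rtmp://example.com/vod/mp4:clip.mp4') == {
        'url': 'rtmp://example.com/vod',
        'app': 'vod',
        'play_path': 'mp4:clip.mp4',
        'ext': 'flv',
    }
]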
youtube-dl-2016.02.22/youtube_dl/extractor/ringtv.py youtube-dl-2019.05.20/youtube_dl/extractor/ringtv.py --- youtube-dl-2016.02.22/youtube_dl/extractor/ringtv.py 2016-02-14 22:26:17.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/ringtv.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,44 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - - -class RingTVIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?ringtv\.craveonline\.com/(?P<type>news|videos/video)/(?P<id>[^/?#]+)' - _TEST = { - 'url': 'http://ringtv.craveonline.com/news/310833-luis-collazo-says-victor-ortiz-better-not-quit-on-jan-30', - 'md5': 'd25945f5df41cdca2d2587165ac28720', - 'info_dict': { - 'id': '857645', - 'ext': 'mp4', - 'title': 'Video: Luis Collazo says Victor Ortiz "better not quit on Jan. 30" - Ring TV', - 'description': 'Luis Collazo is excited about his Jan. 30 showdown with fellow former welterweight titleholder Victor Ortiz at Barclays Center in his hometown of Brooklyn. The SuperBowl week fight headlines a Golden Boy Live! card on Fox Sports 1.', - } - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id').split('-')[0] - webpage = self._download_webpage(url, video_id) - - if mobj.group('type') == 'news': - video_id = self._search_regex( - r'''(?x)<iframe[^>]+src="http://cms\.springboardplatform\.com/ - embed_iframe/[0-9]+/video/([0-9]+)/''', - webpage, 'real video ID') - title = self._og_search_title(webpage) - description = self._html_search_regex( - r'addthis:description="([^"]+)"', - webpage, 'description', fatal=False) - final_url = 'http://ringtv.craveonline.springboardplatform.com/storage/ringtv.craveonline.com/conversion/%s.mp4' % video_id - thumbnail_url = 'http://ringtv.craveonline.springboardplatform.com/storage/ringtv.craveonline.com/snapshots/%s.jpg' % video_id - - return { - 'id': video_id, - 'url': final_url, - 'title': title, - 'thumbnail': thumbnail_url, - 'description': description, - } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/rmcdecouverte.py youtube-dl-2019.05.20/youtube_dl/extractor/rmcdecouverte.py --- youtube-dl-2016.02.22/youtube_dl/extractor/rmcdecouverte.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/rmcdecouverte.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,55 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .brightcove import BrightcoveLegacyIE +from ..compat import ( + compat_parse_qs, + compat_urlparse, +) +from ..utils import smuggle_url + + +class RMCDecouverteIE(InfoExtractor): + _VALID_URL = r'https?://rmcdecouverte\.bfmtv\.com/(?:(?:[^/]+/)*program_(?P<id>\d+)|(?P<live_id>mediaplayer-direct))' + + _TESTS = [{ + 'url': 'https://rmcdecouverte.bfmtv.com/wheeler-dealers-occasions-a-saisir/program_2566/', + 'info_dict': { + 'id': '5983675500001', + 'ext': 'mp4', + 'title': 'CORVETTE', + 'description': 'md5:c1e8295521e45ffebf635d6a7658f506', + 'uploader_id': '1969646226001', + 'upload_date': '20181226', + 'timestamp': 1545861635, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'only available for a week', + }, { + # live, geo restricted, bypassable + 'url': 'https://rmcdecouverte.bfmtv.com/mediaplayer-direct/', + 'only_matching': True, + }] + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1969646226001/default_default/index.html?videoId=%s' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id = 
mobj.group('id') or mobj.group('live_id') + webpage = self._download_webpage(url, display_id) + brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) + if brightcove_legacy_url: + brightcove_id = compat_parse_qs(compat_urlparse.urlparse( + brightcove_legacy_url).query)['@videoPlayer'][0] + else: + brightcove_id = self._search_regex( + r'data-video-id=["\'](\d+)', webpage, 'brightcove id') + return self.url_result( + smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, + {'geo_countries': ['FR']}), + 'BrightcoveNew', brightcove_id) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/ro220.py youtube-dl-2019.05.20/youtube_dl/extractor/ro220.py --- youtube-dl-2016.02.22/youtube_dl/extractor/ro220.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/ro220.py 2019-06-07 18:49:07.000000000 +0000 @@ -14,7 +14,7 @@ 'id': 'LYV6doKo7f', 'ext': 'mp4', 'title': 'Luati-le Banii sez 4 ep 1', - 'description': 're:^Iata-ne reveniti dupa o binemeritata vacanta\. +Va astept si pe Facebook cu pareri si comentarii.$', + 'description': r're:^Iata-ne reveniti dupa o binemeritata vacanta\. +Va astept si pe Facebook cu pareri si comentarii.$', } } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/rockstargames.py youtube-dl-2019.05.20/youtube_dl/extractor/rockstargames.py --- youtube-dl-2016.02.22/youtube_dl/extractor/rockstargames.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/rockstargames.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,69 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, +) + + +class RockstarGamesIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rockstargames\.com/videos(?:/video/|#?/?\?.*\bvideo=)(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.rockstargames.com/videos/video/11544/', + 'md5': '03b5caa6e357a4bd50e3143fc03e5733', + 'info_dict': { + 'id': '11544', + 'ext': 'mp4', + 'title': 'Further Adventures in Finance and Felony Trailer', + 'description': 'md5:6d31f55f30cb101b5476c4a379e324a3', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1464876000, + 'upload_date': '20160602', + } + }, { + 'url': 'http://www.rockstargames.com/videos#/?video=48', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + video = self._download_json( + 'https://www.rockstargames.com/videoplayer/videos/get-video.json', + video_id, query={ + 'id': video_id, + 'locale': 'en_us', + })['video'] + + title = video['title'] + + formats = [] + for video in video['files_processed']['video/mp4']: + if not video.get('src'): + continue + resolution = video.get('resolution') + height = int_or_none(self._search_regex( + r'^(\d+)[pP]$', resolution or '', 'height', default=None)) + formats.append({ + 'url': self._proto_relative_url(video['src']), + 'format_id': resolution, + 'height': height, + }) + + if not formats: + youtube_id = video.get('youtube_id') + if youtube_id: + return self.url_result(youtube_id, 'Youtube') + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': video.get('description'), + 'thumbnail': self._proto_relative_url(video.get('screencap')), + 'timestamp': parse_iso8601(video.get('created')), + 'formats': formats, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/roosterteeth.py youtube-dl-2019.05.20/youtube_dl/extractor/roosterteeth.py --- 
youtube-dl-2016.02.22/youtube_dl/extractor/roosterteeth.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/roosterteeth.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,148 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + strip_or_none, + unescapeHTML, + urlencode_postdata, +) + + +class RoosterTeethIE(InfoExtractor): + _VALID_URL = r'https?://(?:.+?\.)?roosterteeth\.com/episode/(?P<id>[^/?#&]+)' + _LOGIN_URL = 'https://roosterteeth.com/login' + _NETRC_MACHINE = 'roosterteeth' + _TESTS = [{ + 'url': 'http://roosterteeth.com/episode/million-dollars-but-season-2-million-dollars-but-the-game-announcement', + 'md5': 'e2bd7764732d785ef797700a2489f212', + 'info_dict': { + 'id': '26576', + 'display_id': 'million-dollars-but-season-2-million-dollars-but-the-game-announcement', + 'ext': 'mp4', + 'title': 'Million Dollars, But...: Million Dollars, But... The Game Announcement', + 'description': 'md5:0cc3b21986d54ed815f5faeccd9a9ca5', + 'thumbnail': r're:^https?://.*\.png$', + 'series': 'Million Dollars, But...', + 'episode': 'Million Dollars, But... The Game Announcement', + 'comment_count': int, + }, + }, { + 'url': 'http://achievementhunter.roosterteeth.com/episode/off-topic-the-achievement-hunter-podcast-2016-i-didn-t-think-it-would-pass-31', + 'only_matching': True, + }, { + 'url': 'http://funhaus.roosterteeth.com/episode/funhaus-shorts-2016-austin-sucks-funhaus-shorts', + 'only_matching': True, + }, { + 'url': 'http://screwattack.roosterteeth.com/episode/death-battle-season-3-mewtwo-vs-shadow', + 'only_matching': True, + }, { + 'url': 'http://theknow.roosterteeth.com/episode/the-know-game-news-season-1-boring-steam-sales-are-better', + 'only_matching': True, + }, { + # only available for FIRST members + 'url': 'http://roosterteeth.com/episode/rt-docs-the-world-s-greatest-head-massage-the-world-s-greatest-head-massage-an-asmr-journey-part-one', + 'only_matching': True, + }] + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + login_page = self._download_webpage( + self._LOGIN_URL, None, + note='Downloading login page', + errnote='Unable to download login page') + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'username': username, + 'password': password, + }) + + login_request = self._download_webpage( + self._LOGIN_URL, None, + note='Logging in', + data=urlencode_postdata(login_form), + headers={ + 'Referer': self._LOGIN_URL, + }) + + if not any(re.search(p, login_request) for p in ( + r'href=["\']https?://(?:www\.)?roosterteeth\.com/logout"', + r'>Sign Out<')): + error = self._html_search_regex( + r'(?s)<div[^>]+class=(["\']).*?\balert-danger\b.*?\1[^>]*>(?:\s*<button[^>]*>.*?</button>)?(?P<error>.+?)</div>', + login_request, 'alert', default=None, group='error') + if error: + raise ExtractorError('Unable to login: %s' % error, expected=True) + raise ExtractorError('Unable to log in') + + def _real_initialize(self): + self._login() + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + episode = strip_or_none(unescapeHTML(self._search_regex( + (r'videoTitle\s*=\s*(["\'])(?P<title>(?:(?!\1).)+)\1', + r'<title>(?P<title>[^<]+)'), webpage, 'title', + default=None, group='title'))) + + title = strip_or_none(self._og_search_title( + webpage, default=None)) or episode + + m3u8_url = 
self._search_regex( + r'file\s*:\s*(["\'])(?P<url>http.+?\.m3u8.*?)\1', + webpage, 'm3u8 url', default=None, group='url') + + if not m3u8_url: + if re.search(r'<div[^>]+class=["\']non-sponsor', webpage): + self.raise_login_required( + '%s is only available for FIRST members' % display_id) + + if re.search(r'<div[^>]+class=["\']golive-gate', webpage): + self.raise_login_required('%s is not available yet' % display_id) + + raise ExtractorError('Unable to extract m3u8 URL') + + formats = self._extract_m3u8_formats( + m3u8_url, display_id, ext='mp4', + entry_protocol='m3u8_native', m3u8_id='hls') + self._sort_formats(formats) + + description = strip_or_none(self._og_search_description(webpage)) + thumbnail = self._proto_relative_url(self._og_search_thumbnail(webpage)) + + series = self._search_regex( + (r'<h2>More ([^<]+)</h2>', + r'<a[^>]+>See All ([^<]+) Videos<'), + webpage, 'series', fatal=False) + + comment_count = int_or_none(self._search_regex( + r'>Comments \((\d+)\)<', webpage, + 'comment count', fatal=False)) + + video_id = self._search_regex( + (r'containerId\s*=\s*["\']episode-(\d+)\1', + r'<div[^>]+id=["\']episode-(\d+)'), webpage, + 'video id', default=display_id) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'series': series, + 'episode': episode, + 'comment_count': comment_count, + 'formats': formats, + }
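An aside on the login handling in RoosterTeethIE above, since the same pattern recurs throughout youtube-dl: download the login page, carry over its hidden form inputs (CSRF tokens and the like), overlay the credentials, then POST everything back with a Referer header. A minimal standalone sketch of that flow, standard library only; the URL, the field names and the simplified hidden-input regex are illustrative, not taken from the patch:

import re
try:
    from urllib.parse import urlencode           # Python 3
    from urllib.request import Request, urlopen
except ImportError:                              # Python 2, which youtube-dl still supports
    from urllib import urlencode
    from urllib2 import Request, urlopen

def hidden_inputs(html):
    # Rough stand-in for InfoExtractor._hidden_inputs: collect
    # <input type="hidden" name=... value=...> pairs (attribute order fixed here).
    return dict(re.findall(
        r'<input[^>]+type=["\']hidden["\'][^>]+name=["\']([^"\']+)["\'][^>]+value=["\']([^"\']*)["\']',
        html))

def login(login_url, username, password):
    form = hidden_inputs(urlopen(login_url).read().decode('utf-8'))
    form.update({'username': username, 'password': password})   # field names vary per site
    req = Request(login_url, data=urlencode(form).encode('utf-8'),
                  headers={'Referer': login_url})
    return urlopen(req).read().decode('utf-8')   # caller then checks for a logout link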
diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/rottentomatoes.py youtube-dl-2019.05.20/youtube_dl/extractor/rottentomatoes.py --- youtube-dl-2016.02.22/youtube_dl/extractor/rottentomatoes.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/rottentomatoes.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,11 +1,32 @@ from __future__ import unicode_literals -from .videodetective import VideoDetectiveIE +from .common import InfoExtractor +from .internetvideoarchive import InternetVideoArchiveIE -class RottenTomatoesIE(VideoDetectiveIE): - _VALID_URL = r'https?://www\.rottentomatoes\.com/m/[^/]+/trailers/(?P<id>\d+)' +class RottenTomatoesIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rottentomatoes\.com/m/[^/]+/trailers/(?P<id>\d+)' _TEST = { 'url': 'http://www.rottentomatoes.com/m/toy_story_3/trailers/11028566/', 'info_dict': { - 'id': '613340', + 'id': '11028566', 'ext': 'mp4', - 'title': 'TOY STORY 3', + 'title': 'Toy Story 3', 'description': 'From the creators of the beloved TOY STORY films, comes a story that will reunite the gang in a whole new way.', + 'thumbnail': r're:^https?://.*\.jpg$', }, } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + iva_id = self._search_regex(r'publishedid=(\d+)', webpage, 'internet video archive id') + + return { + '_type': 'url_transparent', + 'url': 'http://video.internetvideoarchive.net/player/6/configuration.ashx?domain=www.videodetective.com&customerid=69249&playerid=641&publishedid=' + iva_id, + 'ie_key': InternetVideoArchiveIE.ie_key(), + 'id': video_id, + 'title': self._og_search_title(webpage), + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/roxwel.py youtube-dl-2019.05.20/youtube_dl/extractor/roxwel.py --- youtube-dl-2016.02.22/youtube_dl/extractor/roxwel.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/roxwel.py 2019-06-07 18:49:07.000000000 +0000 @@ -7,7 +7,7 @@ class RoxwelIE(InfoExtractor): - _VALID_URL = r'https?://www\.roxwel\.com/player/(?P<filename>.+?)(\.|\?|$)' + _VALID_URL = r'https?://(?:www\.)?roxwel\.com/player/(?P<filename>.+?)(\.|\?|$)' _TEST = { 'url': 'http://www.roxwel.com/player/passionpittakeawalklive.html', diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/rozhlas.py youtube-dl-2019.05.20/youtube_dl/extractor/rozhlas.py --- youtube-dl-2016.02.22/youtube_dl/extractor/rozhlas.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/rozhlas.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,50 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + remove_start, +) + + +class RozhlasIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?prehravac\.rozhlas\.cz/audio/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://prehravac.rozhlas.cz/audio/3421320', + 'md5': '504c902dbc9e9a1fd50326eccf02a7e2', + 'info_dict': { + 'id': '3421320', + 'ext': 'mp3', + 'title': 'Echo Pavla Klusáka (30.06.2015 21:00)', + 'description': 'Osmdesátiny Terryho Rileyho jsou skvělou příležitostí proletět se elektronickými i akustickými díly zakladatatele minimalismu, který je aktivní už přes padesát let' + } + }, { + 'url': 'http://prehravac.rozhlas.cz/audio/3421320/embed', + 'only_matching': True, + }] + + def _real_extract(self, url): + audio_id = self._match_id(url) + + webpage = self._download_webpage( + 'http://prehravac.rozhlas.cz/audio/%s' % audio_id, audio_id) + + title = self._html_search_regex( + r'<h3>(.+?)</h3>\s*<p[^>]*>.*?</p>\s*<div[^>]+id=["\']player-track', + webpage, 'title', default=None) or remove_start( + self._og_search_title(webpage), 'Radio Wave - ') + description = self._html_search_regex( + r'<p[^>]+title=(["\'])(?P<url>(?:(?!\1).)+)\1[^>]*>.*?</p>\s*<div[^>]+id=["\']player-track', + webpage, 'description', fatal=False, group='url') + duration = int_or_none(self._search_regex( + r'data-duration=["\'](\d+)', webpage, 'duration', default=None)) + + return { + 'id': audio_id, + 'url': 'http://media.rozhlas.cz/_audio/%s.mp3' % audio_id, + 'title': title, + 'description': description, + 'duration': duration, + 'vcodec': 'none', + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/rtbf.py youtube-dl-2019.05.20/youtube_dl/extractor/rtbf.py --- youtube-dl-2016.02.22/youtube_dl/extractor/rtbf.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/rtbf.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,23 +1,36 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( + ExtractorError, + float_or_none, int_or_none, - unescapeHTML, + strip_or_none, ) class RTBFIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?rtbf\.be/(?:video/[^?]+\?.*\bid=|ouftivi/(?:[^/]+/)*[^?]+\?.*\bvideoId=)(?P<id>\d+)' + _VALID_URL = r'''(?x) + https?://(?:www\.)?rtbf\.be/ + (?: + video/[^?]+\?.*\bid=| + ouftivi/(?:[^/]+/)*[^?]+\?.*\bvideoId=| + auvio/[^/]+\?.*\b(?P<live>l)?id= + )(?P<id>\d+)''' _TESTS = [{ 'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274', - 'md5': '799f334ddf2c0a582ba80c44655be570', + 'md5': '8c876a1cceeb6cf31b476461ade72384', 'info_dict': { 'id': '1921274', 'ext': 'mp4', 'title': 'Les Diables au coeur (épisode 2)', - 'duration': 3099, + 'description': '(du 25/04/2014)', + 'duration': 3099.54, + 'upload_date': '20140425', + 'timestamp': 1398456300, } }, { # geo restricted @@ -26,45 +39,123 @@ }, { 'url': 'http://www.rtbf.be/ouftivi/niouzz?videoId=2055858', 'only_matching': True, + }, { + 'url': 'http://www.rtbf.be/auvio/detail_jeudi-en-prime-siegfried-bracke?id=2102996', + 'only_matching': True, + }, { + # Live + 'url': 'https://www.rtbf.be/auvio/direct_pure-fm?lid=134775', + 'only_matching': True, + }, { + # Audio + 'url': 'https://www.rtbf.be/auvio/detail_cinq-heures-cinema?id=2360811', + 'only_matching': True, + }, { + # With Subtitle + 'url': 'https://www.rtbf.be/auvio/detail_les-carnets-du-bourlingueur?id=2361588', + 'only_matching': True, }] - + _IMAGE_HOST = 'http://ds1.ds.static.rtbf.be' + _PROVIDERS = { + 'YOUTUBE': 'Youtube', + 'DAILYMOTION': 'Dailymotion', + 'VIMEO': 'Vimeo', + } _QUALITIES = [ - ('mobile', 'mobile'), - ('web', 'SD'), - ('url', 'MD'), + ('mobile', 'SD'), + ('web', 'MD'), ('high', 'HD'), ] def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage( - 'http://www.rtbf.be/video/embed?id=%s' % video_id, video_id) - - data = self._parse_json( - unescapeHTML(self._search_regex( - r'data-media="([^"]+)"', webpage, 'data video')), - video_id) - - if data.get('provider').lower() == 'youtube': - video_url = data.get('downloadUrl') or data.get('url') - return self.url_result(video_url, 'Youtube') + live, media_id = re.match(self._VALID_URL, url).groups() + embed_page = self._download_webpage( + 'https://www.rtbf.be/auvio/embed/' + ('direct' if live else 'media'), + media_id, query={'id': media_id}) + data = self._parse_json(self._html_search_regex( + r'data-media="([^"]+)"', embed_page, 'media data'), media_id) + + error = data.get('error') + if error: + raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) + + provider = data.get('provider') + if provider in self._PROVIDERS: + return self.url_result(data['url'], self._PROVIDERS[provider]) + + title = 
data['title'] + is_live = data.get('isLive') + if is_live: + title = self._live_title(title) + height_re = r'-(\d+)p\.' formats = [] - for key, format_id in self._QUALITIES: - format_url = data['sources'].get(key) - if format_url: + + m3u8_url = data.get('urlHlsAes128') or data.get('urlHls') + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False)) + + fix_url = lambda x: x.replace('//rtbf-vod.', '//rtbf.') if '/geo/drm/' in x else x + http_url = data.get('url') + if formats and http_url and re.search(height_re, http_url): + http_url = fix_url(http_url) + for m3u8_f in formats[:]: + height = m3u8_f.get('height') + if not height: + continue + f = m3u8_f.copy() + del f['protocol'] + f.update({ + 'format_id': m3u8_f['format_id'].replace('hls-', 'http-'), + 'url': re.sub(height_re, '-%dp.' % height, http_url), + }) + formats.append(f) + else: + sources = data.get('sources') or {} + for key, format_id in self._QUALITIES: + format_url = sources.get(key) + if not format_url: + continue + height = int_or_none(self._search_regex( + height_re, format_url, 'height', default=None)) formats.append({ 'format_id': format_id, - 'url': format_url, + 'url': fix_url(format_url), + 'height': height, }) + mpd_url = data.get('urlDash') + if not data.get('drm') and mpd_url: + formats.extend(self._extract_mpd_formats( + mpd_url, media_id, mpd_id='dash', fatal=False)) + + audio_url = data.get('urlAudio') + if audio_url: + formats.append({ + 'format_id': 'audio', + 'url': audio_url, + 'vcodec': 'none', + }) + self._sort_formats(formats) + + subtitles = {} + for track in (data.get('tracks') or {}).values(): + sub_url = track.get('url') + if not sub_url: + continue + subtitles.setdefault(track.get('lang') or 'fr', []).append({ + 'url': sub_url, + }) + return { - 'id': video_id, + 'id': media_id, 'formats': formats, - 'title': data['title'], - 'description': data.get('description') or data.get('subtitle'), + 'title': title, + 'description': strip_or_none(data.get('description')), 'thumbnail': data.get('thumbnail'), - 'duration': data.get('duration') or data.get('realDuration'), - 'timestamp': int_or_none(data.get('created')), - 'view_count': int_or_none(data.get('viewCount')), + 'duration': float_or_none(data.get('realDuration')), + 'timestamp': int_or_none(data.get('liveFrom')), + 'series': data.get('programLabel'), + 'subtitles': subtitles, + 'is_live': is_live, } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/rte.py youtube-dl-2019.05.20/youtube_dl/extractor/rte.py --- youtube-dl-2016.02.22/youtube_dl/extractor/rte.py 2016-02-14 22:26:17.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/rte.py 2019-06-07 18:49:07.000000000 +0000 @@ -4,63 +4,126 @@ import re from .common import InfoExtractor +from ..compat import compat_HTTPError from ..utils import ( float_or_none, parse_iso8601, + str_or_none, + try_get, unescapeHTML, + url_or_none, + ExtractorError, ) -class RteIE(InfoExtractor): +class RteBaseIE(InfoExtractor): + def _real_extract(self, url): + item_id = self._match_id(url) + + info_dict = {} + formats = [] + + ENDPOINTS = ( + 'https://feeds.rasset.ie/rteavgen/player/playlist?type=iptv&format=json&showId=', + 'http://www.rte.ie/rteavgen/getplaylist/?type=web&format=json&id=', + ) + + for num, ep_url in enumerate(ENDPOINTS, start=1): + try: + data = self._download_json(ep_url + item_id, item_id) + except ExtractorError as ee: + if num < len(ENDPOINTS) or formats: + continue + if isinstance(ee.cause, compat_HTTPError) and ee.cause.code 
== 404: + error_info = self._parse_json(ee.cause.read().decode(), item_id, fatal=False) + if error_info: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error_info['message']), + expected=True) + raise + + # NB the string values in the JSON are stored using XML escaping(!) + show = try_get(data, lambda x: x['shows'][0], dict) + if not show: + continue + + if not info_dict: + title = unescapeHTML(show['title']) + description = unescapeHTML(show.get('description')) + thumbnail = show.get('thumbnail') + duration = float_or_none(show.get('duration'), 1000) + timestamp = parse_iso8601(show.get('published')) + info_dict = { + 'id': item_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + } + + mg = try_get(show, lambda x: x['media:group'][0], dict) + if not mg: + continue + + if mg.get('url'): + m = re.match(r'(?P<url>rtmpe?://[^/]+)/(?P<app>.+)/(?P<playpath>mp4:.*)', mg['url']) + if m: + m = m.groupdict() + formats.append({ + 'url': m['url'] + '/' + m['app'], + 'app': m['app'], + 'play_path': m['playpath'], + 'player_url': url, + 'ext': 'flv', + 'format_id': 'rtmp', + }) + + if mg.get('hls_server') and mg.get('hls_url'): + formats.extend(self._extract_m3u8_formats( + mg['hls_server'] + mg['hls_url'], item_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + + if mg.get('hds_server') and mg.get('hds_url'): + formats.extend(self._extract_f4m_formats( + mg['hds_server'] + mg['hds_url'], item_id, + f4m_id='hds', fatal=False)) + + mg_rte_server = str_or_none(mg.get('rte:server')) + mg_url = str_or_none(mg.get('url')) + if mg_rte_server and mg_url: + hds_url = url_or_none(mg_rte_server + mg_url) + if hds_url: + formats.extend(self._extract_f4m_formats( + hds_url, item_id, f4m_id='hds', fatal=False)) + + self._sort_formats(formats) + + info_dict['formats'] = formats + return info_dict + + +class RteIE(RteBaseIE): IE_NAME = 'rte' IE_DESC = 'Raidió Teilifís Éireann TV' _VALID_URL = r'https?://(?:www\.)?rte\.ie/player/[^/]{2,3}/show/[^/]+/(?P<id>[0-9]+)' _TEST = { 'url': 'http://www.rte.ie/player/ie/show/iwitness-862/10478715/', + 'md5': '4a76eb3396d98f697e6e8110563d2604', 'info_dict': { 'id': '10478715', - 'ext': 'flv', - 'title': 'Watch iWitness online', - 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 'iWitness : The spirit of Ireland, one voice and one minute at a time.', + 'ext': 'mp4', + 'title': 'iWitness', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'The spirit of Ireland, one voice and one minute at a time.', 'duration': 60.046, + 'upload_date': '20151012', + 'timestamp': 1444694160, }, - 'params': { - 'skip_download': 'f4m fails with --test atm' - } } - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - title = self._og_search_title(webpage) - description = self._html_search_meta('description', webpage, 'description') - duration = float_or_none(self._html_search_meta( - 'duration', webpage, 'duration', fatal=False), 1000) - - thumbnail_id = self._search_regex( - r'<meta name="thumbnail" content="uri:irus:(.*?)" />', webpage, 'thumbnail') - thumbnail = 'http://img.rasset.ie/' + thumbnail_id + '.jpg' - - feeds_url = self._html_search_meta('feeds-prefix', webpage, 'feeds url') + video_id - json_string = self._download_json(feeds_url, video_id) - - # f4m_url = server + relative_url - f4m_url = json_string['shows'][0]['media:group'][0]['rte:server'] + json_string['shows'][0]['media:group'][0]['url'] - f4m_formats = self._extract_f4m_formats(f4m_url, video_id) - - return { - 'id': video_id, - 'title': title, - 'formats': f4m_formats, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - } -
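The comment retained above, that the string values in the RTÉ JSON are stored using XML escaping, is the reason RteBaseIE routes every display string through unescapeHTML after JSON decoding. A tiny self-contained illustration of that double-decoding step; the sample payload is invented, and the standard library unescape stands in for youtube-dl's own unescapeHTML helper:

import json
try:
    from html import unescape           # Python 3.4+
except ImportError:                     # Python 2 fallback
    from HTMLParser import HTMLParser
    unescape = HTMLParser().unescape

raw = '{"title": "Nationwide &amp; Friends"}'   # invented sample feed value
show = json.loads(raw)                  # JSON layer first...
print(unescape(show['title']))          # ...then the XML escaping: Nationwide & Friends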
-class RteRadioIE(InfoExtractor): +class RteRadioIE(RteBaseIE): IE_NAME = 'rte:radio' IE_DESC = 'Raidió Teilifís Éireann radio' # Radioplayer URLs have two distinct specifier formats, @@ -73,19 +136,17 @@ _TESTS = [{ # Old-style player URL; HLS and RTMPE formats 'url': 'http://www.rte.ie/radio/utils/radioplayer/rteradioweb.html#!rii=16:10507902:2414:27-12-2015:', + 'md5': 'c79ccb2c195998440065456b69760411', 'info_dict': { 'id': '10507902', 'ext': 'mp4', 'title': 'Gloria', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'description': 'md5:9ce124a7fb41559ec68f06387cabddf0', 'timestamp': 1451203200, 'upload_date': '20151227', 'duration': 7230.0, }, - 'params': { - 'skip_download': 'f4m fails with --test atm' - } }, { # New-style player URL; RTMPE formats only 'url': 'http://rte.ie/radio/utils/radioplayer/rteradioweb.html#!rii=b16_3250678_8861_06-04-2012_', 'info_dict': { 'id': '3250678', 'ext': 'flv', 'title': 'The Lyric Concert with Paul Herriott', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'description': '', 'timestamp': 1333742400, 'upload_date': '20120406', 'duration': 7199.016, }, 'params': { - 'skip_download': 'f4m fails with --test atm' - } + # rtmp download + 'skip_download': True, + }, }] - - def _real_extract(self, url): - item_id = self._match_id(url) - - json_string = self._download_json( - 'http://www.rte.ie/rteavgen/getplaylist/?type=web&format=json&id=' + item_id, - item_id) - - # NB the string values in the JSON are stored using XML escaping(!) - show = json_string['shows'][0] - title = unescapeHTML(show['title']) - description = unescapeHTML(show.get('description')) - thumbnail = show.get('thumbnail') - duration = float_or_none(show.get('duration'), 1000) - timestamp = parse_iso8601(show.get('published')) - - mg = show['media:group'][0] - - formats = [] - - if mg.get('url'): - m = re.match(r'(?P<url>rtmpe?://[^/]+)/(?P<app>.+)/(?P<playpath>mp4:.*)', mg['url']) - if m: - m = m.groupdict() - formats.append({ - 'url': m['url'] + '/' + m['app'], - 'app': m['app'], - 'play_path': m['playpath'], - 'player_url': url, - 'ext': 'flv', - 'format_id': 'rtmp', - }) - - if mg.get('hls_server') and mg.get('hls_url'): - formats.extend(self._extract_m3u8_formats( - mg['hls_server'] + mg['hls_url'], item_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) - - if mg.get('hds_server') and mg.get('hds_url'): - formats.extend(self._extract_f4m_formats( - mg['hds_server'] + mg['hds_url'], item_id, - f4m_id='hds', fatal=False)) - - self._sort_formats(formats) - - return { - 'id': item_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'duration': duration, - 'formats': formats, - }
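RteBaseIE above also introduces endpoint failover: the feeds.rasset.ie playlist endpoint is tried before the legacy getplaylist one, errors are swallowed while another endpoint or already-collected formats remain, and only the last failure propagates. The real method keeps looping so formats from every answering endpoint accumulate; this sketch, with an injected download_json callable and generic exceptions, stops at the first usable payload:

def fetch_show(endpoints, item_id, download_json):
    # endpoints: iterable of URL prefixes, most preferred first
    for num, ep_url in enumerate(endpoints, start=1):
        try:
            data = download_json(ep_url + item_id)
        except Exception:
            if num < len(endpoints):
                continue    # a later endpoint may still answer
            raise           # last endpoint failed too: surface the error
        if data:
            return data
    return None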
diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/rtl2.py youtube-dl-2019.05.20/youtube_dl/extractor/rtl2.py --- youtube-dl-2016.02.22/youtube_dl/extractor/rtl2.py 2016-02-14 22:26:17.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/rtl2.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,85 +1,207 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re + from .common import InfoExtractor +from ..aes import aes_cbc_decrypt +from ..compat import ( + compat_b64decode, + compat_ord, + compat_str, +) +from ..utils import ( + bytes_to_intlist, + ExtractorError, + intlist_to_bytes, + int_or_none, + strip_or_none, +) class RTL2IE(InfoExtractor): - _VALID_URL = r'http?://(?:www\.)?rtl2\.de/[^?#]*?/(?P<id>[^?#/]*?)(?:$|/(?:$|[?#]))' + IE_NAME = 'rtl2' + _VALID_URL = r'https?://(?:www\.)?rtl2\.de/sendung/[^/]+/(?:video/(?P<vico_id>\d+)[^/]+/(?P<vivi_id>\d+)-|folge/)(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'http://www.rtl2.de/sendung/grip-das-motormagazin/folge/folge-203-0', 'info_dict': { 'id': 'folge-203-0', 'ext': 'f4v', 'title': 'GRIP sucht den Sommerkönig', - 'description': 'Matthias, Det und Helge treten gegeneinander an.' + 'description': 'md5:e3adbb940fd3c6e76fa341b8748b562f' }, 'params': { # rtmp download 'skip_download': True, }, + 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], }, { 'url': 'http://www.rtl2.de/sendung/koeln-50667/video/5512-anna/21040-anna-erwischt-alex/', 'info_dict': { - 'id': '21040-anna-erwischt-alex', + 'id': 'anna-erwischt-alex', 'ext': 'mp4', 'title': 'Anna erwischt Alex!', - 'description': 'Anna ist Alex\' Tochter bei Köln 50667.' + 'description': 'Anna nimmt ihrem Vater nicht ab, dass er nicht spielt. Und tatsächlich erwischt sie ihn auf frischer Tat.' }, 'params': { # rtmp download 'skip_download': True, }, + 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], }] def _real_extract(self, url): - # Some rtl2 urls have no slash at the end, so append it. - if not url.endswith('/'): - url += '/' + vico_id, vivi_id, display_id = re.match(self._VALID_URL, url).groups() + if not vico_id: + webpage = self._download_webpage(url, display_id) + + mobj = re.search( + r'data-collection="(?P<vico_id>\d+)"[^>]+data-video="(?P<vivi_id>\d+)"', + webpage) + if mobj: + vico_id = mobj.group('vico_id') + vivi_id = mobj.group('vivi_id') + else: + vico_id = self._html_search_regex( + r'vico_id\s*:\s*([0-9]+)', webpage, 'vico_id') + vivi_id = self._html_search_regex( + r'vivi_id\s*:\s*([0-9]+)', webpage, 'vivi_id') + + info = self._download_json( + 'https://service.rtl2.de/api-player-vipo/video.php', + display_id, query={ + 'vico_id': vico_id, + 'vivi_id': vivi_id, + }) + video_info = info['video'] + title = video_info['titel'] - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + formats = [] - mobj = re.search( - r'<div[^>]+data-collection="(?P<vico_id>\d+)"[^>]+data-video="(?P<vivi_id>\d+)"', - webpage) - if mobj: - vico_id = mobj.group('vico_id') - vivi_id = mobj.group('vivi_id') - else: - vico_id = self._html_search_regex( - r'vico_id\s*:\s*([0-9]+)', webpage, 'vico_id') - vivi_id = self._html_search_regex( - r'vivi_id\s*:\s*([0-9]+)', webpage, 'vivi_id') - info_url = 'http://www.rtl2.de/video/php/get_video.php?vico_id=' + vico_id + '&vivi_id=' + vivi_id + rtmp_url = video_info.get('streamurl') + if rtmp_url: + rtmp_url = rtmp_url.replace('\\', '') + stream_url = 'mp4:' + self._html_search_regex(r'/ondemand/(.+)', rtmp_url, 'stream URL') + rtmp_conn = ['S:connect', 'O:1', 'NS:pageUrl:' + url, 'NB:fpad:0', 'NN:videoFunction:1', 'O:0'] + + formats.append({ + 'format_id': 'rtmp', + 'url': rtmp_url, + 'play_path': stream_url, + 'player_url': 'https://www.rtl2.de/sites/default/modules/rtl2/jwplayer/jwplayer-7.6.0/jwplayer.flash.swf', + 'page_url': url, + 'flash_version': 'LNX 11,2,202,429', + 'rtmp_conn': rtmp_conn, + 'no_resume': True, + 'preference': 1, + }) + + m3u8_url = video_info.get('streamurl_hls') + if m3u8_url: + formats.extend(self._extract_akamai_formats(m3u8_url, display_id)) - info = self._download_json(info_url, video_id) - video_info = info['video'] - title = video_info['titel'] - description = video_info.get('beschreibung') - 
thumbnail = video_info.get('image') + self._sort_formats(formats) + + return { + 'id': display_id, + 'title': title, + 'thumbnail': video_info.get('image'), + 'description': video_info.get('beschreibung'), + 'duration': int_or_none(video_info.get('duration')), + 'formats': formats, + } + + +class RTL2YouBaseIE(InfoExtractor): + _BACKWERK_BASE_URL = 'https://p-you-backwerk.rtl2apps.de/' + + +class RTL2YouIE(RTL2YouBaseIE): + IE_NAME = 'rtl2:you' + _VALID_URL = r'http?://you\.rtl2\.de/(?:video/\d+/|youplayer/index\.html\?.*?\bvid=)(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://you.rtl2.de/video/3002/15740/MJUNIK%20%E2%80%93%20Home%20of%20YOU/307-hirn-wo-bist-du', + 'info_dict': { + 'id': '15740', + 'ext': 'mp4', + 'title': 'MJUNIK – Home of YOU - #307 Hirn, wo bist du?!', + 'description': 'md5:ddaa95c61b372b12b66e115b2772fe01', + 'age_limit': 12, + }, + }, { + 'url': 'http://you.rtl2.de/youplayer/index.html?vid=15712', + 'only_matching': True, + }] + _AES_KEY = b'\xe9W\xe4.<*\xb8\x1a\xd2\xb6\x92\xf3C\xd3\xefL\x1b\x03*\xbbbH\xc0\x03\xffo\xc2\xf2(\xaa\xaa!' + _GEO_COUNTRIES = ['DE'] + + def _real_extract(self, url): + video_id = self._match_id(url) + + stream_data = self._download_json( + self._BACKWERK_BASE_URL + 'stream/video/' + video_id, video_id) - download_url = video_info['streamurl'] - download_url = download_url.replace('\\', '') - stream_url = 'mp4:' + self._html_search_regex(r'ondemand/(.*)', download_url, 'stream URL') - rtmp_conn = ['S:connect', 'O:1', 'NS:pageUrl:' + url, 'NB:fpad:0', 'NN:videoFunction:1', 'O:0'] - - formats = [{ - 'url': download_url, - 'play_path': stream_url, - 'player_url': 'http://www.rtl2.de/flashplayer/vipo_player.swf', - 'page_url': url, - 'flash_version': 'LNX 11,2,202,429', - 'rtmp_conn': rtmp_conn, - 'no_resume': True, - }] + data, iv = compat_b64decode(stream_data['streamUrl']).decode().split(':') + stream_url = intlist_to_bytes(aes_cbc_decrypt( + bytes_to_intlist(compat_b64decode(data)), + bytes_to_intlist(self._AES_KEY), + bytes_to_intlist(compat_b64decode(iv)) + )) + if b'rtl2_you_video_not_found' in stream_url: + raise ExtractorError('video not found', expected=True) + + formats = self._extract_m3u8_formats( + stream_url[:-compat_ord(stream_url[-1])].decode(), + video_id, 'mp4', 'm3u8_native') self._sort_formats(formats) + video_data = self._download_json( + self._BACKWERK_BASE_URL + 'video/' + video_id, video_id) + + series = video_data.get('formatTitle') + title = episode = video_data.get('title') or series + if series and series != title: + title = '%s - %s' % (series, title) + return { 'id': video_id, 'title': title, - 'thumbnail': thumbnail, - 'description': description, 'formats': formats, + 'description': strip_or_none(video_data.get('description')), + 'thumbnail': video_data.get('image'), + 'duration': int_or_none(stream_data.get('duration') or video_data.get('duration'), 1000), + 'series': series, + 'episode': episode, + 'age_limit': int_or_none(video_data.get('minimumAge')), } + + +class RTL2YouSeriesIE(RTL2YouBaseIE): + IE_NAME = 'rtl2:you:series' + _VALID_URL = r'http?://you\.rtl2\.de/videos/(?P<id>\d+)' + _TEST = { + 'url': 'http://you.rtl2.de/videos/115/dragon-ball', + 'info_dict': { + 'id': '115', + }, + 'playlist_mincount': 5, + } + + def _real_extract(self, url): + series_id = self._match_id(url) + stream_data = self._download_json( + self._BACKWERK_BASE_URL + 'videos', + series_id, query={ + 'formatId': series_id, + 'limit': 1000000000, + }) + + entries = [] + for video in stream_data.get('videos', []): + video_id = compat_str(video['videoId']) + if not video_id: + continue + entries.append(self.url_result( + 'http://you.rtl2.de/video/%s/%s' % (series_id, video_id), + 'RTL2You', video_id)) + return self.playlist_result(entries, series_id)
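Worth spelling out, because it is easy to misread in diff form: RTL2YouIE's streamUrl field is base64 of "DATA:IV", where DATA and IV are themselves base64, the key is the fixed _AES_KEY above, and the final byte of the AES-CBC plaintext encodes the PKCS#7 pad length. A sketch of just that decryption, reusing youtube-dl's own helpers, so it assumes youtube-dl itself is importable:

from youtube_dl.aes import aes_cbc_decrypt
from youtube_dl.compat import compat_b64decode, compat_ord
from youtube_dl.utils import bytes_to_intlist, intlist_to_bytes

def decrypt_stream_url(stream_url_field, aes_key):
    # outer base64 layer yields "<base64 data>:<base64 iv>"
    data, iv = compat_b64decode(stream_url_field).decode().split(':')
    plain = intlist_to_bytes(aes_cbc_decrypt(
        bytes_to_intlist(compat_b64decode(data)),
        bytes_to_intlist(aes_key),
        bytes_to_intlist(compat_b64decode(iv))))
    # strip PKCS#7 padding: the last byte is the pad length
    return plain[:-compat_ord(plain[-1])].decode()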
diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/rtlnl.py youtube-dl-2019.05.20/youtube_dl/extractor/rtlnl.py --- youtube-dl-2016.02.22/youtube_dl/extractor/rtlnl.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/rtlnl.py 2019-06-07 18:49:07.000000000 +0000 @@ -12,26 +12,27 @@ IE_NAME = 'rtl.nl' IE_DESC = 'rtl.nl and rtlxl.nl' _VALID_URL = r'''(?x) - https?://(?:www\.)? + https?://(?:(?:www|static)\.)? (?: - rtlxl\.nl/\#!/[^/]+/| - rtl\.nl/system/videoplayer/(?:[^/]+/)+(?:video_)?embed\.html\b.+?\buuid= + rtlxl\.nl/[^\#]*\#!/[^/]+/| + rtl\.nl/(?:(?:system/videoplayer/(?:[^/]+/)+(?:video_)?embed\.html|embed)\b.+?\buuid=|video/) ) (?P<uuid>[0-9a-f-]+)''' _TESTS = [{ - 'url': 'http://www.rtlxl.nl/#!/rtl-nieuws-132237/6e4203a6-0a5e-3596-8424-c599a59e0677', - 'md5': 'cc16baa36a6c169391f0764fa6b16654', + 'url': 'http://www.rtlxl.nl/#!/rtl-nieuws-132237/82b1aad1-4a14-3d7b-b554-b0aed1b2c416', + 'md5': '473d1946c1fdd050b2c0161a4b13c373', 'info_dict': { - 'id': '6e4203a6-0a5e-3596-8424-c599a59e0677', + 'id': '82b1aad1-4a14-3d7b-b554-b0aed1b2c416', 'ext': 'mp4', - 'title': 'RTL Nieuws - Laat', - 'description': 'md5:6b61f66510c8889923b11f2778c72dc5', - 'timestamp': 1408051800, - 'upload_date': '20140814', - 'duration': 576.880, + 'title': 'RTL Nieuws', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'timestamp': 1461951000, + 'upload_date': '20160429', + 'duration': 1167.96, }, }, { + # best format avaialble a3t 'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed/autoplay=false', 'md5': 'dea7474214af1271d91ef332fb8be7ea', 'info_dict': { @@ -39,18 +40,19 @@ 'ext': 'mp4', 'timestamp': 1424039400, 'title': 'RTL Nieuws - Nieuwe beelden Kopenhagen: chaos direct na aanslag', - 'thumbnail': 're:^https?://screenshots\.rtl\.nl/system/thumb/sz=[0-9]+x[0-9]+/uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed$', + 'thumbnail': r're:^https?://screenshots\.rtl\.nl/(?:[^/]+/)*sz=[0-9]+x[0-9]+/uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed$', 'upload_date': '20150215', 'description': 'Er zijn nieuwe beelden vrijgegeven die vlak na de aanslag in Kopenhagen zijn gemaakt. 
Op de video is goed te zien hoe omstanders zich bekommeren om één van de slachtoffers, terwijl de eerste agenten ter plaatse komen.', } }, { - # empty synopsis and missing episodes (see https://github.com/rg3/youtube-dl/issues/6275) + # empty synopsis and missing episodes (see https://github.com/ytdl-org/youtube-dl/issues/6275) + # best format available nettv 'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a/autoplay=false', 'info_dict': { 'id': 'f536aac0-1dc3-4314-920e-3bd1c5b3811a', 'ext': 'mp4', 'title': 'RTL Nieuws - Meer beelden van overval juwelier', - 'thumbnail': 're:^https?://screenshots\.rtl\.nl/system/thumb/sz=[0-9]+x[0-9]+/uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a$', + 'thumbnail': r're:^https?://screenshots\.rtl\.nl/(?:[^/]+/)*sz=[0-9]+x[0-9]+/uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a$', 'timestamp': 1437233400, 'upload_date': '20150718', 'duration': 30.474, @@ -65,6 +67,15 @@ }, { 'url': 'http://www.rtl.nl/system/videoplayer/derden/embed.html#!/uuid=bb0353b0-d6a4-1dad-90e9-18fe75b8d1f0', 'only_matching': True, + }, { + 'url': 'http://rtlxl.nl/?_ga=1.204735956.572365465.1466978370#!/rtl-nieuws-132237/3c487912-023b-49ac-903e-2c5d79f8410f', + 'only_matching': True, + }, { + 'url': 'https://www.rtl.nl/video/c603c9c2-601d-4b5e-8175-64f1e942dc7d/', + 'only_matching': True, + }, { + 'url': 'https://static.rtl.nl/embed/?uuid=1a2970fc-5c0b-43ff-9fdc-927e39e6d1bc&autoplay=false&publicatiepunt=rtlnieuwsnl', + 'only_matching': True, }] def _real_extract(self, url): @@ -82,34 +93,11 @@ meta = info.get('meta', {}) - # m3u8 streams are encrypted and may not be handled properly by older ffmpeg/avconv. - # To workaround this previously adaptive -> flash trick was used to obtain - # unencrypted m3u8 streams (see https://github.com/rg3/youtube-dl/issues/4118) - # and bypass georestrictions as well. - # Currently, unencrypted m3u8 playlists are (intentionally?) invalid and therefore - # unusable albeit can be fixed by simple string replacement (see - # https://github.com/rg3/youtube-dl/pull/6337) - # Since recent ffmpeg and avconv handle encrypted streams just fine encrypted - # streams are used now. 
videopath = material['videopath'] m3u8_url = meta.get('videohost', 'http://manifest.us.rtl.nl') + videopath - formats = self._extract_m3u8_formats(m3u8_url, uuid, ext='mp4') - - video_urlpart = videopath.split('/adaptive/')[1][:-5] - PG_URL_TEMPLATE = 'http://pg.us.rtl.nl/rtlxl/network/%s/progressive/%s.mp4' - - formats.extend([ - { - 'url': PG_URL_TEMPLATE % ('a2m', video_urlpart), - 'format_id': 'pg-sd', - }, - { - 'url': PG_URL_TEMPLATE % ('a3m', video_urlpart), - 'format_id': 'pg-hd', - 'quality': 0, - } - ]) + formats = self._extract_m3u8_formats( + m3u8_url, uuid, 'mp4', m3u8_id='hls', fatal=False) self._sort_formats(formats) thumbnails = [] diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/rtp.py youtube-dl-2019.05.20/youtube_dl/extractor/rtp.py --- youtube-dl-2016.02.22/youtube_dl/extractor/rtp.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/rtp.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,9 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..utils import ( + determine_ext, + js_to_json, +) class RTPIE(InfoExtractor): @@ -16,11 +18,7 @@ 'ext': 'mp3', 'title': 'Paixões Cruzadas', 'description': 'As paixões musicais de António Cartaxo e António Macedo', - 'thumbnail': 're:^https?://.*\.jpg', - }, - 'params': { - # rtmp download - 'skip_download': True, + 'thumbnail': r're:^https?://.*\.jpg', }, }, { 'url': 'http://www.rtp.pt/play/p831/a-quimica-das-coisas', @@ -33,57 +31,36 @@ webpage = self._download_webpage(url, video_id) title = self._html_search_meta( 'twitter:title', webpage, display_name='title', fatal=True) - description = self._html_search_meta('description', webpage) - thumbnail = self._og_search_thumbnail(webpage) - - player_config = self._search_regex( - r'(?s)RTPPLAY\.player\.newPlayer\(\s*(\{.*?\})\s*\)', webpage, 'player config') - config = self._parse_json(player_config, video_id) - - path, ext = config.get('file').rsplit('.', 1) - formats = [{ - 'format_id': 'rtmp', - 'ext': ext, - 'vcodec': config.get('type') == 'audio' and 'none' or None, - 'preference': -2, - 'url': 'rtmp://{streamer:s}/{application:s}'.format(**config), - 'app': config.get('application'), - 'play_path': '{ext:s}:{path:s}'.format(ext=ext, path=path), - 'page_url': url, - 'rtmp_live': config.get('live', False), - 'player_url': 'http://programas.rtp.pt/play/player.swf?v3', - 'rtmp_real_time': True, - }] - - # Construct regular HTTP download URLs - replacements = { - 'audio': { - 'format_id': 'mp3', - 'pattern': r'^nas2\.share/wavrss/', - 'repl': 'http://rsspod.rtp.pt/podcasts/', - 'vcodec': 'none', - }, - 'video': { - 'format_id': 'mp4_h264', - 'pattern': r'^nas2\.share/h264/', - 'repl': 'http://rsspod.rtp.pt/videocasts/', - 'vcodec': 'h264', - }, - } - r = replacements[config['type']] - if re.match(r['pattern'], config['file']) is not None: - formats.append({ - 'format_id': r['format_id'], - 'url': re.sub(r['pattern'], r['repl'], config['file']), - 'vcodec': r['vcodec'], - }) - self._sort_formats(formats) + config = self._parse_json(self._search_regex( + r'(?s)RTPPlayer\(({.+?})\);', webpage, + 'player config'), video_id, js_to_json) + file_url = config['file'] + ext = determine_ext(file_url) + if ext == 'm3u8': + file_key = config.get('fileKey') + formats = self._extract_m3u8_formats( + file_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=file_key) + if file_key: + formats.append({ + 'url': 'https://cdn-ondemand.rtp.pt' + file_key, + 'preference': 1, + }) + 
self._sort_formats(formats) + else: + formats = [{ + 'url': file_url, + 'ext': ext, + }] + if config.get('mediaType') == 'audio': + for f in formats: + f['vcodec'] = 'none' return { 'id': video_id, 'title': title, 'formats': formats, - 'description': description, - 'thumbnail': thumbnail, + 'description': self._html_search_meta(['description', 'twitter:description'], webpage), + 'thumbnail': config.get('poster') or self._og_search_thumbnail(webpage), } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/rts.py youtube-dl-2019.05.20/youtube_dl/extractor/rts.py --- youtube-dl-2016.02.22/youtube_dl/extractor/rts.py 2015-12-31 15:50:46.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/rts.py 2019-06-07 18:49:07.000000000 +0000 @@ -4,27 +4,24 @@ import re from .srgssr import SRGSSRIE -from ..compat import ( - compat_str, - compat_urllib_parse_urlparse, -) +from ..compat import compat_str from ..utils import ( int_or_none, parse_duration, parse_iso8601, unescapeHTML, - xpath_text, + determine_ext, ) class RTSIE(SRGSSRIE): IE_DESC = 'RTS.ch' - _VALID_URL = r'rts:(?P<rts_id>\d+)|https?://(?:www\.)?rts\.ch/(?:[^/]+/){2,}(?P<id>[0-9]+)-(?P<display_id>.+?)\.html' + _VALID_URL = r'rts:(?P<rts_id>\d+)|https?://(?:.+?\.)?rts\.ch/(?:[^/]+/){2,}(?P<id>[0-9]+)-(?P<display_id>.+?)\.html' _TESTS = [ { 'url': 'http://www.rts.ch/archives/tv/divers/3449373-les-enfants-terribles.html', - 'md5': 'f254c4b26fb1d3c183793d52bc40d3e7', + 'md5': 'ff7f8450a90cf58dacb64e29707b4a8e', 'info_dict': { 'id': '3449373', 'display_id': 'les-enfants-terribles', @@ -35,38 +32,20 @@ 'uploader': 'Divers', 'upload_date': '19680921', 'timestamp': -40280400, - 'thumbnail': 're:^https?://.*\.image', + 'thumbnail': r're:^https?://.*\.image', 'view_count': int, }, - 'params': { - # m3u8 download - 'skip_download': True, - } }, { 'url': 'http://www.rts.ch/emissions/passe-moi-les-jumelles/5624067-entre-ciel-et-mer.html', - 'md5': 'f1077ac5af686c76528dc8d7c5df29ba', 'info_dict': { - 'id': '5742494', - 'display_id': '5742494', - 'ext': 'mp4', - 'duration': 3720, - 'title': 'Les yeux dans les cieux - Mon homard au Canada', - 'description': 'md5:d22ee46f5cc5bac0912e5a0c6d44a9f7', - 'uploader': 'Passe-moi les jumelles', - 'upload_date': '20140404', - 'timestamp': 1396635300, - 'thumbnail': 're:^https?://.*\.image', - 'view_count': int, + 'id': '5624065', + 'title': 'Passe-moi les jumelles', }, - 'params': { - # m3u8 download - 'skip_download': True, - } + 'playlist_mincount': 4, }, { 'url': 'http://www.rts.ch/video/sport/hockey/5745975-1-2-kloten-fribourg-5-2-second-but-pour-gotteron-par-kwiatowski.html', - 'md5': 'b4326fecd3eb64a458ba73c73e91299d', 'info_dict': { 'id': '5745975', 'display_id': '1-2-kloten-fribourg-5-2-second-but-pour-gotteron-par-kwiatowski', @@ -77,14 +56,18 @@ 'uploader': 'Hockey', 'upload_date': '20140403', 'timestamp': 1396556882, - 'thumbnail': 're:^https?://.*\.image', + 'thumbnail': r're:^https?://.*\.image', 'view_count': int, }, + 'params': { + # m3u8 download + 'skip_download': True, + }, 'skip': 'Blocked outside Switzerland', }, { 'url': 'http://www.rts.ch/video/info/journal-continu/5745356-londres-cachee-par-un-epais-smog.html', - 'md5': '9f713382f15322181bb366cc8c3a4ff0', + 'md5': '1bae984fe7b1f78e94abc74e802ed99f', 'info_dict': { 'id': '5745356', 'display_id': 'londres-cachee-par-un-epais-smog', @@ -92,16 +75,12 @@ 'duration': 33, 'title': 'Londres cachée par un épais smog', 'description': 'Un important voile de smog recouvre Londres depuis mercredi, provoqué par la pollution et du sable du Sahara.', - 'uploader': 'Le Journal en continu', + 
'uploader': 'L\'actu en vidéo', + 'upload_date': '20140403', 'timestamp': 1396537322, - 'thumbnail': 're:^https?://.*\.image', + 'thumbnail': r're:^https?://.*\.image', 'view_count': int, }, - 'params': { - # m3u8 download - 'skip_download': True, - } }, { 'url': 'http://www.rts.ch/audio/couleur3/programmes/la-belle-video-de-stephane-laurenceau/5706148-urban-hippie-de-damien-krisl-03-04-2014.html', @@ -125,6 +104,10 @@ 'title': 'Hockey: Davos décroche son 31e titre de champion de Suisse', }, 'playlist_mincount': 5, + }, + { + 'url': 'http://pages.rts.ch/emissions/passe-moi-les-jumelles/5624065-entre-ciel-et-mer.html', + 'only_matching': True, } ] @@ -142,19 +125,32 @@ # media_id extracted out of URL is not always a real id if 'video' not in all_info and 'audio' not in all_info: - page = self._download_webpage(url, display_id) + entries = [] - # article with videos on rhs - videos = re.findall( - r'<article[^>]+class="content-item"[^>]*>\s*<a[^>]+data-video-urn="urn:([^"]+)"', - page) - if not videos: + for item in all_info.get('items', []): + item_url = item.get('url') + if not item_url: + continue + entries.append(self.url_result(item_url, 'RTS')) + + if not entries: + page, urlh = self._download_webpage_handle(url, display_id) + if re.match(self._VALID_URL, urlh.geturl()).group('id') != media_id: + return self.url_result(urlh.geturl(), 'RTS') + + # article with videos on rhs videos = re.findall( - r'(?s)<iframe[^>]+class="srg-player"[^>]+src="[^"]+urn:([^"]+)"', + r'<article[^>]+class="content-item"[^>]*>\s*<a[^>]+data-video-urn="urn:([^"]+)"', page) - if videos: - entries = [self.url_result('srgssr:%s' % video_urn, 'SRGSSR') for video_urn in videos] - return self.playlist_result(entries, media_id, self._og_search_title(page)) + if not videos: + videos = re.findall( + r'(?s)<iframe[^>]+class="srg-player"[^>]+src="[^"]+urn:([^"]+)"', + page) + if videos: + entries = [self.url_result('srgssr:%s' % video_urn, 'SRGSSR') for video_urn in videos] + + if entries: + return self.playlist_result(entries, media_id, all_info.get('title')) internal_id = self._html_search_regex( r'<(?:video|audio) data-id="([0-9]+)"', page, @@ -168,36 +164,29 @@ info = all_info['video']['JSONinfo'] if 'video' in all_info else all_info['audio'] - upload_timestamp = parse_iso8601(info.get('broadcast_date')) - duration = info.get('duration') or info.get('cutout') or info.get('cutduration') - if isinstance(duration, compat_str): - duration = parse_duration(duration) - view_count = info.get('plays') - thumbnail = unescapeHTML(info.get('preview_image_url')) + title = info['title'] def extract_bitrate(url): return int_or_none(self._search_regex( r'-([0-9]+)k\.', url, 'bitrate', default=None)) formats = [] - for format_id, format_url in info['streams'].items(): - if format_id == 'hds_sd' and 'hds' in info['streams']: + streams = info.get('streams', {}) + for format_id, format_url in streams.items(): + if format_id == 'hds_sd' and 'hds' in streams: continue - if format_id == 'hls_sd' and 'hls' in info['streams']: + if format_id == 'hls_sd' and 'hls' in streams: continue - if format_url.endswith('.f4m'): - token = self._download_xml( - 'http://tp.srgssr.ch/token/akahd.xml?stream=%s/*' % compat_urllib_parse_urlparse(format_url).path, - media_id, 'Downloading %s token' % format_id) - auth_params = xpath_text(token, './/authparams', 'auth params') - if not auth_params: - continue - formats.extend(self._extract_f4m_formats( - '%s?%s&hdcore=3.4.0&plugin=aasp-3.4.0.132.66' % (format_url, auth_params), - media_id, f4m_id=format_id, fatal=False)) - elif format_url.endswith('.m3u8'): - 
formats.extend(self._extract_m3u8_formats( - format_url, media_id, 'mp4', 'm3u8_native', m3u8_id=format_id, fatal=False)) + ext = determine_ext(format_url) + if ext in ('m3u8', 'f4m'): + format_url = self._get_tokenized_src(format_url, media_id, format_id) + if ext == 'f4m': + formats.extend(self._extract_f4m_formats( + format_url + ('?' if '?' not in format_url else '&') + 'hdcore=3.4.0', + media_id, f4m_id=format_id, fatal=False)) + else: + formats.extend(self._extract_m3u8_formats( + format_url, media_id, 'mp4', 'm3u8_native', m3u8_id=format_id, fatal=False)) else: formats.append({ 'format_id': format_id, @@ -205,25 +194,37 @@ 'tbr': extract_bitrate(format_url), }) - if 'media' in info: - formats.extend([{ - 'format_id': '%s-%sk' % (media['ext'], media['rate']), - 'url': 'http://download-video.rts.ch/%s' % media['url'], - 'tbr': media['rate'] or extract_bitrate(media['url']), - } for media in info['media'] if media.get('rate')]) + for media in info.get('media', []): + media_url = media.get('url') + if not media_url or re.match(r'https?://', media_url): + continue + rate = media.get('rate') + ext = media.get('ext') or determine_ext(media_url, 'mp4') + format_id = ext + if rate: + format_id += '-%dk' % rate + formats.append({ + 'format_id': format_id, + 'url': 'http://download-video.rts.ch/' + media_url, + 'tbr': rate or extract_bitrate(media_url), + }) self._check_formats(formats, media_id) self._sort_formats(formats) + duration = info.get('duration') or info.get('cutout') or info.get('cutduration') + if isinstance(duration, compat_str): + duration = parse_duration(duration) + return { 'id': media_id, 'display_id': display_id, 'formats': formats, - 'title': info['title'], + 'title': title, 'description': info.get('intro'), 'duration': duration, - 'view_count': view_count, + 'view_count': int_or_none(info.get('plays')), 'uploader': info.get('programName'), - 'timestamp': upload_timestamp, - 'thumbnail': thumbnail, + 'timestamp': parse_iso8601(info.get('broadcast_date')), + 'thumbnail': unescapeHTML(info.get('preview_image_url')), } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/rtve.py youtube-dl-2019.05.20/youtube_dl/extractor/rtve.py --- youtube-dl-2016.02.22/youtube_dl/extractor/rtve.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/rtve.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import base64 @@ -6,27 +6,35 @@ import time from .common import InfoExtractor +from ..compat import ( + compat_b64decode, + compat_struct_unpack, +) from ..utils import ( + determine_ext, ExtractorError, float_or_none, remove_end, + remove_start, sanitized_Request, std_headers, - struct_unpack, ) def _decrypt_url(png): - encrypted_data = base64.b64decode(png.encode('utf-8')) + encrypted_data = compat_b64decode(png) text_index = encrypted_data.find(b'tEXt') text_chunk = encrypted_data[text_index - 4:] - length = struct_unpack('!I', text_chunk[:4])[0] + length = compat_struct_unpack('!I', text_chunk[:4])[0] # Use bytearray to get integers when iterating in both python 2.x and 3.x data = bytearray(text_chunk[8:8 + length]) data = [chr(b) for b in data if b != 0] hash_index = data.index('#') alphabet_data = data[:hash_index] url_data = data[hash_index + 1:] + if url_data[0] == 'H' and url_data[3] == '%': + # remove useless HQ%% at the start + url_data = url_data[4:] alphabet = [] e = 0 @@ -61,7 +69,7 @@ class RTVEALaCartaIE(InfoExtractor): IE_NAME = 'rtve.es:alacarta' IE_DESC = 'RTVE a 
la carta' - _VALID_URL = r'http://www\.rtve\.es/(m/)?alacarta/videos/[^/]+/[^/]+/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/(m/)?(alacarta/videos|filmoteca)/[^/]+/[^/]+/(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.rtve.es/alacarta/videos/balonmano/o-swiss-cup-masculina-final-espana-suecia/2491869/', @@ -82,8 +90,23 @@ }, 'skip': 'The f4m manifest can\'t be used yet', }, { + 'url': 'http://www.rtve.es/alacarta/videos/servir-y-proteger/servir-proteger-capitulo-104/4236788/', + 'md5': 'e55e162379ad587e9640eda4f7353c0f', + 'info_dict': { + 'id': '4236788', + 'ext': 'mp4', + 'title': 'Servir y proteger - Capítulo 104 ', + 'duration': 3222.0, + }, + 'params': { + 'skip_download': True, # requires ffmpeg + }, + }, { 'url': 'http://www.rtve.es/m/alacarta/videos/cuentame-como-paso/cuentame-como-paso-t16-ultimo-minuto-nuestra-vida-capitulo-276/2969138/?media=tve', 'only_matching': True, + }, { + 'url': 'http://www.rtve.es/filmoteca/no-do/not-1-introduccion-primer-noticiario-espanol/1465256/', + 'only_matching': True, }] def _real_initialize(self):
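For readers puzzling over _decrypt_url above: the payload travels inside a PNG tEXt chunk, and the first step is plain PNG framing, reading the 4-byte big-endian length that precedes the chunk type, then slicing out that many bytes and dropping NUL padding. A standard-library sketch of the framing step only; the alphabet-substitution stage that follows it is specific to RTVE and omitted here:

import base64
import struct

def png_text_payload(png_base64):
    png = base64.b64decode(png_base64)
    idx = png.find(b'tEXt')                              # chunk type marker
    length = struct.unpack('!I', png[idx - 4:idx])[0]    # 4-byte length just before it
    data = bytearray(png[idx + 4:idx + 4 + length])      # chunk data follows the type
    return [chr(b) for b in data if b != 0]              # drop NUL padding, as the patch does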
@@ -101,15 +124,32 @@ video_id)['page']['items'][0] if info['state'] == 'DESPU': raise ExtractorError('The video is no longer available', expected=True) + title = info['title'] png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/%s/videos/%s.png' % (self._manager, video_id) png_request = sanitized_Request(png_url) png_request.add_header('Referer', url) png = self._download_webpage(png_request, video_id, 'Downloading url information') video_url = _decrypt_url(png) - if not video_url.endswith('.f4m'): - video_url = video_url.replace( - 'resources/', 'auth/resources/' - ).replace('.net.rtve', '.multimedia.cdn.rtve') + ext = determine_ext(video_url) + + formats = [] + if not video_url.endswith('.f4m') and ext != 'm3u8': + if '?' not in video_url: + video_url = video_url.replace('resources/', 'auth/resources/') + video_url = video_url.replace('.net.rtve', '.multimedia.cdn.rtve') + + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + video_url, video_id, f4m_id='hds', fatal=False)) + else: + formats.append({ + 'url': video_url, + }) + self._sort_formats(formats) subtitles = None if info.get('sbtFile') is not None: @@ -117,8 +157,8 @@ return { 'id': video_id, - 'title': info['title'], - 'url': video_url, + 'title': title, + 'formats': formats, 'thumbnail': info.get('image'), 'page_url': url, 'subtitles': subtitles, @@ -178,14 +218,14 @@ class RTVELiveIE(InfoExtractor): IE_NAME = 'rtve.es:live' IE_DESC = 'RTVE.es live streams' - _VALID_URL = r'http://www\.rtve\.es/(?:deportes/directo|noticias|television)/(?P<id>[a-zA-Z0-9-]+)' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/directo/(?P<id>[a-zA-Z0-9-]+)' _TESTS = [{ - 'url': 'http://www.rtve.es/noticias/directo-la-1/', + 'url': 'http://www.rtve.es/directo/la-1/', 'info_dict': { - 'id': 'directo-la-1', - 'ext': 'flv', - 'title': 're:^La 1 de TVE [0-9]{4}-[0-9]{2}-[0-9]{2}Z[0-9]{6}$', + 'id': 'la-1', + 'ext': 'mp4', + 'title': 're:^La 1 [0-9]{4}-[0-9]{2}-[0-9]{2}Z[0-9]{6}$', }, 'params': { 'skip_download': 'live stream', @@ -198,23 +238,55 @@ video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - player_url = self._search_regex( - r'<param name="movie" value="([^"]+)"\s*/>', webpage, 'player URL') - title = remove_end(self._og_search_title(webpage), ' en directo') + title = remove_end(self._og_search_title(webpage), ' en directo en RTVE.es') + title = remove_start(title, 'Estoy viendo ') title += ' ' + time.strftime('%Y-%m-%dZ%H%M%S', start_time) vidplayer_id = self._search_regex( - r' id="vidplayer([0-9]+)"', webpage, 'internal video ID') - png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/default/videos/%s.png' % vidplayer_id + (r'playerId=player([0-9]+)', + r'class=["\'].*?\blive_mod\b.*?["\'][^>]+data-assetid=["\'](\d+)', + r'data-id=["\'](\d+)'), + webpage, 'internal video ID') + png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/amonet/videos/%s.png' % vidplayer_id png = self._download_webpage(png_url, video_id, 'Downloading url information') - video_url = _decrypt_url(png) + m3u8_url = _decrypt_url(png) + formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') + self._sort_formats(formats) return { 'id': video_id, - 'ext': 'flv', 'title': title, - 'url': video_url, - 'app': 'rtve-live-live?ovpfv=2.1.2', - 'player_url': player_url, - 'rtmp_live': True, + 'formats': formats, + 'is_live': True, } + + +class RTVETelevisionIE(InfoExtractor): + IE_NAME = 'rtve.es:television' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/television/[^/]+/[^/]+/(?P<id>\d+).shtml' + + _TEST = { 'url': 'http://www.rtve.es/television/20160628/revolucion-del-movil/1364141.shtml', + 'info_dict': { + 'id': '3069778', + 'ext': 'mp4', + 'title': 'Documentos TV - La revolución del móvil', + 'duration': 3496.948, + }, + 'params': { + 'skip_download': True, + }, + } + + def _real_extract(self, url): + page_id = self._match_id(url) + webpage = self._download_webpage(url, page_id) + + alacarta_url = self._search_regex( + r'data-location="alacarta_videos"[^<]+url":"(http://www\.rtve\.es/alacarta.+?)&', + webpage, 'alacarta url', default=None) + if alacarta_url is None: + raise ExtractorError( + 'The webpage doesn\'t contain any video', expected=True) + + return self.url_result(alacarta_url, ie=RTVEALaCartaIE.ie_key())
expected=True) + + return self.url_result(alacarta_url, ie=RTVEALaCartaIE.ie_key()) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/rtvnh.py youtube-dl-2019.05.20/youtube_dl/extractor/rtvnh.py --- youtube-dl-2016.02.22/youtube_dl/extractor/rtvnh.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/rtvnh.py 2019-06-07 18:49:07.000000000 +0000 @@ -9,12 +9,12 @@ _VALID_URL = r'https?://(?:www\.)?rtvnh\.nl/video/(?P[0-9]+)' _TEST = { 'url': 'http://www.rtvnh.nl/video/131946', - 'md5': '6e1d0ab079e2a00b6161442d3ceacfc1', + 'md5': 'cdbec9f44550763c8afc96050fa747dc', 'info_dict': { 'id': '131946', 'ext': 'mp4', 'title': 'Grote zoektocht in zee bij Zandvoort naar vermiste vrouw', - 'thumbnail': 're:^https?:.*\.jpg$' + 'thumbnail': r're:^https?:.*\.jpg$' } } @@ -29,15 +29,30 @@ raise ExtractorError( '%s returned error code %d' % (self.IE_NAME, status), expected=True) - formats = self._extract_smil_formats( - 'http://www.rtvnh.nl/video/smil?m=' + video_id, video_id, fatal=False) - - for item in meta['source']['fb']: - if item.get('type') == 'hls': - formats.extend(self._extract_m3u8_formats( - item['file'], video_id, ext='mp4', entry_protocol='m3u8_native')) - elif item.get('type') == '': - formats.append({'url': item['file']}) + formats = [] + rtmp_formats = self._extract_smil_formats( + 'http://www.rtvnh.nl/video/smil?m=' + video_id, video_id) + formats.extend(rtmp_formats) + + for rtmp_format in rtmp_formats: + rtmp_url = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path']) + rtsp_format = rtmp_format.copy() + del rtsp_format['play_path'] + del rtsp_format['ext'] + rtsp_format.update({ + 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'), + 'url': rtmp_url.replace('rtmp://', 'rtsp://'), + 'protocol': 'rtsp', + }) + formats.append(rtsp_format) + http_base_url = rtmp_url.replace('rtmp://', 'http://') + formats.extend(self._extract_m3u8_formats( + http_base_url + '/playlist.m3u8', video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + formats.extend(self._extract_f4m_formats( + http_base_url + '/manifest.f4m', + video_id, f4m_id='hds', fatal=False)) + self._sort_formats(formats) return { 'id': video_id, diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/rtvs.py youtube-dl-2019.05.20/youtube_dl/extractor/rtvs.py --- youtube-dl-2016.02.22/youtube_dl/extractor/rtvs.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/rtvs.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,47 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class RTVSIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rtvs\.sk/(?:radio|televizia)/archiv/\d+/(?P\d+)' + _TESTS = [{ + # radio archive + 'url': 'http://www.rtvs.sk/radio/archiv/11224/414872', + 'md5': '134d5d6debdeddf8a5d761cbc9edacb8', + 'info_dict': { + 'id': '414872', + 'ext': 'mp3', + 'title': 'Ostrov pokladov 1 časť.mp3' + }, + 'params': { + 'skip_download': True, + } + }, { + # tv archive + 'url': 'http://www.rtvs.sk/televizia/archiv/8249/63118', + 'md5': '85e2c55cf988403b70cac24f5c086dc6', + 'info_dict': { + 'id': '63118', + 'ext': 'mp4', + 'title': 'Amaro Džives - Náš deň', + 'description': 'Galavečer pri príležitosti Medzinárodného dňa Rómov.' 
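Note on the rtve.py change above: it is the template for most format-handling updates in this patch — instead of returning a single 'url', the extractor inspects the extension of the decrypted media URL and builds a 'formats' list (HLS for .m3u8, HDS for .f4m, a plain progressive entry otherwise). A minimal standalone sketch of that dispatch (Python 3; determine_ext is simplified, and the probe callables stand in for youtube-dl's _extract_m3u8_formats/_extract_f4m_formats — their names and bodies here are illustrative, not part of the patch):

from os.path import splitext
from urllib.parse import urlparse


def determine_ext(url):
    # Simplified take on youtube_dl.utils.determine_ext:
    # the extension of the URL path, without the leading dot.
    return splitext(urlparse(url).path)[1].lstrip('.')


def build_formats(media_url, probe_m3u8, probe_f4m):
    # probe_m3u8/probe_f4m expand a manifest URL into per-variant entries,
    # as _extract_m3u8_formats/_extract_f4m_formats do in the real code.
    ext = determine_ext(media_url)
    if ext == 'm3u8':
        return probe_m3u8(media_url)
    elif ext == 'f4m':
        return probe_f4m(media_url)
    return [{'url': media_url}]  # progressive download


if __name__ == '__main__':
    hls = lambda u: [{'format_id': 'hls', 'url': u}]
    hds = lambda u: [{'format_id': 'hds', 'url': u}]
    print(build_formats('http://example.com/master.m3u8', hls, hds))
    print(build_formats('http://example.com/video.mp4', hls, hds))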
diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/rtvnh.py youtube-dl-2019.05.20/youtube_dl/extractor/rtvnh.py
--- youtube-dl-2016.02.22/youtube_dl/extractor/rtvnh.py 2015-12-30 19:30:33.000000000 +0000
+++ youtube-dl-2019.05.20/youtube_dl/extractor/rtvnh.py 2019-06-07 18:49:07.000000000 +0000
@@ -9,12 +9,12 @@
     _VALID_URL = r'https?://(?:www\.)?rtvnh\.nl/video/(?P<id>[0-9]+)'
     _TEST = {
         'url': 'http://www.rtvnh.nl/video/131946',
-        'md5': '6e1d0ab079e2a00b6161442d3ceacfc1',
+        'md5': 'cdbec9f44550763c8afc96050fa747dc',
         'info_dict': {
             'id': '131946',
             'ext': 'mp4',
             'title': 'Grote zoektocht in zee bij Zandvoort naar vermiste vrouw',
-            'thumbnail': 're:^https?:.*\.jpg$'
+            'thumbnail': r're:^https?:.*\.jpg$'
         }
     }
@@ -29,15 +29,30 @@
             raise ExtractorError(
                 '%s returned error code %d' % (self.IE_NAME, status), expected=True)

-        formats = self._extract_smil_formats(
-            'http://www.rtvnh.nl/video/smil?m=' + video_id, video_id, fatal=False)
-
-        for item in meta['source']['fb']:
-            if item.get('type') == 'hls':
-                formats.extend(self._extract_m3u8_formats(
-                    item['file'], video_id, ext='mp4', entry_protocol='m3u8_native'))
-            elif item.get('type') == '':
-                formats.append({'url': item['file']})
+        formats = []
+        rtmp_formats = self._extract_smil_formats(
+            'http://www.rtvnh.nl/video/smil?m=' + video_id, video_id)
+        formats.extend(rtmp_formats)
+
+        for rtmp_format in rtmp_formats:
+            rtmp_url = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
+            rtsp_format = rtmp_format.copy()
+            del rtsp_format['play_path']
+            del rtsp_format['ext']
+            rtsp_format.update({
+                'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
+                'url': rtmp_url.replace('rtmp://', 'rtsp://'),
+                'protocol': 'rtsp',
+            })
+            formats.append(rtsp_format)
+            http_base_url = rtmp_url.replace('rtmp://', 'http://')
+            formats.extend(self._extract_m3u8_formats(
+                http_base_url + '/playlist.m3u8', video_id, 'mp4',
+                'm3u8_native', m3u8_id='hls', fatal=False))
+            formats.extend(self._extract_f4m_formats(
+                http_base_url + '/manifest.f4m',
+                video_id, f4m_id='hds', fatal=False))
+
         self._sort_formats(formats)

         return {
             'id': video_id,
diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/rtvs.py youtube-dl-2019.05.20/youtube_dl/extractor/rtvs.py
--- youtube-dl-2016.02.22/youtube_dl/extractor/rtvs.py 1970-01-01 00:00:00.000000000 +0000
+++ youtube-dl-2019.05.20/youtube_dl/extractor/rtvs.py 2019-06-07 18:49:07.000000000 +0000
@@ -0,0 +1,47 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class RTVSIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?rtvs\.sk/(?:radio|televizia)/archiv/\d+/(?P<id>\d+)'
+    _TESTS = [{
+        # radio archive
+        'url': 'http://www.rtvs.sk/radio/archiv/11224/414872',
+        'md5': '134d5d6debdeddf8a5d761cbc9edacb8',
+        'info_dict': {
+            'id': '414872',
+            'ext': 'mp3',
+            'title': 'Ostrov pokladov 1 časť.mp3'
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }, {
+        # tv archive
+        'url': 'http://www.rtvs.sk/televizia/archiv/8249/63118',
+        'md5': '85e2c55cf988403b70cac24f5c086dc6',
+        'info_dict': {
+            'id': '63118',
+            'ext': 'mp4',
+            'title': 'Amaro Džives - Náš deň',
+            'description': 'Galavečer pri príležitosti Medzinárodného dňa Rómov.'
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        playlist_url = self._search_regex(
+            r'playlist["\']?\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
+            'playlist url', group='url')
+
+        data = self._download_json(
+            playlist_url, video_id, 'Downloading playlist')[0]
+        return self._parse_jwplayer_data(data, video_id=video_id)
diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/rudo.py youtube-dl-2019.05.20/youtube_dl/extractor/rudo.py
--- youtube-dl-2016.02.22/youtube_dl/extractor/rudo.py 1970-01-01 00:00:00.000000000 +0000
+++ youtube-dl-2019.05.20/youtube_dl/extractor/rudo.py 2019-06-07 18:49:07.000000000 +0000
@@ -0,0 +1,53 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    js_to_json,
+    get_element_by_class,
+    unified_strdate,
+)
+
+
+class RudoIE(InfoExtractor):
+    _VALID_URL = r'https?://rudo\.video/vod/(?P<id>[0-9a-zA-Z]+)'
+
+    _TEST = {
+        'url': 'http://rudo.video/vod/oTzw0MGnyG',
+        'md5': '2a03a5b32dd90a04c83b6d391cf7b415',
+        'info_dict': {
+            'id': 'oTzw0MGnyG',
+            'ext': 'mp4',
+            'title': 'Comentario Tomás Mosciatti',
+            'upload_date': '20160617',
+        },
+    }
+
+    @classmethod
+    def _extract_url(cls, webpage):
+        mobj = re.search(
+            r'<iframe[^>]+src=(?P<q1>[\'"])(?P<url>(?:https?:)?//rudo\.video/vod/[0-9a-zA-Z]+)(?P=q1)',
+            webpage)
+        if mobj:
+            return mobj.group('url')
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id, encoding='iso-8859-1')
+
+        jwplayer_data = self._parse_json(self._search_regex(
+            r'(?s)playerInstance\.setup\(({.+?})\)', webpage, 'jwplayer data'), video_id,
+            transform_source=lambda s: js_to_json(re.sub(r'encodeURI\([^)]+\)', '""', s)))
+
+        info_dict = self._parse_jwplayer_data(
+            jwplayer_data, video_id, require_title=False, m3u8_id='hls', mpd_id='dash')
+
+        info_dict.update({
+            'title': self._og_search_title(webpage),
+            'upload_date': unified_strdate(get_element_by_class('date', webpage)),
+        })
+
+        return info_dict
diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/ruhd.py youtube-dl-2019.05.20/youtube_dl/extractor/ruhd.py
--- youtube-dl-2016.02.22/youtube_dl/extractor/ruhd.py 2015-12-30 19:30:33.000000000 +0000
+++ youtube-dl-2019.05.20/youtube_dl/extractor/ruhd.py 2019-06-07 18:49:07.000000000 +0000
@@ -1,11 +1,11 @@
-# -*- coding: utf-8 -*-
+# coding: utf-8
 from __future__ import unicode_literals

 from .common import InfoExtractor


 class RUHDIE(InfoExtractor):
-    _VALID_URL = r'http://(?:www\.)?ruhd\.ru/play\.php\?vid=(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:www\.)?ruhd\.ru/play\.php\?vid=(?P<id>\d+)'
     _TEST = {
         'url': 'http://www.ruhd.ru/play.php?vid=207',
         'md5': 'd1a9ec4edf8598e3fbd92bb16072ba83',
@@ -14,7 +14,7 @@
             'ext': 'divx',
             'title': 'КОТ бааааам',
             'description': 'классный кот)',
-            'thumbnail': 're:^http://.*\.jpg$',
+            'thumbnail': r're:^http://.*\.jpg$',
         }
     }
@@ -25,7 +25,7 @@
         video_url = self._html_search_regex(
             r'<param name="src" value="([^"]+)"', webpage, 'video url')
         title = self._html_search_regex(
-            r'<title>([^<]+)&nbsp;&nbsp;&nbsp;&nbsp;RUHD.ru - Видео Высокого качества №1 в России!</title>',
+            r'<title>([^<]+)&nbsp;&nbsp;&nbsp;&nbsp;RUHD\.ru - Видео Высокого качества №1 в России!</title>',
             webpage, 'title')
         description = self._html_search_regex(
             r'(?s)<div id="longdesc">(.+?)<span id="showlink">',
diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/rutube.py youtube-dl-2019.05.20/youtube_dl/extractor/rutube.py
--- youtube-dl-2016.02.22/youtube_dl/extractor/rutube.py 2015-12-30 19:30:33.000000000 +0000
+++ youtube-dl-2019.05.20/youtube_dl/extractor/rutube.py 2019-06-07 18:49:07.000000000 +0000
@@ -1,4 +1,4 @@
-# encoding: utf-8
+# coding: utf-8
 from __future__ import unicode_literals

 import re
@@ -7,94 +7,155 @@
 from .common import InfoExtractor
 from ..compat import (
     compat_str,
+    compat_parse_qs,
+    compat_urllib_parse_urlparse,
 )
 from ..utils import (
     determine_ext,
-    unified_strdate,
+    bool_or_none,
+    int_or_none,
+    try_get,
+    unified_timestamp,
+    url_or_none,
 )


-class RutubeIE(InfoExtractor):
+class RutubeBaseIE(InfoExtractor):
+    def _download_api_info(self, video_id, query=None):
+        if not query:
+            query = {}
+        query['format'] = 'json'
+        return self._download_json(
+            'http://rutube.ru/api/video/%s/' % video_id,
+            video_id, 'Downloading video JSON',
+            'Unable to download video JSON', query=query)
+
+    @staticmethod
+    def _extract_info(video, video_id=None, require_title=True):
+        title = video['title'] if require_title else video.get('title')
+
+        age_limit = video.get('is_adult')
+        if age_limit is not None:
+            age_limit = 18 if age_limit is True else 0
+
+        uploader_id = try_get(video, lambda x: x['author']['id'])
+        category = try_get(video, lambda x: x['category']['name'])
+
+        return {
+            'id': video.get('id') or video_id if video_id else video['id'],
+            'title': title,
+            'description': video.get('description'),
+            'thumbnail': video.get('thumbnail_url'),
+            'duration': int_or_none(video.get('duration')),
+            'uploader': try_get(video, lambda x: x['author']['name']),
+            'uploader_id': compat_str(uploader_id) if uploader_id else None,
+            'timestamp': unified_timestamp(video.get('created_ts')),
+            'category': [category] if category else None,
+            'age_limit': age_limit,
+            'view_count': int_or_none(video.get('hits')),
+            'comment_count': int_or_none(video.get('comments_count')),
+            'is_live': bool_or_none(video.get('is_livestream')),
+        }
+
+    def _download_and_extract_info(self, video_id, query=None):
+        return self._extract_info(
+            self._download_api_info(video_id, query=query), video_id)
+
+    def _download_api_options(self, video_id, query=None):
+        if not query:
+            query = {}
+        query['format'] = 'json'
+        return self._download_json(
+            'http://rutube.ru/api/play/options/%s/' % video_id,
+            video_id, 'Downloading options JSON',
+            'Unable to download options JSON',
+            headers=self.geo_verification_headers(), query=query)
+
+    def _extract_formats(self, options, video_id):
+        formats = []
+        for format_id, format_url in options['video_balancer'].items():
+            ext = determine_ext(format_url)
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    format_url, video_id, 'mp4', m3u8_id=format_id, fatal=False))
+            elif ext == 'f4m':
+                formats.extend(self._extract_f4m_formats(
+                    format_url, video_id, f4m_id=format_id, fatal=False))
+            else:
+                formats.append({
+                    'url': format_url,
+                    'format_id': format_id,
+                })
+        self._sort_formats(formats)
+        return formats
+
+    def _download_and_extract_formats(self, video_id, query=None):
+        return self._extract_formats(
+            self._download_api_options(video_id, query=query), video_id)
+
+
+class RutubeIE(RutubeBaseIE):
     IE_NAME = 'rutube'
     IE_DESC = 'Rutube videos'
-    _VALID_URL = r'https?://rutube\.ru/(?:video|play/embed)/(?P<id>[\da-z]{32})'
+    _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/(?P<id>[\da-z]{32})'

     _TESTS = [{
         'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/',
+        'md5': '1d24f180fac7a02f3900712e5a5764d6',
         'info_dict': {
             'id': '3eac3b4561676c17df9132a9a1e62e3e',
             'ext': 'mp4',
             'title': 'Раненный кенгуру забежал в аптеку',
             'description': 'http://www.ntdtv.ru ',
-            'duration': 80,
+            'duration': 81,
             'uploader': 'NTDRussian',
             'uploader_id': '29790',
+            'timestamp': 1381943602,
             'upload_date': '20131016',
             'age_limit': 0,
         },
-        'params': {
-            # It requires ffmpeg (m3u8 download)
-            'skip_download': True,
-        },
     }, {
         'url': 'http://rutube.ru/play/embed/a10e53b86e8f349080f718582ce4c661',
         'only_matching': True,
+    }, {
+        'url': 'http://rutube.ru/embed/a10e53b86e8f349080f718582ce4c661',
+        'only_matching': True,
+    }, {
+        'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/?pl_id=4252',
+        'only_matching': True,
+    }, {
+        'url': 'https://rutube.ru/video/10b3a03fc01d5bbcc632a2f3514e8aab/?pl_type=source',
+        'only_matching': True,
     }]

+    @classmethod
+    def suitable(cls, url):
+        return False if RutubePlaylistIE.suitable(url) else super(RutubeIE, cls).suitable(url)
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return [mobj.group('url') for mobj in re.finditer(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//rutube\.ru/embed/[\da-z]{32}.*?)\1',
+            webpage)]
+
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        video = self._download_json(
-            'http://rutube.ru/api/video/%s/?format=json' % video_id,
-            video_id, 'Downloading video JSON')
-
-        # Some videos don't have the author field
-        author = video.get('author') or {}
-
-        options = self._download_json(
-            'http://rutube.ru/api/play/options/%s/?format=json' % video_id,
-            video_id, 'Downloading options JSON')
-
-        formats = []
-        for format_id, format_url in options['video_balancer'].items():
-            ext = determine_ext(format_url)
-            if ext == 'm3u8':
-                formats.extend(self._extract_m3u8_formats(
-                    format_url, video_id, 'mp4', m3u8_id=format_id, fatal=False))
-            elif ext == 'f4m':
-                formats.extend(self._extract_f4m_formats(
-                    format_url, video_id, f4m_id=format_id, fatal=False))
-            else:
-                formats.append({
-                    'url': format_url,
-                    'format_id': format_id,
-                })
-        self._sort_formats(formats)
-
-        return {
-            'id': video['id'],
-            'title': video['title'],
-            'description': video['description'],
-            'duration': video['duration'],
-            'view_count': video['hits'],
-            'formats': formats,
-            'thumbnail': video['thumbnail_url'],
-            'uploader': author.get('name'),
-            'uploader_id': compat_str(author['id']) if author else None,
-            'upload_date': unified_strdate(video['created_ts']),
-            'age_limit': 18 if video['is_adult'] else 0,
-        }
+        info = self._download_and_extract_info(video_id)
+        info['formats'] = self._download_and_extract_formats(video_id)
+        return info


-class RutubeEmbedIE(InfoExtractor):
+class RutubeEmbedIE(RutubeBaseIE):
     IE_NAME = 'rutube:embed'
     IE_DESC = 'Rutube embedded videos'
-    _VALID_URL = 'https?://rutube\.ru/(?:video|play)/embed/(?P<id>[0-9]+)'
+    _VALID_URL = r'https?://rutube\.ru/(?:video|play)/embed/(?P<id>[0-9]+)'

     _TESTS = [{
         'url': 'http://rutube.ru/video/embed/6722881?vk_puid37=&vk_puid38=',
         'info_dict': {
             'id': 'a10e53b86e8f349080f718582ce4c661',
             'ext': 'mp4',
+            'timestamp': 1387830582,
             'upload_date': '20131223',
             'uploader_id': '297833',
             'description': 'Видео группы ★http://vk.com/foxkidsreset★ музей Fox Kids и Jetix<br/><br/> восстановлено и сделано в шикоформате subziro89 http://vk.com/subziro89',
@@ -102,27 +163,78 @@
             'title': 'Мистический городок Эйри в Индиан 5 серия озвучка subziro89',
         },
         'params': {
-            'skip_download': 'Requires ffmpeg',
+            'skip_download': True,
         },
     }, {
         'url': 'http://rutube.ru/play/embed/8083783',
         'only_matching': True,
+    }, {
+        # private video
+        'url': 'https://rutube.ru/play/embed/10631925?p=IbAigKqWd1do4mjaM5XLIQ',
+        'only_matching': True,
     }]

     def _real_extract(self, url):
         embed_id = self._match_id(url)
-        webpage = self._download_webpage(url, embed_id)
+        # Query may contain private videos token and should be passed to API
+        # requests (see #19163)
+        query = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+        options = self._download_api_options(embed_id, query)
+        video_id = options['effective_video']
+        formats = self._extract_formats(options, video_id)
+        info = self._download_and_extract_info(video_id, query)
+        info.update({
+            'extractor_key': 'Rutube',
+            'formats': formats,
+        })
+        return info

-        canonical_url = self._html_search_regex(
-            r'<link\s+rel="canonical"\s+href="([^"]+?)"', webpage,
-            'Canonical URL')
-        return self.url_result(canonical_url, 'Rutube')
+
+class RutubePlaylistBaseIE(RutubeBaseIE):
+    def _next_page_url(self, page_num, playlist_id, *args, **kwargs):
+        return self._PAGE_TEMPLATE % (playlist_id, page_num)
+
+    def _entries(self, playlist_id, *args, **kwargs):
+        next_page_url = None
+        for pagenum in itertools.count(1):
+            page = self._download_json(
+                next_page_url or self._next_page_url(
+                    pagenum, playlist_id, *args, **kwargs),
+                playlist_id, 'Downloading page %s' % pagenum)
+
+            results = page.get('results')
+            if not results:
+                break
+
+            for result in results:
+                video_url = url_or_none(result.get('video_url'))
+                if not video_url:
+                    continue
+                entry = self._extract_info(result, require_title=False)
+                entry.update({
+                    '_type': 'url',
+                    'url': video_url,
+                    'ie_key': RutubeIE.ie_key(),
+                })
+                yield entry
+
+            next_page_url = page.get('next')
+            if not next_page_url or not page.get('has_next'):
+                break
+
+    def _extract_playlist(self, playlist_id, *args, **kwargs):
+        return self.playlist_result(
+            self._entries(playlist_id, *args, **kwargs),
+            playlist_id, kwargs.get('playlist_name'))
+
+    def _real_extract(self, url):
+        return self._extract_playlist(self._match_id(url))


-class RutubeChannelIE(InfoExtractor):
+class RutubeChannelIE(RutubePlaylistBaseIE):
     IE_NAME = 'rutube:channel'
     IE_DESC = 'Rutube channels'
-    _VALID_URL = r'http://rutube\.ru/tags/video/(?P<id>\d+)'
+    _VALID_URL = r'https?://rutube\.ru/tags/video/(?P<id>\d+)'
     _TESTS = [{
         'url': 'http://rutube.ru/tags/video/1800/',
         'info_dict': {
@@ -133,30 +245,11 @@

     _PAGE_TEMPLATE = 'http://rutube.ru/api/tags/video/%s/?page=%s&format=json'

-    def _extract_videos(self, channel_id, channel_title=None):
-        entries = []
-        for pagenum in itertools.count(1):
-            page = self._download_json(
-                self._PAGE_TEMPLATE % (channel_id, pagenum),
-                channel_id, 'Downloading page %s' % pagenum)
-            results = page['results']
-            if not results:
-                break
-            entries.extend(self.url_result(result['video_url'], 'Rutube') for result in results)
-            if not page['has_next']:
-                break
-        return self.playlist_result(entries, channel_id, channel_title)
-
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        channel_id = mobj.group('id')
-        return self._extract_videos(channel_id)
-

-class RutubeMovieIE(RutubeChannelIE):
+class RutubeMovieIE(RutubePlaylistBaseIE):
     IE_NAME = 'rutube:movie'
     IE_DESC = 'Rutube movies'
-    _VALID_URL = r'http://rutube\.ru/metainfo/tv/(?P<id>\d+)'
+    _VALID_URL = r'https?://rutube\.ru/metainfo/tv/(?P<id>\d+)'
     _TESTS = []

     _MOVIE_TEMPLATE = 'http://rutube.ru/api/metainfo/tv/%s/?format=json'
@@ -167,14 +260,14 @@
         movie = self._download_json(
             self._MOVIE_TEMPLATE % movie_id, movie_id,
             'Downloading movie JSON')
-        movie_name = movie['name']
-        return self._extract_videos(movie_id, movie_name)
+        return self._extract_playlist(
+            movie_id, playlist_name=movie.get('name'))


-class RutubePersonIE(RutubeChannelIE):
+class RutubePersonIE(RutubePlaylistBaseIE):
     IE_NAME = 'rutube:person'
     IE_DESC = 'Rutube person videos'
-    _VALID_URL = r'http://rutube\.ru/video/person/(?P<id>\d+)'
+    _VALID_URL = r'https?://rutube\.ru/video/person/(?P<id>\d+)'
     _TESTS = [{
         'url': 'http://rutube.ru/video/person/313878/',
         'info_dict': {
@@ -184,3 +277,37 @@
     }]

     _PAGE_TEMPLATE = 'http://rutube.ru/api/video/person/%s/?page=%s&format=json'
+
+
+class RutubePlaylistIE(RutubePlaylistBaseIE):
+    IE_NAME = 'rutube:playlist'
+    IE_DESC = 'Rutube playlists'
+    _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/[\da-z]{32}/\?.*?\bpl_id=(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://rutube.ru/video/cecd58ed7d531fc0f3d795d51cee9026/?pl_id=3097&pl_type=tag',
+        'info_dict': {
+            'id': '3097',
+        },
+        'playlist_count': 27,
+    }, {
+        'url': 'https://rutube.ru/video/10b3a03fc01d5bbcc632a2f3514e8aab/?pl_id=4252&pl_type=source',
+        'only_matching': True,
+    }]
+
+    _PAGE_TEMPLATE = 'http://rutube.ru/api/playlist/%s/%s/?page=%s&format=json'
+
+    @classmethod
+    def suitable(cls, url):
+        if not super(RutubePlaylistIE, cls).suitable(url):
+            return False
+        params = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+        return params.get('pl_type', [None])[0] and int_or_none(params.get('pl_id', [None])[0])
+
+    def _next_page_url(self, page_num, playlist_id, item_kind):
+        return self._PAGE_TEMPLATE % (item_kind, playlist_id, page_num)
+
+    def _real_extract(self, url):
+        qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+        playlist_kind = qs['pl_type'][0]
+        playlist_id = qs['pl_id'][0]
+        return self._extract_playlist(playlist_id, item_kind=playlist_kind)
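For orientation, the rewritten rutube.py above reduces Rutube extraction to two JSON endpoints: /api/video/<id>/ for metadata and /api/play/options/<id>/ for the 'video_balancer' map of format ids to media URLs. A rough standalone equivalent (a Python 3 stdlib sketch; error handling, the private-video query token and the geo-verification headers that the real code forwards are all omitted here):

import json
from urllib.request import urlopen

API = 'http://rutube.ru/api'


def fetch_rutube(video_id):
    # Metadata: title, description, author, created_ts, hits, ...
    with urlopen('%s/video/%s/?format=json' % (API, video_id)) as f:
        info = json.load(f)
    # Playback options: 'video_balancer' maps format ids to media URLs
    with urlopen('%s/play/options/%s/?format=json' % (API, video_id)) as f:
        options = json.load(f)
    formats = [{'format_id': fid, 'url': furl}
               for fid, furl in options['video_balancer'].items()]
    return {'id': info.get('id'), 'title': info.get('title'),
            'formats': formats}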
diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/rutv.py youtube-dl-2019.05.20/youtube_dl/extractor/rutv.py
--- youtube-dl-2016.02.22/youtube_dl/extractor/rutv.py 2015-12-30 19:30:33.000000000 +0000
+++ youtube-dl-2019.05.20/youtube_dl/extractor/rutv.py 2019-06-07 18:49:07.000000000 +0000
@@ -1,4 +1,4 @@
-# encoding: utf-8
+# coding: utf-8
 from __future__ import unicode_literals

 import re
@@ -13,11 +13,15 @@
 class RUTVIE(InfoExtractor):
     IE_DESC = 'RUTV.RU'
     _VALID_URL = r'''(?x)
-        https?://player\.(?:rutv\.ru|vgtrk\.com)/
-        (?P<path>flash2v/container\.swf\?id=
-        |iframe/(?P<type>swf|video|live)/id/
-        |index/iframe/cast_id/)
-        (?P<id>\d+)'''
+        https?://
+            (?:test)?player\.(?:rutv\.ru|vgtrk\.com)/
+            (?P<path>
+                flash\d+v/container\.swf\?id=|
+                iframe/(?P<type>swf|video|live)/id/|
+                index/iframe/cast_id/
+            )
+            (?P<id>\d+)
+    '''

     _TESTS = [
         {
@@ -99,17 +103,21 @@
                 'skip_download': True,
             },
         },
+        {
+            'url': 'https://testplayer.vgtrk.com/iframe/live/id/19201/showZoomBtn/false/isPlay/true/',
+            'only_matching': True,
+        },
     ]

     @classmethod
     def _extract_url(cls, webpage):
         mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.(?:rutv\.ru|vgtrk\.com)/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage)
+            r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:test)?player\.(?:rutv\.ru|vgtrk\.com)/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage)
         if mobj:
             return mobj.group('url')
         mobj = re.search(
-            r'<meta[^>]+?property=(["\'])og:video\1[^>]+?content=(["\'])(?P<url>https?://player\.(?:rutv\.ru|vgtrk\.com)/flash2v/container\.swf\?id=.+?\2)',
+            r'<meta[^>]+?property=(["\'])og:video\1[^>]+?content=(["\'])(?P<url>https?://(?:test)?player\.(?:rutv\.ru|vgtrk\.com)/flash\d+v/container\.swf\?id=.+?\2)',
             webpage)
         if mobj:
             return mobj.group('url')
@@ -119,7 +127,7 @@
         video_id = mobj.group('id')
         video_path = mobj.group('path')

-        if video_path.startswith('flash2v'):
+        if re.match(r'flash\d+v', video_path):
             video_type = 'video'
         elif video_path.startswith('iframe'):
             video_type = mobj.group('type')
@@ -168,7 +176,7 @@
                     'play_path': mobj.group('playpath'),
                     'app': mobj.group('app'),
                     'page_url': 'http://player.rutv.ru',
-                    'player_url': 'http://player.rutv.ru/flash2v/osmf.swf?i=22',
+                    'player_url': 'http://player.rutv.ru/flash3v/osmf.swf?i=22',
                     'rtmp_live': True,
                     'ext': 'flv',
                     'vbr': int(quality),
diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/ruutu.py youtube-dl-2019.05.20/youtube_dl/extractor/ruutu.py
--- youtube-dl-2016.02.22/youtube_dl/extractor/ruutu.py 2016-01-09 00:15:59.000000000 +0000
+++ youtube-dl-2019.05.20/youtube_dl/extractor/ruutu.py 2019-06-07 18:49:07.000000000 +0000
@@ -5,6 +5,7 @@
 from ..compat import compat_urllib_parse_urlparse
 from ..utils import (
     determine_ext,
+    ExtractorError,
     int_or_none,
     xpath_attr,
     xpath_text,
@@ -12,7 +13,7 @@


 class RuutuIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?ruutu\.fi/video/(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:www\.)?(?:ruutu|supla)\.fi/(?:video|supla)/(?P<id>\d+)'
     _TESTS = [
         {
             'url': 'http://www.ruutu.fi/video/2058907',
@@ -22,7 +23,7 @@
                 'ext': 'mp4',
                 'title': 'Oletko aina halunnut tietää mitä tapahtuu vain hetki ennen lähetystä? - Nyt se selvisi!',
                 'description': 'md5:cfc6ccf0e57a814360df464a91ff67d6',
-                'thumbnail': 're:^https?://.*\.jpg$',
+                'thumbnail': r're:^https?://.*\.jpg$',
                 'duration': 114,
                 'age_limit': 0,
             },
@@ -34,19 +35,52 @@
                 'id': '2057306',
                 'ext': 'mp4',
                 'title': 'Superpesis: katso koko kausi Ruudussa',
-                'description': 'md5:da2736052fef3b2bd5e0005e63c25eac',
-                'thumbnail': 're:^https?://.*\.jpg$',
+                'description': 'md5:bfb7336df2a12dc21d18fa696c9f8f23',
+                'thumbnail': r're:^https?://.*\.jpg$',
                 'duration': 40,
                 'age_limit': 0,
             },
         },
+        {
+            'url': 'http://www.supla.fi/supla/2231370',
+            'md5': 'df14e782d49a2c0df03d3be2a54ef949',
+            'info_dict': {
+                'id': '2231370',
+                'ext': 'mp4',
+                'title': 'Osa 1: Mikael Jungner',
+                'description': 'md5:7d90f358c47542e3072ff65d7b1bcffe',
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'age_limit': 0,
+            },
+        },
+        # Episode where <SourceFile> is "NOT-USED", but has other
+        # downloadable sources available.
+        {
+            'url': 'http://www.ruutu.fi/video/3193728',
+            'only_matching': True,
+        },
+        {
+            # audio podcast
+            'url': 'https://www.supla.fi/supla/3382410',
+            'md5': 'b9d7155fed37b2ebf6021d74c4b8e908',
+            'info_dict': {
+                'id': '3382410',
+                'ext': 'mp3',
+                'title': 'Mikä ihmeen poltergeist?',
+                'description': 'md5:bbb6963df17dfd0ecd9eb9a61bf14b52',
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'age_limit': 0,
+            },
+            'expected_warnings': ['HTTP Error 502: Bad Gateway'],
+        }
     ]

     def _real_extract(self, url):
         video_id = self._match_id(url)

         video_xml = self._download_xml(
-            'http://gatling.ruutu.fi/media-xml-cache?id=%s' % video_id, video_id)
+            'https://gatling.nelonenmedia.fi/media-xml-cache', video_id,
+            query={'id': video_id})

         formats = []
         processed_urls = []
@@ -57,9 +91,9 @@
                     extract_formats(child)
                 elif child.tag.endswith('File'):
                     video_url = child.text
-                    if (not video_url or video_url in processed_urls or
-                            any(p in video_url for p in ('NOT_USED', 'NOT-USED'))):
-                        return
+                    if (not video_url or video_url in processed_urls
+                            or any(p in video_url for p in ('NOT_USED', 'NOT-USED'))):
+                        continue
                     processed_urls.append(video_url)
                     ext = determine_ext(video_url)
                     if ext == 'm3u8':
@@ -68,6 +102,18 @@
                     elif ext == 'f4m':
                         formats.extend(self._extract_f4m_formats(
                             video_url, video_id, f4m_id='hds', fatal=False))
+                    elif ext == 'mpd':
+                        # video-only and audio-only streams are of different
+                        # duration resulting in out of sync issue
+                        continue
+                        formats.extend(self._extract_mpd_formats(
+                            video_url, video_id, mpd_id='dash', fatal=False))
+                    elif ext == 'mp3' or child.tag == 'AudioMediaFile':
+                        formats.append({
+                            'format_id': 'audio',
+                            'url': video_url,
+                            'vcodec': 'none',
+                        })
                     else:
                         proto = compat_urllib_parse_urlparse(video_url).scheme
                         if not child.tag.startswith('HTTP') and proto != 'rtmp':
@@ -89,6 +135,11 @@
                         })

         extract_formats(video_xml.find('./Clip'))
+
+        drm = xpath_text(video_xml, './Clip/DRM', default=None)
+        if not formats and drm:
+            raise ExtractorError('This video is DRM protected.', expected=True)
+
         self._sort_formats(formats)

         return {
diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/ruv.py youtube-dl-2019.05.20/youtube_dl/extractor/ruv.py
--- youtube-dl-2016.02.22/youtube_dl/extractor/ruv.py 1970-01-01 00:00:00.000000000 +0000
+++ youtube-dl-2019.05.20/youtube_dl/extractor/ruv.py 2019-06-07 18:49:07.000000000 +0000
@@ -0,0 +1,101 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    unified_timestamp,
+)
+
+
+class RuvIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?ruv\.is/(?:sarpurinn/[^/]+|node)/(?P<id>[^/]+(?:/\d+)?)'
+    _TESTS = [{
+        # m3u8
+        'url': 'http://ruv.is/sarpurinn/ruv-aukaras/fh-valur/20170516',
+        'md5': '66347652f4e13e71936817102acc1724',
+        'info_dict': {
+            'id': '1144499',
+            'display_id': 'fh-valur/20170516',
+            'ext': 'mp4',
+            'title': 'FH - Valur',
+            'description': 'Bein útsending frá 3. leik FH og Vals í úrslitum Olísdeildar karla í handbolta.',
+            'timestamp': 1494963600,
+            'upload_date': '20170516',
+        },
+    }, {
+        # mp3
+        'url': 'http://ruv.is/sarpurinn/ras-2/morgunutvarpid/20170619',
+        'md5': '395ea250c8a13e5fdb39d4670ef85378',
+        'info_dict': {
+            'id': '1153630',
+            'display_id': 'morgunutvarpid/20170619',
+            'ext': 'mp3',
+            'title': 'Morgunútvarpið',
+            'description': 'md5:a4cf1202c0a1645ca096b06525915418',
+            'timestamp': 1497855000,
+            'upload_date': '20170619',
+        },
+    }, {
+        'url': 'http://ruv.is/sarpurinn/ruv/frettir/20170614',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.ruv.is/node/1151854',
+        'only_matching': True,
+    }, {
+        'url': 'http://ruv.is/sarpurinn/klippa/secret-soltice-hefst-a-morgun',
+        'only_matching': True,
+    }, {
+        'url': 'http://ruv.is/sarpurinn/ras-1/morgunvaktin/20170619',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        title = self._og_search_title(webpage)
+
+        FIELD_RE = r'video\.%s\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1'
+
+        media_url = self._html_search_regex(
+            FIELD_RE % 'src', webpage, 'video URL', group='url')
+
+        video_id = self._search_regex(
+            r'<link\b[^>]+\bhref=["\']https?://www\.ruv\.is/node/(\d+)',
+            webpage, 'video id', default=display_id)
+
+        ext = determine_ext(media_url)
+
+        if ext == 'm3u8':
+            formats = self._extract_m3u8_formats(
+                media_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                m3u8_id='hls')
+        elif ext == 'mp3':
+            formats = [{
+                'format_id': 'mp3',
+                'url': media_url,
+                'vcodec': 'none',
+            }]
+        else:
+            formats = [{
+                'url': media_url,
+            }]
+
+        description = self._og_search_description(webpage, default=None)
+        thumbnail = self._og_search_thumbnail(
+            webpage, default=None) or self._search_regex(
+            FIELD_RE % 'poster', webpage, 'thumbnail', fatal=False)
+        timestamp = unified_timestamp(self._html_search_meta(
+            'article:published_time', webpage, 'timestamp', fatal=False))
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
+            'formats': formats,
+        }
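The safari.py rewrite just below depends on a quirk called out in its own comments: the O'Reilly login endpoint sends two identical groot_sessionid cookies in one response, and the first one must win (hence the _apply_first_set_cookie_header helper mentioned in the 2019.05.20 ChangeLog). An illustrative Python reimplementation of that idea — a sketch, not the library's actual code:

def first_set_cookie(headers, cookie_name):
    # headers: list of (name, value) pairs, e.g. from
    # http.client.HTTPResponse.getheaders(). Returns the value of the
    # first Set-Cookie header for cookie_name, ignoring later duplicates.
    prefix = cookie_name + '='
    for name, value in headers:
        if name.lower() == 'set-cookie' and value.startswith(prefix):
            # 'sid=abc; Path=/; HttpOnly' -> 'abc'
            return value.split(';', 1)[0][len(prefix):]
    return None


headers = [
    ('Set-Cookie', 'groot_sessionid=first; Path=/'),
    ('Set-Cookie', 'groot_sessionid=second; Path=/'),
]
assert first_set_cookie(headers, 'groot_sessionid') == 'first'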
diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/safari.py youtube-dl-2019.05.20/youtube_dl/extractor/safari.py
--- youtube-dl-2016.02.22/youtube_dl/extractor/safari.py 2015-12-30 19:30:33.000000000 +0000
+++ youtube-dl-2019.05.20/youtube_dl/extractor/safari.py 2019-06-07 18:49:07.000000000 +0000
@@ -1,127 +1,217 @@
-# encoding: utf-8
+# coding: utf-8
 from __future__ import unicode_literals

+import json
 import re

 from .common import InfoExtractor
-from .brightcove import BrightcoveLegacyIE
+from ..compat import (
+    compat_parse_qs,
+    compat_str,
+    compat_urlparse,
+)
 from ..utils import (
     ExtractorError,
-    sanitized_Request,
-    smuggle_url,
-    std_headers,
-    urlencode_postdata,
+    update_url_query,
 )


 class SafariBaseIE(InfoExtractor):
-    _LOGIN_URL = 'https://www.safaribooksonline.com/accounts/login/'
-    _SUCCESSFUL_LOGIN_REGEX = r'<a href="/accounts/logout/"[^>]*>Sign Out</a>'
+    _LOGIN_URL = 'https://learning.oreilly.com/accounts/login/'
     _NETRC_MACHINE = 'safari'

-    _API_BASE = 'https://www.safaribooksonline.com/api/v1/book'
+    _API_BASE = 'https://learning.oreilly.com/api/v1'
     _API_FORMAT = 'json'

     LOGGED_IN = False

     def _real_initialize(self):
-        # We only need to log in once for courses or individual videos
-        if not self.LOGGED_IN:
-            self._login()
-            SafariBaseIE.LOGGED_IN = True
+        self._login()

     def _login(self):
-        (username, password) = self._get_login_info()
+        username, password = self._get_login_info()
         if username is None:
-            self.raise_login_required('safaribooksonline.com account is required')
+            return

-        headers = std_headers
-        if 'Referer' not in headers:
-            headers['Referer'] = self._LOGIN_URL
-
-        login_page = self._download_webpage(
-            self._LOGIN_URL, None,
-            'Downloading login form')
-
-        csrf = self._html_search_regex(
-            r"name='csrfmiddlewaretoken'\s+value='([^']+)'",
-            login_page, 'csrf token')
-
-        login_form = {
-            'csrfmiddlewaretoken': csrf,
-            'email': username,
-            'password1': password,
-            'login': 'Sign In',
-            'next': '',
-        }
+        _, urlh = self._download_webpage_handle(
+            'https://learning.oreilly.com/accounts/login-check/', None,
+            'Downloading login page')
+
+        def is_logged(urlh):
+            return 'learning.oreilly.com/home/' in compat_str(urlh.geturl())
+
+        if is_logged(urlh):
+            self.LOGGED_IN = True
+            return
+
+        redirect_url = compat_str(urlh.geturl())
+        parsed_url = compat_urlparse.urlparse(redirect_url)
+        qs = compat_parse_qs(parsed_url.query)
+        next_uri = compat_urlparse.urljoin(
+            'https://api.oreilly.com', qs['next'][0])
+
+        auth, urlh = self._download_json_handle(
+            'https://www.oreilly.com/member/auth/login/', None, 'Logging in',
+            data=json.dumps({
+                'email': username,
+                'password': password,
+                'redirect_uri': next_uri,
+            }).encode(), headers={
+                'Content-Type': 'application/json',
+                'Referer': redirect_url,
+            }, expected_status=400)
+
+        credentials = auth.get('credentials')
+        if (not auth.get('logged_in') and not auth.get('redirect_uri')
+                and credentials):
+            raise ExtractorError(
+                'Unable to login: %s' % credentials, expected=True)

-        request = sanitized_Request(
-            self._LOGIN_URL, urlencode_postdata(login_form), headers=headers)
-        login_page = self._download_webpage(
-            request, None, 'Logging in as %s' % username)
+        # oreilly serves two same groot_sessionid cookies in Set-Cookie header
+        # and expects first one to be actually set
+        self._apply_first_set_cookie_header(urlh, 'groot_sessionid')

-        if re.search(self._SUCCESSFUL_LOGIN_REGEX, login_page) is None:
-            raise ExtractorError(
-                'Login failed; make sure your credentials are correct and try again.',
-                expected=True)
+        _, urlh = self._download_webpage_handle(
+            auth.get('redirect_uri') or next_uri, None, 'Completing login',)
+
+        if is_logged(urlh):
+            self.LOGGED_IN = True
+            return

-        self.to_screen('Login successful')
+        raise ExtractorError('Unable to log in')


 class SafariIE(SafariBaseIE):
     IE_NAME = 'safari'
     IE_DESC = 'safaribooksonline.com online video'
-    _VALID_URL = r'''(?x)https?://
-                            (?:www\.)?safaribooksonline\.com/
-                                (?:
-                                    library/view/[^/]+|
-                                    api/v1/book
-                                )/
-                                (?P<course_id>[^/]+)/
-                                    (?:chapter(?:-content)?/)?
-                                (?P<part>part\d+)\.html
-    '''
+    _VALID_URL = r'''(?x)
+                        https?://
+                            (?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/
+                            (?:
+                                library/view/[^/]+/(?P<course_id>[^/]+)/(?P<part>[^/?\#&]+)\.html|
+                                videos/[^/]+/[^/]+/(?P<reference_id>[^-]+-[^/?\#&]+)
+                            )
+                    '''

     _TESTS = [{
         'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html',
-        'md5': '5b0c4cc1b3c1ba15dda7344085aa5592',
+        'md5': 'dcc5a425e79f2564148652616af1f2a3',
         'info_dict': {
-            'id': '2842601850001',
+            'id': '0_qbqx90ic',
             'ext': 'mp4',
-            'title': 'Introduction',
+            'title': 'Introduction to Hadoop Fundamentals LiveLessons',
+            'timestamp': 1437758058,
+            'upload_date': '20150724',
+            'uploader_id': 'stork',
         },
-        'skip': 'Requires safaribooksonline account credentials',
-    }, {
-        'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html',
-        'only_matching': True,
     }, {
         # non-digits in course id
         'url': 'https://www.safaribooksonline.com/library/view/create-a-nodejs/100000006A0210/part00.html',
         'only_matching': True,
+    }, {
+        'url': 'https://www.safaribooksonline.com/library/view/learning-path-red/9780134664057/RHCE_Introduction.html',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314/9780134217314-PYMC_13_00',
+        'only_matching': True,
+    }, {
+        'url': 'https://learning.oreilly.com/videos/hadoop-fundamentals-livelessons/9780133392838/9780133392838-00_SeriesIntro',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.oreilly.com/library/view/hadoop-fundamentals-livelessons/9780133392838/00_SeriesIntro.html',
+        'only_matching': True,
     }]

+    _PARTNER_ID = '1926081'
+    _UICONF_ID = '29375172'
+
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        course_id = mobj.group('course_id')
-        part = mobj.group('part')

-        webpage = self._download_webpage(
-            '%s/%s/chapter-content/%s.html' % (self._API_BASE, course_id, part),
-            part)
-
-        bc_url = BrightcoveLegacyIE._extract_brightcove_url(webpage)
-        if not bc_url:
-            raise ExtractorError('Could not extract Brightcove URL from %s' % url, expected=True)
+        reference_id = mobj.group('reference_id')
+        if reference_id:
+            video_id = reference_id
+            partner_id = self._PARTNER_ID
+            ui_id = self._UICONF_ID
+        else:
+            video_id = '%s-%s' % (mobj.group('course_id'), mobj.group('part'))
+
+            webpage, urlh = self._download_webpage_handle(url, video_id)
+
+            mobj = re.match(self._VALID_URL, urlh.geturl())
+            reference_id = mobj.group('reference_id')
+            if not reference_id:
+                reference_id = self._search_regex(
+                    r'data-reference-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
+                    webpage, 'kaltura reference id', group='id')
+            partner_id = self._search_regex(
+                r'data-partner-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
+                webpage, 'kaltura widget id', default=self._PARTNER_ID,
+                group='id')
+            ui_id = self._search_regex(
+                r'data-ui-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
+                webpage, 'kaltura uiconf id', default=self._UICONF_ID,
+                group='id')
+
+        query = {
+            'wid': '_%s' % partner_id,
+            'uiconf_id': ui_id,
+            'flashvars[referenceId]': reference_id,
+        }
+
+        if self.LOGGED_IN:
+            kaltura_session = self._download_json(
+                '%s/player/kaltura_session/?reference_id=%s' % (self._API_BASE, reference_id),
+                video_id, 'Downloading kaltura session JSON',
+                'Unable to download kaltura session JSON', fatal=False)
+            if kaltura_session:
+                session = kaltura_session.get('session')
+                if session:
+                    query['flashvars[ks]'] = session
+
+        return self.url_result(update_url_query(
+            'https://cdnapisec.kaltura.com/html5/html5lib/v2.37.1/mwEmbedFrame.php', query),
+            'Kaltura')
+
+
+class SafariApiIE(SafariBaseIE):
+    IE_NAME = 'safari:api'
+    _VALID_URL = r'https?://(?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/api/v1/book/(?P<course_id>[^/]+)/chapter(?:-content)?/(?P<part>[^/?#&]+)\.html'
+
+    _TESTS = [{
+        'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.safaribooksonline.com/api/v1/book/9780134664057/chapter/RHCE_Introduction.html',
+        'only_matching': True,
+    }]

-        return self.url_result(smuggle_url(bc_url, {'Referer': url}), 'BrightcoveLegacy')
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        part = self._download_json(
+            url, '%s/%s' % (mobj.group('course_id'), mobj.group('part')),
+            'Downloading part JSON')
+        return self.url_result(part['web_url'], SafariIE.ie_key())


 class SafariCourseIE(SafariBaseIE):
     IE_NAME = 'safari:course'
     IE_DESC = 'safaribooksonline.com online courses'

-    _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/(?:library/view/[^/]+|api/v1/book)/(?P<id>[^/]+)/?(?:[#?]|$)'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            (?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/
+                            (?:
+                                library/view/[^/]+|
+                                api/v1/book|
+                                videos/[^/]+
+                            )|
+                            techbus\.safaribooksonline\.com
+                        )
+                        /(?P<id>[^/]+)
+                    '''

     _TESTS = [{
         'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/',
@@ -134,13 +224,30 @@
     }, {
         'url': 'https://www.safaribooksonline.com/api/v1/book/9781449396459/?override_format=json',
         'only_matching': True,
+    }, {
+        'url': 'http://techbus.safaribooksonline.com/9780134426365',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314',
+        'only_matching': True,
+    }, {
+        'url': 'https://learning.oreilly.com/videos/hadoop-fundamentals-livelessons/9780133392838',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.oreilly.com/library/view/hadoop-fundamentals-livelessons/9780133392838/',
+        'only_matching': True,
     }]

+    @classmethod
+    def suitable(cls, url):
+        return (False if SafariIE.suitable(url) or SafariApiIE.suitable(url)
+                else super(SafariCourseIE, cls).suitable(url))
+
     def _real_extract(self, url):
         course_id = self._match_id(url)

         course_json = self._download_json(
-            '%s/%s/?override_format=%s' % (self._API_BASE, course_id, self._API_FORMAT),
+            '%s/book/%s/?override_format=%s' % (self._API_BASE, course_id, self._API_FORMAT),
            course_id, 'Downloading course JSON')

         if 'chapters' not in course_json:
@@ -148,7 +255,7 @@
                 'No chapters found for course %s' % course_id, expected=True)

         entries = [
-            self.url_result(chapter, 'Safari')
+            self.url_result(chapter, SafariApiIE.ie_key())
             for chapter in course_json['chapters']]

         course_title = course_json['title']
diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/sandia.py youtube-dl-2019.05.20/youtube_dl/extractor/sandia.py
--- youtube-dl-2016.02.22/youtube_dl/extractor/sandia.py 2015-12-30 19:30:33.000000000 +0000
+++ youtube-dl-2019.05.20/youtube_dl/extractor/sandia.py 1970-01-01 00:00:00.000000000 +0000
@@ -1,115 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import itertools
-import json
-import re
-
-from .common import InfoExtractor
-from ..compat import compat_urlparse
-from ..utils import (
-    int_or_none,
-    js_to_json,
-    mimetype2ext,
-    sanitized_Request,
-    unified_strdate,
-)
-
-
-class SandiaIE(InfoExtractor):
-    IE_DESC = 'Sandia National Laboratories'
-    _VALID_URL = r'https?://digitalops\.sandia\.gov/Mediasite/Play/(?P<id>[0-9a-f]+)'
-
_TEST = { - 'url': 'http://digitalops.sandia.gov/Mediasite/Play/24aace4429fc450fb5b38cdbf424a66e1d', - 'md5': '9422edc9b9a60151727e4b6d8bef393d', - 'info_dict': { - 'id': '24aace4429fc450fb5b38cdbf424a66e1d', - 'ext': 'mp4', - 'title': 'Xyce Software Training - Section 1', - 'description': 're:(?s)SAND Number: SAND 2013-7800.{200,}', - 'upload_date': '20120904', - 'duration': 7794, - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - req = sanitized_Request(url) - req.add_header('Cookie', 'MediasitePlayerCaps=ClientPlugins=4') - webpage = self._download_webpage(req, video_id) - - js_path = self._search_regex( - r'', webpage, 'embed vars'), - display_id) - - title = embed_vars['contentName'] - - formats = [] - bitrates = [] - for f in embed_vars.get('media', []): - if not f.get('uri') or f.get('mediaPurpose') != 'play': - continue - bitrate = int_or_none(f.get('bitRate')) - if bitrate: - bitrates.append(bitrate) - formats.append({ - 'url': f['uri'], - 'format_id': 'http-%d' % bitrate if bitrate else 'http', - 'width': int_or_none(f.get('width')), - 'height': int_or_none(f.get('height')), - 'tbr': bitrate, - 'format': 'mp4', - }) - - if not bitrates: - # When subscriptionLevel > 0, i.e. plus subscription is required - # media list will be empty. However, hds and hls uris are still - # available. We can grab them assuming bitrates to be default. - bitrates = self._DEFAULT_BITRATES - - auth_token = embed_vars.get('AuthToken') - - def construct_manifest_url(base_url, ext): - pieces = [base_url] - pieces.extend([compat_str(b) for b in bitrates]) - pieces.append('_kbps.mp4.%s?%s' % (ext, auth_token)) - return ','.join(pieces) - - if bitrates and auth_token: - hds_url = embed_vars.get('hdsUri') - if hds_url: - f4m_formats = self._extract_f4m_formats( - construct_manifest_url(hds_url, 'f4m'), - display_id, f4m_id='hds', fatal=False) - if len(f4m_formats) == len(bitrates): - for f, bitrate in zip(f4m_formats, bitrates): - if not f.get('tbr'): - f['format_id'] = 'hds-%d' % bitrate - f['tbr'] = bitrate - # TODO: fix f4m downloader to handle manifests without bitrates if possible - # formats.extend(f4m_formats) - - hls_url = embed_vars.get('hlsUri') - if hls_url: - formats.extend(self._extract_m3u8_formats( - construct_manifest_url(hls_url, 'm3u8'), - display_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) - self._sort_formats(formats) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'thumbnail': embed_vars.get('thumbUri'), - 'duration': int_or_none(embed_vars.get('videoLengthInSeconds')) or None, - 'age_limit': parse_age_limit(embed_vars.get('audienceRating')), - 'tags': embed_vars.get('tags', '').split(','), - 'formats': formats, - } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/screenwavemedia.py youtube-dl-2019.05.20/youtube_dl/extractor/screenwavemedia.py --- youtube-dl-2016.02.22/youtube_dl/extractor/screenwavemedia.py 2016-02-14 22:26:17.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/screenwavemedia.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,144 +0,0 @@ -# encoding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - unified_strdate, - js_to_json, -) - - -class ScreenwaveMediaIE(InfoExtractor): - _VALID_URL = r'https?://player\d?\.screenwavemedia\.com/(?:play/)?[a-zA-Z]+\.php\?.*\bid=(?P[A-Za-z0-9-]+)' - EMBED_PATTERN = 
r'src=(["\'])(?P(?:https?:)?//player\d?\.screenwavemedia\.com/(?:play/)?[a-zA-Z]+\.php\?.*\bid=.+?)\1' - _TESTS = [{ - 'url': 'http://player.screenwavemedia.com/play/play.php?playerdiv=videoarea&companiondiv=squareAd&id=Cinemassacre-19911', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - playerdata = self._download_webpage( - 'http://player.screenwavemedia.com/player.php?id=%s' % video_id, - video_id, 'Downloading player webpage') - - vidtitle = self._search_regex( - r'\'vidtitle\'\s*:\s*"([^"]+)"', playerdata, 'vidtitle').replace('\\/', '/') - - playerconfig = self._download_webpage( - 'http://player.screenwavemedia.com/player.js', - video_id, 'Downloading playerconfig webpage') - - videoserver = self._search_regex(r'SWMServer\s*=\s*"([\d\.]+)"', playerdata, 'videoserver') - - sources = self._parse_json( - js_to_json( - re.sub( - r'(?s)/\*.*?\*/', '', - self._search_regex( - r'sources\s*:\s*(\[[^\]]+?\])', playerconfig, - 'sources', - ).replace( - "' + thisObj.options.videoserver + '", - videoserver - ).replace( - "' + playerVidId + '", - video_id - ) - ) - ), - video_id, fatal=False - ) - - # Fallback to hardcoded sources if JS changes again - if not sources: - self.report_warning('Falling back to a hardcoded list of streams') - sources = [{ - 'file': 'http://%s/vod/%s_%s.mp4' % (videoserver, video_id, format_id), - 'type': 'mp4', - 'label': format_label, - } for format_id, format_label in ( - ('low', '144p Low'), ('med', '160p Med'), ('high', '360p High'), ('hd1', '720p HD1'))] - sources.append({ - 'file': 'http://%s/vod/smil:%s.smil/playlist.m3u8' % (videoserver, video_id), - 'type': 'hls', - }) - - formats = [] - for source in sources: - if source['type'] == 'hls': - formats.extend(self._extract_m3u8_formats(source['file'], video_id, ext='mp4')) - else: - file_ = source.get('file') - if not file_: - continue - format_label = source.get('label') - format_id = self._search_regex( - r'_(.+?)\.[^.]+$', file_, 'format id', default=None) - height = int_or_none(self._search_regex( - r'^(\d+)[pP]', format_label, 'height', default=None)) - formats.append({ - 'url': source['file'], - 'format_id': format_id, - 'format': format_label, - 'ext': source.get('type'), - 'height': height, - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': vidtitle, - 'formats': formats, - } - - -class TeamFourIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?teamfourstar\.com/video/(?P[a-z0-9\-]+)/?' - _TEST = { - 'url': 'http://teamfourstar.com/video/a-moment-with-tfs-episode-4/', - 'info_dict': { - 'id': 'TeamFourStar-5292a02f20bfa', - 'ext': 'mp4', - 'upload_date': '20130401', - 'description': 'Check out this and more on our website: http://teamfourstar.com\nTFS Store: http://sharkrobot.com/team-four-star\nFollow on Twitter: http://twitter.com/teamfourstar\nLike on FB: http://facebook.com/teamfourstar', - 'title': 'A Moment With TFS Episode 4', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - playerdata_url = self._search_regex( - r'src="(http://player\d?\.screenwavemedia\.com/(?:play/)?[a-zA-Z]+\.php\?[^"]*\bid=.+?)"', - webpage, 'player data URL') - - video_title = self._html_search_regex( - r'
(?P.+?)</div>', - webpage, 'title') - video_date = unified_strdate(self._html_search_regex( - r'<div class="heroheadingdate">(?P<date>.+?)</div>', - webpage, 'date', fatal=False)) - video_description = self._html_search_regex( - r'(?s)<div class="postcontent">(?P<description>.+?)</div>', - webpage, 'description', fatal=False) - video_thumbnail = self._og_search_thumbnail(webpage) - - return { - '_type': 'url_transparent', - 'display_id': display_id, - 'title': video_title, - 'description': video_description, - 'upload_date': video_date, - 'thumbnail': video_thumbnail, - 'url': playerdata_url, - } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/scrippsnetworks.py youtube-dl-2019.05.20/youtube_dl/extractor/scrippsnetworks.py --- youtube-dl-2016.02.22/youtube_dl/extractor/scrippsnetworks.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/scrippsnetworks.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,104 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import hashlib +import re + +from .aws import AWSIE +from .anvato import AnvatoIE +from ..utils import ( + smuggle_url, + urlencode_postdata, + xpath_text, +) + + +class ScrippsNetworksWatchIE(AWSIE): + IE_NAME = 'scrippsnetworks:watch' + _VALID_URL = r'''(?x) + https?:// + watch\. + (?P<site>geniuskitchen)\.com/ + (?: + player\.[A-Z0-9]+\.html\#| + show/(?:[^/]+/){2}| + player/ + ) + (?P<id>\d+) + ''' + _TESTS = [{ + 'url': 'http://watch.geniuskitchen.com/player/3787617/Ample-Hills-Ice-Cream-Bike/', + 'info_dict': { + 'id': '4194875', + 'ext': 'mp4', + 'title': 'Ample Hills Ice Cream Bike', + 'description': 'Courtney Rada churns up a signature GK Now ice cream with The Scoopmaster.', + 'uploader': 'ANV', + 'upload_date': '20171011', + 'timestamp': 1507698000, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [AnvatoIE.ie_key()], + }] + + _SNI_TABLE = { + 'geniuskitchen': 'genius', + } + + _AWS_API_KEY = 'E7wSQmq0qK6xPrF13WmzKiHo4BQ7tip4pQcSXVl1' + _AWS_PROXY_HOST = 'web.api.video.snidigital.com' + + _AWS_USER_AGENT = 'aws-sdk-js/2.80.0 callback' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + site_id, video_id = mobj.group('site', 'id') + + aws_identity_id_json = json.dumps({ + 'IdentityId': '%s:7655847c-0ae7-4d9b-80d6-56c062927eb3' % self._AWS_REGION + }).encode('utf-8') + token = self._download_json( + 'https://cognito-identity.%s.amazonaws.com/' % self._AWS_REGION, video_id, + data=aws_identity_id_json, + headers={ + 'Accept': '*/*', + 'Content-Type': 'application/x-amz-json-1.1', + 'Referer': url, + 'X-Amz-Content-Sha256': hashlib.sha256(aws_identity_id_json).hexdigest(), + 'X-Amz-Target': 'AWSCognitoIdentityService.GetOpenIdToken', + 'X-Amz-User-Agent': self._AWS_USER_AGENT, + })['Token'] + + sts = self._download_xml( + 'https://sts.amazonaws.com/', video_id, data=urlencode_postdata({ + 'Action': 'AssumeRoleWithWebIdentity', + 'RoleArn': 'arn:aws:iam::710330595350:role/Cognito_WebAPIUnauth_Role', + 'RoleSessionName': 'web-identity', + 'Version': '2011-06-15', + 'WebIdentityToken': token, + }), headers={ + 'Referer': url, + 'X-Amz-User-Agent': self._AWS_USER_AGENT, + 'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8', + }) + + def get(key): + return xpath_text( + sts, './/{https://sts.amazonaws.com/doc/2011-06-15/}%s' % key, + fatal=True) + + mcp_id = self._aws_execute_api({ + 'uri': '/1/web/brands/%s/episodes/scrid/%s' % (self._SNI_TABLE[site_id], video_id), + 'access_key': get('AccessKeyId'), + 'secret_key': 
get('SecretAccessKey'), + 'session_token': get('SessionToken'), + }, video_id)['results'][0]['mcpId'] + + return self.url_result( + smuggle_url( + 'anvato:anvato_scripps_app_web_prod_0837996dbe373629133857ae9eb72e740424d80a:%s' % mcp_id, + {'geo_countries': ['US']}), + AnvatoIE.ie_key(), video_id=mcp_id) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/seeker.py youtube-dl-2019.05.20/youtube_dl/extractor/seeker.py --- youtube-dl-2016.02.22/youtube_dl/extractor/seeker.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/seeker.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,57 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class SeekerIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?seeker\.com/(?P<display_id>.*)-(?P<article_id>\d+)\.html' + _TESTS = [{ + # player.loadRevision3Item + 'url': 'http://www.seeker.com/should-trump-be-required-to-release-his-tax-returns-1833805621.html', + 'md5': '30c1dc4030cc715cf05b423d0947ac18', + 'info_dict': { + 'id': '76243', + 'ext': 'webm', + 'title': 'Should Trump Be Required To Release His Tax Returns?', + 'description': 'Donald Trump has been secretive about his "big," "beautiful" tax returns. So what can we learn if he decides to release them?', + 'uploader': 'Seeker Daily', + 'uploader_id': 'seekerdaily', + } + }, { + 'url': 'http://www.seeker.com/changes-expected-at-zoos-following-recent-gorilla-lion-shootings-1834116536.html', + 'playlist': [ + { + 'md5': '83bcd157cab89ad7318dd7b8c9cf1306', + 'info_dict': { + 'id': '67558', + 'ext': 'mp4', + 'title': 'The Pros & Cons Of Zoos', + 'description': 'Zoos are often depicted as a terrible place for animals to live, but is there any truth to this?', + 'uploader': 'DNews', + 'uploader_id': 'dnews', + }, + } + ], + 'info_dict': { + 'id': '1834116536', + 'title': 'After Gorilla Killing, Changes Ahead for Zoos', + 'description': 'The largest association of zoos and others are hoping to learn from recent incidents that led to the shooting deaths of a gorilla and two lions.', + }, + }] + + def _real_extract(self, url): + display_id, article_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, display_id) + mobj = re.search(r"player\.loadRevision3Item\('([^']+)'\s*,\s*(\d+)\);", webpage) + if mobj: + playlist_type, playlist_id = mobj.groups() + return self.url_result( + 'revision3:%s:%s' % (playlist_type, playlist_id), 'Revision3Embed', playlist_id) + else: + entries = [self.url_result('revision3:video_id:%s' % video_id, 'Revision3Embed', video_id) for video_id in re.findall( + r'<iframe[^>]+src=[\'"](?:https?:)?//api\.seekernetwork\.com/player/embed\?videoId=(\d+)', webpage)] + return self.playlist_result( + entries, article_id, self._og_search_title(webpage), self._og_search_description(webpage)) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/senateisvp.py youtube-dl-2019.05.20/youtube_dl/extractor/senateisvp.py --- youtube-dl-2016.02.22/youtube_dl/extractor/senateisvp.py 2016-02-14 22:26:17.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/senateisvp.py 2019-06-07 18:49:07.000000000 +0000 @@ -48,14 +48,14 @@ ['arch', '', 'http://ussenate-f.akamaihd.net/'] ] _IE_NAME = 'senate.gov' - _VALID_URL = r'http://www\.senate\.gov/isvp/?\?(?P<qs>.+)' + _VALID_URL = r'https?://(?:www\.)?senate\.gov/isvp/?\?(?P<qs>.+)' _TESTS = [{ 'url': 
'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png', 'info_dict': { 'id': 'judiciary031715', 'ext': 'mp4', 'title': 'Integrated Senate Video Player', - 'thumbnail': 're:^https?://.*\.(?:jpg|png)$', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', }, 'params': { # m3u8 download @@ -89,7 +89,7 @@ @staticmethod def _search_iframe_url(webpage): mobj = re.search( - r"<iframe[^>]+src=['\"](?P<url>http://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]", + r"<iframe[^>]+src=['\"](?P<url>https?://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]", webpage) if mobj: return mobj.group('url') diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/sendtonews.py youtube-dl-2019.05.20/youtube_dl/extractor/sendtonews.py --- youtube-dl-2016.02.22/youtube_dl/extractor/sendtonews.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/sendtonews.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,105 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + float_or_none, + parse_iso8601, + update_url_query, + int_or_none, + determine_protocol, + unescapeHTML, +) + + +class SendtoNewsIE(InfoExtractor): + _VALID_URL = r'https?://embed\.sendtonews\.com/player2/embedplayer\.php\?.*\bSC=(?P<id>[0-9A-Za-z-]+)' + + _TEST = { + # From http://cleveland.cbslocal.com/2016/05/16/indians-score-season-high-15-runs-in-blowout-win-over-reds-rapid-reaction/ + 'url': 'http://embed.sendtonews.com/player2/embedplayer.php?SC=GxfCe0Zo7D-175909-5588&type=single&autoplay=on&sound=YES', + 'info_dict': { + 'id': 'GxfCe0Zo7D-175909-5588' + }, + 'playlist_count': 8, + # test the first video only to prevent lengthy tests + 'playlist': [{ + 'info_dict': { + 'id': '240385', + 'ext': 'mp4', + 'title': 'Indians introduce Encarnacion', + 'description': 'Indians president of baseball operations Chris Antonetti and Edwin Encarnacion discuss the slugger\'s three-year contract with Cleveland', + 'duration': 137.898, + 'thumbnail': r're:https?://.*\.jpg$', + 'upload_date': '20170105', + 'timestamp': 1483649762, + }, + }], + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + _URL_TEMPLATE = '//embed.sendtonews.com/player2/embedplayer.php?SC=%s' + + @classmethod + def _extract_url(cls, webpage): + mobj = re.search(r'''(?x)<script[^>]+src=([\'"]) + (?:https?:)?//embed\.sendtonews\.com/player/responsiveembed\.php\? 
+ .*\bSC=(?P<SC>[0-9a-zA-Z-]+).* + \1>''', webpage) + if mobj: + sc = mobj.group('SC') + return cls._URL_TEMPLATE % sc + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + data_url = update_url_query( + url.replace('embedplayer.php', 'data_read.php'), + {'cmd': 'loadInitial'}) + playlist_data = self._download_json(data_url, playlist_id) + + entries = [] + for video in playlist_data['playlistData'][0]: + info_dict = self._parse_jwplayer_data( + video['jwconfiguration'], + require_title=False, m3u8_id='hls', rtmp_params={'no_resume': True}) + + for f in info_dict['formats']: + if f.get('tbr'): + continue + tbr = int_or_none(self._search_regex( + r'/(\d+)k/', f['url'], 'bitrate', default=None)) + if not tbr: + continue + f.update({ + 'format_id': '%s-%d' % (determine_protocol(f), tbr), + 'tbr': tbr, + }) + self._sort_formats(info_dict['formats'], ('tbr', 'height', 'width', 'format_id')) + + thumbnails = [] + if video.get('thumbnailUrl'): + thumbnails.append({ + 'id': 'normal', + 'url': video['thumbnailUrl'], + }) + if video.get('smThumbnailUrl'): + thumbnails.append({ + 'id': 'small', + 'url': video['smThumbnailUrl'], + }) + info_dict.update({ + 'title': video['S_headLine'].strip(), + 'description': unescapeHTML(video.get('S_fullStory')), + 'thumbnails': thumbnails, + 'duration': float_or_none(video.get('SM_length')), + 'timestamp': parse_iso8601(video.get('S_sysDate'), delimiter=' '), + }) + entries.append(info_dict) + + return self.playlist_result(entries, playlist_id) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/servus.py youtube-dl-2019.05.20/youtube_dl/extractor/servus.py --- youtube-dl-2016.02.22/youtube_dl/extractor/servus.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/servus.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,56 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class ServusIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)/(?P<id>[aA]{2}-\w+|\d+-\d+)' + _TESTS = [{ + 'url': 'https://www.servus.com/de/p/Die-Gr%C3%BCnen-aus-Sicht-des-Volkes/AA-1T6VBU5PW1W12/', + 'md5': '3e1dd16775aa8d5cbef23628cfffc1f4', + 'info_dict': { + 'id': 'AA-1T6VBU5PW1W12', + 'ext': 'mp4', + 'title': 'Die Grünen aus Sicht des Volkes', + 'description': 'md5:1247204d85783afe3682644398ff2ec4', + 'thumbnail': r're:^https?://.*\.jpg', + } + }, { + 'url': 'https://www.servus.com/at/p/Wie-das-Leben-beginnt/1309984137314-381415152/', + 'only_matching': True, + }, { + 'url': 'https://www.servus.com/tv/videos/aa-1t6vbu5pw1w12/', + 'only_matching': True, + }, { + 'url': 'https://www.servus.com/tv/videos/1380889096408-1235196658/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url).upper() + webpage = self._download_webpage(url, video_id) + + title = self._search_regex( + (r'videoLabel\s*=\s*(["\'])(?P<title>(?:(?!\1).)+)\1', + r'<h\d+[^>]+\bclass=["\']heading--(?:one|two)["\'][^>]*>(?P<title>[^<]+)'), + webpage, 'title', default=None, + group='title') or self._og_search_title(webpage) + title = re.sub(r'\s*-\s*Servus TV\s*$', '', title) + description = self._og_search_description(webpage) + thumbnail = self._og_search_thumbnail(webpage) + + formats = self._extract_m3u8_formats( + 'https://stv.rbmbtnx.net/api/v1/manifests/%s.m3u8' % video_id, + video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 
'description': description, + 'thumbnail': thumbnail, + 'formats': formats, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/sevenplus.py youtube-dl-2019.05.20/youtube_dl/extractor/sevenplus.py --- youtube-dl-2016.02.22/youtube_dl/extractor/sevenplus.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/sevenplus.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,84 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .brightcove import BrightcoveNewIE +from ..compat import compat_str +from ..utils import ( + try_get, + update_url_query, +) + + +class SevenPlusIE(BrightcoveNewIE): + IE_NAME = '7plus' + _VALID_URL = r'https?://(?:www\.)?7plus\.com\.au/(?P<path>[^?]+\?.*?\bepisode-id=(?P<id>[^&#]+))' + _TESTS = [{ + 'url': 'https://7plus.com.au/MTYS?episode-id=MTYS7-003', + 'info_dict': { + 'id': 'MTYS7-003', + 'ext': 'mp4', + 'title': 'S7 E3 - Wind Surf', + 'description': 'md5:29c6a69f21accda7601278f81b46483d', + 'uploader_id': '5303576322001', + 'upload_date': '20171201', + 'timestamp': 1512106377, + 'series': 'Mighty Ships', + 'season_number': 7, + 'episode_number': 3, + 'episode': 'Wind Surf', + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + } + }, { + 'url': 'https://7plus.com.au/UUUU?episode-id=AUMS43-001', + 'only_matching': True, + }] + + def _real_extract(self, url): + path, episode_id = re.match(self._VALID_URL, url).groups() + + media = self._download_json( + 'https://videoservice.swm.digital/playback', episode_id, query={ + 'appId': '7plus', + 'deviceType': 'web', + 'platformType': 'web', + 'accountId': 5303576322001, + 'referenceId': 'ref:' + episode_id, + 'deliveryId': 'csai', + 'videoType': 'vod', + })['media'] + + for source in media.get('sources', {}): + src = source.get('src') + if not src: + continue + source['src'] = update_url_query(src, {'rule': ''}) + + info = self._parse_brightcove_metadata(media, episode_id) + + content = self._download_json( + 'https://component-cdn.swm.digital/content/' + path, + episode_id, headers={ + 'market-id': 4, + }, fatal=False) or {} + for item in content.get('items', {}): + if item.get('componentData', {}).get('componentType') == 'infoPanel': + for src_key, dst_key in [('title', 'title'), ('shortSynopsis', 'description')]: + value = item.get(src_key) + if value: + info[dst_key] = value + info['series'] = try_get( + item, lambda x: x['seriesLogo']['name'], compat_str) + mobj = re.search(r'^S(\d+)\s+E(\d+)\s+-\s+(.+)$', info['title']) + if mobj: + info.update({ + 'season_number': int(mobj.group(1)), + 'episode_number': int(mobj.group(2)), + 'episode': mobj.group(3), + }) + + return info diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/sexu.py youtube-dl-2019.05.20/youtube_dl/extractor/sexu.py --- youtube-dl-2016.02.22/youtube_dl/extractor/sexu.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/sexu.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,7 +1,5 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor @@ -14,9 +12,9 @@ 'id': '961791', 'ext': 'mp4', 'title': 'md5:4d05a19a5fc049a63dbbaf05fb71d91b', - 'description': 'md5:c5ed8625eb386855d5a7967bd7b77a54', + 'description': 'md5:2b75327061310a3afb3fbd7d09e2e403', 'categories': list, # NSFW - 'thumbnail': 're:https?://.*\.jpg$', + 'thumbnail': r're:https?://.*\.jpg$', 'age_limit': 18, } } @@ -25,13 +23,19 @@ video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - quality_arr = self._search_regex( - 
r'sources:\s*\[([^\]]+)\]', webpage, 'forrmat string') + jwvideo = self._parse_json( + self._search_regex(r'\.setup\(\s*({.+?})\s*\);', webpage, 'jwvideo'), + video_id) + + sources = jwvideo['sources'] + formats = [{ - 'url': fmt[0].replace('\\', ''), - 'format_id': fmt[1], - 'height': int(fmt[1][:3]), - } for fmt in re.findall(r'"file":"([^"]+)","label":"([^"]+)"', quality_arr)] + 'url': source['file'].replace('\\', ''), + 'format_id': source.get('label'), + 'height': int(self._search_regex( + r'^(\d+)[pP]', source.get('label', ''), 'height', + default=None)), + } for source in sources if source.get('file')] self._sort_formats(formats) title = self._html_search_regex( @@ -40,9 +44,7 @@ description = self._html_search_meta( 'description', webpage, 'description') - thumbnail = self._html_search_regex( - r'image:\s*"([^"]+)"', - webpage, 'thumbnail', fatal=False) + thumbnail = jwvideo.get('image') categories_str = self._html_search_meta( 'keywords', webpage, 'categories') diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/sexykarma.py youtube-dl-2019.05.20/youtube_dl/extractor/sexykarma.py --- youtube-dl-2016.02.22/youtube_dl/extractor/sexykarma.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/sexykarma.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,121 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - unified_strdate, - parse_duration, - int_or_none, -) - - -class SexyKarmaIE(InfoExtractor): - IE_DESC = 'Sexy Karma and Watch Indian Porn' - _VALID_URL = r'https?://(?:www\.)?(?:sexykarma\.com|watchindianporn\.net)/(?:[^/]+/)*video/(?P<display_id>[^/]+)-(?P<id>[a-zA-Z0-9]+)\.html' - _TESTS = [{ - 'url': 'http://www.sexykarma.com/gonewild/video/taking-a-quick-pee-yHI70cOyIHt.html', - 'md5': 'b9798e7d1ef1765116a8f516c8091dbd', - 'info_dict': { - 'id': 'yHI70cOyIHt', - 'display_id': 'taking-a-quick-pee', - 'ext': 'mp4', - 'title': 'Taking a quick pee.', - 'thumbnail': 're:^https?://.*\.jpg$', - 'uploader': 'wildginger7', - 'upload_date': '20141008', - 'duration': 22, - 'view_count': int, - 'comment_count': int, - 'categories': list, - 'age_limit': 18, - } - }, { - 'url': 'http://www.sexykarma.com/gonewild/video/pot-pixie-tribute-8Id6EZPbuHf.html', - 'md5': 'dd216c68d29b49b12842b9babe762a5d', - 'info_dict': { - 'id': '8Id6EZPbuHf', - 'display_id': 'pot-pixie-tribute', - 'ext': 'mp4', - 'title': 'pot_pixie tribute', - 'thumbnail': 're:^https?://.*\.jpg$', - 'uploader': 'banffite', - 'upload_date': '20141013', - 'duration': 16, - 'view_count': int, - 'comment_count': int, - 'categories': list, - 'age_limit': 18, - } - }, { - 'url': 'http://www.watchindianporn.net/video/desi-dancer-namrata-stripping-completely-nude-and-dancing-on-a-hot-number-dW2mtctxJfs.html', - 'md5': '9afb80675550406ed9a63ac2819ef69d', - 'info_dict': { - 'id': 'dW2mtctxJfs', - 'display_id': 'desi-dancer-namrata-stripping-completely-nude-and-dancing-on-a-hot-number', - 'ext': 'mp4', - 'title': 'Desi dancer namrata stripping completely nude and dancing on a hot number', - 'thumbnail': 're:^https?://.*\.jpg$', - 'uploader': 'Don', - 'upload_date': '20140213', - 'duration': 83, - 'view_count': int, - 'comment_count': int, - 'categories': list, - 'age_limit': 18, - } - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') - - webpage = self._download_webpage(url, display_id) - - video_url = self._html_search_regex( - 
r"url: escape\('([^']+)'\)", webpage, 'url') - - title = self._html_search_regex( - r'<h2 class="he2"><span>(.*?)</span>', - webpage, 'title') - thumbnail = self._html_search_regex( - r'<span id="container"><img\s+src="([^"]+)"', - webpage, 'thumbnail', fatal=False) - - uploader = self._html_search_regex( - r'class="aupa">\s*(.*?)</a>', - webpage, 'uploader') - upload_date = unified_strdate(self._html_search_regex( - r'Added: <strong>(.+?)</strong>', webpage, 'upload date', fatal=False)) - - duration = parse_duration(self._search_regex( - r'<td>Time:\s*</td>\s*<td align="right"><span>\s*(.+?)\s*</span>', - webpage, 'duration', fatal=False)) - - view_count = int_or_none(self._search_regex( - r'<td>Views:\s*</td>\s*<td align="right"><span>\s*(\d+)\s*</span>', - webpage, 'view count', fatal=False)) - comment_count = int_or_none(self._search_regex( - r'<td>Comments:\s*</td>\s*<td align="right"><span>\s*(\d+)\s*</span>', - webpage, 'comment count', fatal=False)) - - categories = re.findall( - r'<a href="[^"]+/search/video/desi"><span>([^<]+)</span></a>', - webpage) - - return { - 'id': video_id, - 'display_id': display_id, - 'url': video_url, - 'title': title, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'upload_date': upload_date, - 'duration': duration, - 'view_count': view_count, - 'comment_count': comment_count, - 'categories': categories, - 'age_limit': 18, - } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/seznamzpravy.py youtube-dl-2019.05.20/youtube_dl/extractor/seznamzpravy.py --- youtube-dl-2016.02.22/youtube_dl/extractor/seznamzpravy.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/seznamzpravy.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,169 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_str, + compat_urllib_parse_urlparse, +) +from ..utils import ( + urljoin, + int_or_none, + parse_codecs, + try_get, +) + + +def _raw_id(src_url): + return compat_urllib_parse_urlparse(src_url).path.split('/')[-1] + + +class SeznamZpravyIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?seznamzpravy\.cz/iframe/player\?.*\bsrc=' + _TESTS = [{ + 'url': 'https://www.seznamzpravy.cz/iframe/player?duration=241&serviceSlug=zpravy&src=https%3A%2F%2Fv39-a.sdn.szn.cz%2Fv_39%2Fvmd%2F5999c902ea707c67d8e267a9%3Ffl%3Dmdk%2C432f65a0%7C&itemType=video&autoPlay=false&title=Sv%C4%9Bt%20bez%20obalu%3A%20%C4%8Ce%C5%A1t%C3%AD%20voj%C3%A1ci%20na%20mis%C3%ADch%20(kr%C3%A1tk%C3%A1%20verze)&series=Sv%C4%9Bt%20bez%20obalu&serviceName=Seznam%20Zpr%C3%A1vy&poster=%2F%2Fd39-a.sdn.szn.cz%2Fd_39%2Fc_img_F_I%2FR5puJ.jpeg%3Ffl%3Dcro%2C0%2C0%2C1920%2C1080%7Cres%2C1200%2C%2C1%7Cjpg%2C80%2C%2C1&width=1920&height=1080&cutFrom=0&cutTo=0&splVersion=VOD&contentId=170889&contextId=35990&showAdvert=true&collocation=&autoplayPossible=true&embed=&isVideoTooShortForPreroll=false&isVideoTooLongForPostroll=true&videoCommentOpKey=&videoCommentId=&version=4.0.76&dotService=zpravy&gemiusPrismIdentifier=bVc1ZIb_Qax4W2v5xOPGpMeCP31kFfrTzj0SqPTLh_b.Z7&zoneIdPreroll=seznam.pack.videospot&skipOffsetPreroll=5§ionPrefixPreroll=%2Fzpravy', + 'info_dict': { + 'id': '170889', + 'ext': 'mp4', + 'title': 'Svět bez obalu: Čeští vojáci na misích (krátká verze)', + 'thumbnail': r're:^https?://.*\.jpe?g', + 'duration': 241, + 'series': 'Svět bez obalu', + }, + 'params': { + 'skip_download': True, + }, + }, { + # with Location key + 'url': 
'https://www.seznamzpravy.cz/iframe/player?duration=null&serviceSlug=zpravy&src=https%3A%2F%2Flive-a.sdn.szn.cz%2Fv_39%2F59e468fe454f8472a96af9fa%3Ffl%3Dmdk%2C5c1e2840%7C&itemType=livevod&autoPlay=false&title=P%C5%99edseda%20KDU-%C4%8CSL%20Pavel%20B%C4%9Blobr%C3%A1dek%20ve%20volebn%C3%AD%20V%C3%BDzv%C4%9B%20Seznamu&series=V%C3%BDzva&serviceName=Seznam%20Zpr%C3%A1vy&poster=%2F%2Fd39-a.sdn.szn.cz%2Fd_39%2Fc_img_G_J%2FjTBCs.jpeg%3Ffl%3Dcro%2C0%2C0%2C1280%2C720%7Cres%2C1200%2C%2C1%7Cjpg%2C80%2C%2C1&width=16&height=9&cutFrom=0&cutTo=0&splVersion=VOD&contentId=185688&contextId=38489&showAdvert=true&collocation=&hideFullScreen=false&hideSubtitles=false&embed=&isVideoTooShortForPreroll=false&isVideoTooShortForPreroll2=false&isVideoTooLongForPostroll=false&fakePostrollZoneID=seznam.clanky.zpravy.preroll&fakePrerollZoneID=seznam.clanky.zpravy.preroll&videoCommentId=&trim=default_16x9&noPrerollVideoLength=30&noPreroll2VideoLength=undefined&noMidrollVideoLength=0&noPostrollVideoLength=999999&autoplayPossible=true&version=5.0.41&dotService=zpravy&gemiusPrismIdentifier=zD3g7byfW5ekpXmxTVLaq5Srjw5i4hsYo0HY1aBwIe..27&zoneIdPreroll=seznam.pack.videospot&skipOffsetPreroll=5&sectionPrefixPreroll=%2Fzpravy%2Fvyzva&zoneIdPostroll=seznam.pack.videospot&skipOffsetPostroll=5&sectionPrefixPostroll=%2Fzpravy%2Fvyzva&regression=false', + 'info_dict': { + 'id': '185688', + 'ext': 'mp4', + 'title': 'Předseda KDU-ČSL Pavel Bělobrádek ve volební Výzvě Seznamu', + 'thumbnail': r're:^https?://.*\.jpe?g', + 'series': 'Výzva', + }, + 'params': { + 'skip_download': True, + }, + }] + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') for mobj in re.finditer( + r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?seznamzpravy\.cz/iframe/player\?.*?)\1', + webpage)] + + def _extract_sdn_formats(self, sdn_url, video_id): + sdn_data = self._download_json(sdn_url, video_id) + + if sdn_data.get('Location'): + sdn_url = sdn_data['Location'] + sdn_data = self._download_json(sdn_url, video_id) + + formats = [] + mp4_formats = try_get(sdn_data, lambda x: x['data']['mp4'], dict) or {} + for format_id, format_data in mp4_formats.items(): + relative_url = format_data.get('url') + if not relative_url: + continue + + try: + width, height = format_data.get('resolution') + except (TypeError, ValueError): + width, height = None, None + + f = { + 'url': urljoin(sdn_url, relative_url), + 'format_id': 'http-%s' % format_id, + 'tbr': int_or_none(format_data.get('bandwidth'), scale=1000), + 'width': int_or_none(width), + 'height': int_or_none(height), + } + f.update(parse_codecs(format_data.get('codec'))) + formats.append(f) + + pls = sdn_data.get('pls', {}) + + def get_url(format_id): + return try_get(pls, lambda x: x[format_id]['url'], compat_str) + + dash_rel_url = get_url('dash') + if dash_rel_url: + formats.extend(self._extract_mpd_formats( + urljoin(sdn_url, dash_rel_url), video_id, mpd_id='dash', + fatal=False)) + + hls_rel_url = get_url('hls') + if hls_rel_url: + formats.extend(self._extract_m3u8_formats( + urljoin(sdn_url, hls_rel_url), video_id, ext='mp4', + m3u8_id='hls', fatal=False)) + + self._sort_formats(formats) + return formats + + def _real_extract(self, url): + params = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + + src = params['src'][0] + title = params['title'][0] + video_id = params.get('contentId', [_raw_id(src)])[0] + formats = self._extract_sdn_formats(src + 'spl2,2,VOD', video_id) + + duration = int_or_none(params.get('duration', [None])[0]) + series = params.get('series', [None])[0] +
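# ----- annotation (editor's note, not part of the upstream patch) -----
# The SeznamZpravy extractor above takes nearly all of its metadata from
# the iframe player URL itself: _real_extract() parses the query string,
# indexes 'src' and 'title' directly (both required), and treats the
# rest as optional. A minimal standalone sketch of the same technique
# using only the Python 3 standard library; read_player_params is an
# illustrative name, not a youtube-dl API:
from urllib.parse import parse_qs, urlparse

def read_player_params(iframe_url):
    # parse_qs() returns {name: [value, ...]}; mirror the extractor by
    # requiring 'src'/'title' and defaulting everything else to None.
    params = parse_qs(urlparse(iframe_url).query)

    def first(name):
        return params.get(name, [None])[0]

    return {
        'src': params['src'][0],
        'title': params['title'][0],
        'id': first('contentId'),
        'duration': first('duration'),
        'series': first('series'),
        'thumbnail': first('poster'),
    }
# ----------------------------------------------------------------------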
thumbnail = params.get('poster', [None])[0] + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'series': series, + 'formats': formats, + } + + +class SeznamZpravyArticleIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?:seznam\.cz/zpravy|seznamzpravy\.cz)/clanek/(?:[^/?#&]+)-(?P<id>\d+)' + _API_URL = 'https://apizpravy.seznam.cz/' + + _TESTS = [{ + # two videos on one page, with SDN URL + 'url': 'https://www.seznamzpravy.cz/clanek/jejich-svet-na-nas-utoci-je-lepsi-branit-se-na-jejich-pisecku-rika-reziser-a-major-v-zaloze-marhoul-35990', + 'info_dict': { + 'id': '35990', + 'title': 'md5:6011c877a36905f28f271fcd8dcdb0f2', + 'description': 'md5:933f7b06fa337a814ba199d3596d27ba', + }, + 'playlist_count': 2, + }, { + # video with live stream URL + 'url': 'https://www.seznam.cz/zpravy/clanek/znovu-do-vlady-s-ano-pavel-belobradek-ve-volebnim-specialu-seznamu-38489', + 'info_dict': { + 'id': '38489', + 'title': 'md5:8fa1afdc36fd378cf0eba2b74c5aca60', + 'description': 'md5:428e7926a1a81986ec7eb23078004fb4', + }, + 'playlist_count': 1, + }] + + def _real_extract(self, url): + article_id = self._match_id(url) + + webpage = self._download_webpage(url, article_id) + + info = self._search_json_ld(webpage, article_id, default={}) + + title = info.get('title') or self._og_search_title(webpage, fatal=False) + description = info.get('description') or self._og_search_description(webpage) + + return self.playlist_result([ + self.url_result(entry_url, ie=SeznamZpravyIE.ie_key()) + for entry_url in SeznamZpravyIE._extract_urls(webpage)], + article_id, title, description) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/shahid.py youtube-dl-2019.05.20/youtube_dl/extractor/shahid.py --- youtube-dl-2016.02.22/youtube_dl/extractor/shahid.py 2016-01-14 09:35:12.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/shahid.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,99 +1,144 @@ # coding: utf-8 from __future__ import unicode_literals -from .common import InfoExtractor -from ..compat import compat_urllib_parse +import json +import math +import re + +from .aws import AWSIE +from ..compat import compat_HTTPError from ..utils import ( + clean_html, ExtractorError, + InAdvancePagedList, int_or_none, parse_iso8601, + str_or_none, + urlencode_postdata, ) -class ShahidIE(InfoExtractor): - _VALID_URL = r'https?://shahid\.mbc\.net/ar/episode/(?P<id>\d+)/?' 
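# ----- annotation (editor's note, not part of the upstream patch) -----
# The rewritten Shahid extractor below (ShahidBaseIE) proxies its API
# calls and, when a request fails, tries to surface the human-readable
# 'faults' carried in the JSON error body rather than a bare HTTP status.
# A condensed standalone sketch of that pattern; ApiError and
# raise_api_faults are illustrative names, not youtube-dl API:
import json

class ApiError(Exception):
    pass

def raise_api_faults(error_body):
    # error_body: raw bytes of the failed HTTP response.
    try:
        fail_data = json.loads(error_body.decode('utf-8'))
    except ValueError:
        return  # not JSON; let the original HTTP error propagate
    messages = [f['userMessage'] for f in fail_data.get('faults', [])
                if f.get('userMessage')]
    if messages:
        raise ApiError(', '.join(messages))
# ----------------------------------------------------------------------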
+class ShahidBaseIE(AWSIE): + _AWS_PROXY_HOST = 'api2.shahid.net' + _AWS_API_KEY = '2RRtuMHx95aNI1Kvtn2rChEuwsCogUd4samGPjLh' + + def _handle_error(self, e): + fail_data = self._parse_json( + e.cause.read().decode('utf-8'), None, fatal=False) + if fail_data: + faults = fail_data.get('faults', []) + faults_message = ', '.join([clean_html(fault['userMessage']) for fault in faults if fault.get('userMessage')]) + if faults_message: + raise ExtractorError(faults_message, expected=True) + + def _call_api(self, path, video_id, request=None): + query = {} + if request: + query['request'] = json.dumps(request) + try: + return self._aws_execute_api({ + 'uri': '/proxy/v2/' + path, + 'access_key': 'AKIAI6X4TYCIXM2B7MUQ', + 'secret_key': '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', + }, video_id, query) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError): + self._handle_error(e) + raise + + +class ShahidIE(ShahidBaseIE): + _NETRC_MACHINE = 'shahid' + _VALID_URL = r'https?://shahid\.mbc\.net/ar/(?:serie|show|movie)s/[^/]+/(?P<type>episode|clip|movie)-(?P<id>\d+)' _TESTS = [{ - 'url': 'https://shahid.mbc.net/ar/episode/90574/%D8%A7%D9%84%D9%85%D9%84%D9%83-%D8%B9%D8%A8%D8%AF%D8%A7%D9%84%D9%84%D9%87-%D8%A7%D9%84%D8%A5%D9%86%D8%B3%D8%A7%D9%86-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-3.html', + 'url': 'https://shahid.mbc.net/ar/shows/%D9%85%D8%AC%D9%84%D8%B3-%D8%A7%D9%84%D8%B4%D8%A8%D8%A7%D8%A8-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-1/clip-275286', 'info_dict': { - 'id': '90574', + 'id': '275286', 'ext': 'mp4', - 'title': 'الملك عبدالله الإنسان الموسم 1 كليب 3', - 'description': 'الفيلم الوثائقي - الملك عبد الله الإنسان', - 'duration': 2972, - 'timestamp': 1422057420, - 'upload_date': '20150123', + 'title': 'مجلس الشباب الموسم 1 كليب 1', + 'timestamp': 1506988800, + 'upload_date': '20171003', }, 'params': { # m3u8 download 'skip_download': True, } }, { + 'url': 'https://shahid.mbc.net/ar/movies/%D8%A7%D9%84%D9%82%D9%86%D8%A7%D8%B5%D8%A9/movie-151746', + 'only_matching': True + }, { # shahid plus subscriber only - 'url': 'https://shahid.mbc.net/ar/episode/90511/%D9%85%D8%B1%D8%A7%D9%8A%D8%A7-2011-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1.html', + 'url': 'https://shahid.mbc.net/ar/series/%D9%85%D8%B1%D8%A7%D9%8A%D8%A7-2011-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/episode-90511', 'only_matching': True }] - def _handle_error(self, response): - if not isinstance(response, dict): + def _real_initialize(self): + email, password = self._get_login_info() + if email is None: return - error = response.get('error') - if error: - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, '\n'.join(error.values())), - expected=True) - def _download_json(self, url, video_id, note='Downloading JSON metadata'): - response = super(ShahidIE, self)._download_json(url, video_id, note)['data'] - self._handle_error(response) - return response + try: + user_data = self._download_json( + 'https://shahid.mbc.net/wd/service/users/login', + None, 'Logging in', data=json.dumps({ + 'email': email, + 'password': password, + 'basic': 'false', + }).encode('utf-8'), headers={ + 'Content-Type': 'application/json; charset=UTF-8', + })['user'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError): + self._handle_error(e) + raise + + self._download_webpage( + 'https://shahid.mbc.net/populateContext', + None, 'Populate Context', data=urlencode_postdata({ + 'firstName': 
user_data['firstName'], + 'lastName': user_data['lastName'], + 'userName': user_data['email'], + 'csg_user_name': user_data['email'], + 'subscriberId': user_data['id'], + 'sessionId': user_data['sessionId'], + })) def _real_extract(self, url): - video_id = self._match_id(url) + page_type, video_id = re.match(self._VALID_URL, url).groups() + if page_type == 'clip': + page_type = 'episode' - webpage = self._download_webpage(url, video_id) + playout = self._call_api( + 'playout/url/' + video_id, video_id)['playout'] - api_vars = { - 'id': video_id, - 'type': 'player', - 'url': 'http://api.shahid.net/api/v1_1', - 'playerType': 'episode', - } - - flashvars = self._search_regex( - r'var\s+flashvars\s*=\s*({[^}]+})', webpage, 'flashvars', default=None) - if flashvars: - for key in api_vars.keys(): - value = self._search_regex( - r'\b%s\s*:\s*(?P<q>["\'])(?P<value>.+?)(?P=q)' % key, - flashvars, 'type', default=None, group='value') - if value: - api_vars[key] = value - - player = self._download_json( - 'https://shahid.mbc.net/arContent/getPlayerContent-param-.id-%s.type-%s.html' - % (video_id, api_vars['type']), video_id, 'Downloading player JSON') - - if player.get('drm'): + if playout.get('drm'): raise ExtractorError('This video is DRM protected.', expected=True) - formats = self._extract_m3u8_formats(player['url'], video_id, 'mp4') + formats = self._extract_m3u8_formats(playout['url'], video_id, 'mp4') + self._sort_formats(formats) - video = self._download_json( - '%s/%s/%s?%s' % ( - api_vars['url'], api_vars['playerType'], api_vars['id'], - compat_urllib_parse.urlencode({ - 'apiKey': 'sh@hid0nlin3', - 'hash': 'b2wMCTHpSmyxGqQjJFOycRmLSex+BpTK/ooxy6vHaqs=', - })), - video_id, 'Downloading video JSON') - - video = video[api_vars['playerType']] + # video = self._call_api( + # 'product/id', video_id, { + # 'id': video_id, + # 'productType': 'ASSET', + # 'productSubType': page_type.upper() + # })['productModel'] + + response = self._download_json( + 'http://api.shahid.net/api/v1_1/%s/%s' % (page_type, video_id), + video_id, 'Downloading video JSON', query={ + 'apiKey': 'sh@hid0nlin3', + 'hash': 'b2wMCTHpSmyxGqQjJFOycRmLSex+BpTK/ooxy6vHaqs=', + }) + data = response.get('data', {}) + error = data.get('error') + if error: + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, '\n'.join(error.values())), + expected=True) + video = data[page_type] title = video['title'] - description = video.get('description') - thumbnail = video.get('thumbnailUrl') - duration = int_or_none(video.get('duration')) - timestamp = parse_iso8601(video.get('referenceDate')) categories = [ category['name'] for category in video.get('genres', []) if 'name' in category] @@ -101,10 +146,70 @@ return { 'id': video_id, 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'timestamp': timestamp, + 'description': video.get('description'), + 'thumbnail': video.get('thumbnailUrl'), + 'duration': int_or_none(video.get('duration')), + 'timestamp': parse_iso8601(video.get('referenceDate')), 'categories': categories, + 'series': video.get('showTitle') or video.get('showName'), + 'season': video.get('seasonTitle'), + 'season_number': int_or_none(video.get('seasonNumber')), + 'season_id': str_or_none(video.get('seasonId')), + 'episode_number': int_or_none(video.get('number')), + 'episode_id': video_id, 'formats': formats, } + + +class ShahidShowIE(ShahidBaseIE): + _VALID_URL = r'https?://shahid\.mbc\.net/ar/(?:show|serie)s/[^/]+/(?:show|series)-(?P<id>\d+)' + _TESTS = [{ + 'url': 
'https://shahid.mbc.net/ar/shows/%D8%B1%D8%A7%D9%85%D8%B2-%D9%82%D8%B1%D8%B4-%D8%A7%D9%84%D8%A8%D8%AD%D8%B1/show-79187', + 'info_dict': { + 'id': '79187', + 'title': 'رامز قرش البحر', + 'description': 'md5:c88fa7e0f02b0abd39d417aee0d046ff', + }, + 'playlist_mincount': 32, + }, { + 'url': 'https://shahid.mbc.net/ar/series/How-to-live-Longer-(The-Big-Think)/series-291861', + 'only_matching': True + }] + _PAGE_SIZE = 30 + + def _real_extract(self, url): + show_id = self._match_id(url) + + product = self._call_api( + 'playableAsset', show_id, {'showId': show_id})['productModel'] + playlist = product['playlist'] + playlist_id = playlist['id'] + show = product.get('show', {}) + + def page_func(page_num): + playlist = self._call_api( + 'product/playlist', show_id, { + 'playListId': playlist_id, + 'pageNumber': page_num, + 'pageSize': 30, + 'sorts': [{ + 'order': 'DESC', + 'type': 'SORTDATE' + }], + }) + for product in playlist.get('productList', {}).get('products', []): + product_url = product.get('productUrl', []).get('url') + if not product_url: + continue + yield self.url_result( + product_url, 'Shahid', + str_or_none(product.get('id')), + product.get('title')) + + entries = InAdvancePagedList( + page_func, + math.ceil(playlist['count'] / self._PAGE_SIZE), + self._PAGE_SIZE) + + return self.playlist_result( + entries, show_id, show.get('title'), show.get('description')) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/shared.py youtube-dl-2019.05.20/youtube_dl/extractor/shared.py --- youtube-dl-2016.02.22/youtube_dl/extractor/shared.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/shared.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,21 +1,56 @@ from __future__ import unicode_literals -import base64 - from .common import InfoExtractor -from ..compat import compat_urllib_parse +from ..compat import compat_b64decode from ..utils import ( + determine_ext, ExtractorError, int_or_none, - sanitized_Request, + KNOWN_EXTENSIONS, + parse_filesize, + url_or_none, + urlencode_postdata, ) -class SharedIE(InfoExtractor): - IE_DESC = 'shared.sx and vivo.sx' - _VALID_URL = r'http://(?:shared|vivo)\.sx/(?P<id>[\da-z]{10})' +class SharedBaseIE(InfoExtractor): + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage, urlh = self._download_webpage_handle(url, video_id) + + if self._FILE_NOT_FOUND in webpage: + raise ExtractorError( + 'Video %s does not exist' % video_id, expected=True) - _TESTS = [{ + video_url = self._extract_video_url(webpage, video_id, url) + + title = self._extract_title(webpage) + filesize = int_or_none(self._extract_filesize(webpage)) + + return { + 'id': video_id, + 'url': video_url, + 'ext': 'mp4', + 'filesize': filesize, + 'title': title, + } + + def _extract_title(self, webpage): + return compat_b64decode(self._html_search_meta( + 'full:title', webpage, 'title')).decode('utf-8') + + def _extract_filesize(self, webpage): + return self._html_search_meta( + 'full:size', webpage, 'file size', fatal=False) + + +class SharedIE(SharedBaseIE): + IE_DESC = 'shared.sx' + _VALID_URL = r'https?://shared\.sx/(?P<id>[\da-z]{10})' + _FILE_NOT_FOUND = '>File does not exist<' + + _TEST = { 'url': 'http://shared.sx/0060718775', 'md5': '106fefed92a8a2adb8c98e6a0652f49b', 'info_dict': { @@ -24,47 +59,69 @@ 'title': 'Bmp4', 'filesize': 1720110, }, - }, { - 'url': 'http://vivo.sx/d7ddda0e78', - 'md5': '15b3af41be0b4fe01f4df075c2678b2c', - 'info_dict': { - 'id': 'd7ddda0e78', - 'ext': 'mp4', - 'title': 'Chicken', - 'filesize': 528031, - }, - }] - - 
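# ----- annotation (editor's note, not part of the upstream patch) -----
# The shared.sx/vivo.sx rework above is a template-method refactor:
# SharedBaseIE owns _real_extract(), and each site only overrides the
# hooks (_FILE_NOT_FOUND, _extract_video_url, _extract_title,
# _extract_filesize). shared.sx, for instance, stores the display title
# base64-encoded in a <meta name="full:title"> tag. A minimal sketch of
# that decode step; decode_shared_title is an illustrative name, not a
# youtube-dl API:
import base64

def decode_shared_title(meta_content):
    # <meta name="full:title" content="Qm1wNA=="> -> 'Bmp4'
    return base64.b64decode(meta_content).decode('utf-8')
# ----------------------------------------------------------------------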
def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - if '>File does not exist<' in webpage: - raise ExtractorError( - 'Video %s does not exist' % video_id, expected=True) + } + def _extract_video_url(self, webpage, video_id, url): download_form = self._hidden_inputs(webpage) - request = sanitized_Request( - url, compat_urllib_parse.urlencode(download_form)) - request.add_header('Content-Type', 'application/x-www-form-urlencoded') video_page = self._download_webpage( - request, video_id, 'Downloading video page') + url, video_id, 'Downloading video page', + data=urlencode_postdata(download_form), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': url, + }) video_url = self._html_search_regex( - r'data-url="([^"]+)"', video_page, 'video URL') - title = base64.b64decode(self._html_search_meta( - 'full:title', webpage, 'title').encode('utf-8')).decode('utf-8') - filesize = int_or_none(self._html_search_meta( - 'full:size', webpage, 'file size', fatal=False)) - thumbnail = self._html_search_regex( - r'data-poster="([^"]+)"', video_page, 'thumbnail', default=None) + r'data-url=(["\'])(?P<url>(?:(?!\1).)+)\1', + video_page, 'video URL', group='url') - return { - 'id': video_id, - 'url': video_url, + return video_url + + +class VivoIE(SharedBaseIE): + IE_DESC = 'vivo.sx' + _VALID_URL = r'https?://vivo\.sx/(?P<id>[\da-z]{10})' + _FILE_NOT_FOUND = '>The file you have requested does not exists or has been removed' + + _TEST = { + 'url': 'http://vivo.sx/d7ddda0e78', + 'md5': '15b3af41be0b4fe01f4df075c2678b2c', + 'info_dict': { + 'id': 'd7ddda0e78', 'ext': 'mp4', - 'filesize': filesize, - 'title': title, - 'thumbnail': thumbnail, - } + 'title': 'Chicken', + 'filesize': 515659, + }, + } + + def _extract_title(self, webpage): + title = self._html_search_regex( + r'data-name\s*=\s*(["\'])(?P<title>(?:(?!\1).)+)\1', webpage, + 'title', default=None, group='title') + if title: + ext = determine_ext(title) + if ext.lower() in KNOWN_EXTENSIONS: + title = title.rpartition('.' 
+ ext)[0] + return title + return self._og_search_title(webpage) + + def _extract_filesize(self, webpage): + return parse_filesize(self._search_regex( + r'data-type=["\']video["\'][^>]*>Watch.*?<strong>\s*\((.+?)\)', + webpage, 'filesize', fatal=False)) + + def _extract_video_url(self, webpage, video_id, url): + def decode_url(encoded_url): + return compat_b64decode(encoded_url).decode('utf-8') + + stream_url = url_or_none(decode_url(self._search_regex( + r'data-stream\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'stream url', default=None, group='url'))) + if stream_url: + return stream_url + return self._parse_json( + self._search_regex( + r'InitializeStream\s*\(\s*(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage, 'stream', group='url'), + video_id, transform_source=decode_url)[0] diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/sharesix.py youtube-dl-2019.05.20/youtube_dl/extractor/sharesix.py --- youtube-dl-2016.02.22/youtube_dl/extractor/sharesix.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/sharesix.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,91 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_urllib_parse -from ..utils import ( - parse_duration, - sanitized_Request, -) - - -class ShareSixIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?sharesix\.com/(?:f/)?(?P<id>[0-9a-zA-Z]+)' - _TESTS = [ - { - 'url': 'http://sharesix.com/f/OXjQ7Y6', - 'md5': '9e8e95d8823942815a7d7c773110cc93', - 'info_dict': { - 'id': 'OXjQ7Y6', - 'ext': 'mp4', - 'title': 'big_buck_bunny_480p_surround-fix.avi', - 'duration': 596, - 'width': 854, - 'height': 480, - }, - }, - { - 'url': 'http://sharesix.com/lfrwoxp35zdd', - 'md5': 'dd19f1435b7cec2d7912c64beeee8185', - 'info_dict': { - 'id': 'lfrwoxp35zdd', - 'ext': 'flv', - 'title': 'WhiteBoard___a_Mac_vs_PC_Parody_Cartoon.mp4.flv', - 'duration': 65, - 'width': 1280, - 'height': 720, - }, - } - ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - fields = { - 'method_free': 'Free' - } - post = compat_urllib_parse.urlencode(fields) - req = sanitized_Request(url, post) - req.add_header('Content-type', 'application/x-www-form-urlencoded') - - webpage = self._download_webpage(req, video_id, - 'Downloading video page') - - video_url = self._search_regex( - r"var\slnk1\s=\s'([^']+)'", webpage, 'video URL') - title = self._html_search_regex( - r'(?s)<dt>Filename:</dt>.+?<dd>(.+?)</dd>', webpage, 'title') - duration = parse_duration( - self._search_regex( - r'(?s)<dt>Length:</dt>.+?<dd>(.+?)</dd>', - webpage, - 'duration', - fatal=False - ) - ) - - m = re.search( - r'''(?xs)<dt>Width\sx\sHeight</dt>.+? 
- <dd>(?P<width>\d+)\sx\s(?P<height>\d+)</dd>''', - webpage - ) - width = height = None - if m: - width, height = int(m.group('width')), int(m.group('height')) - - formats = [{ - 'format_id': 'sd', - 'url': video_url, - 'width': width, - 'height': height, - }] - - return { - 'id': video_id, - 'title': title, - 'duration': duration, - 'formats': formats, - } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/showroomlive.py youtube-dl-2019.05.20/youtube_dl/extractor/showroomlive.py --- youtube-dl-2016.02.22/youtube_dl/extractor/showroomlive.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/showroomlive.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,84 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + urljoin, +) + + +class ShowRoomLiveIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?showroom-live\.com/(?!onlive|timetable|event|campaign|news|ranking|room)(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://www.showroom-live.com/48_Nana_Okada', + 'only_matching': True, + } + + def _real_extract(self, url): + broadcaster_id = self._match_id(url) + + webpage = self._download_webpage(url, broadcaster_id) + + room_id = self._search_regex( + (r'SrGlobal\.roomId\s*=\s*(\d+)', + r'(?:profile|room)\?room_id\=(\d+)'), webpage, 'room_id') + + room = self._download_json( + urljoin(url, '/api/room/profile?room_id=%s' % room_id), + broadcaster_id) + + is_live = room.get('is_onlive') + if is_live is not True: + raise ExtractorError('%s is offline' % broadcaster_id, expected=True) + + uploader = room.get('performer_name') or broadcaster_id + title = room.get('room_name') or room.get('main_name') or uploader + + streaming_url_list = self._download_json( + urljoin(url, '/api/live/streaming_url?room_id=%s' % room_id), + broadcaster_id)['streaming_url_list'] + + formats = [] + for stream in streaming_url_list: + stream_url = stream.get('url') + if not stream_url: + continue + stream_type = stream.get('type') + if stream_type == 'hls': + m3u8_formats = self._extract_m3u8_formats( + stream_url, broadcaster_id, ext='mp4', m3u8_id='hls', + live=True) + for f in m3u8_formats: + f['quality'] = int_or_none(stream.get('quality', 100)) + formats.extend(m3u8_formats) + elif stream_type == 'rtmp': + stream_name = stream.get('stream_name') + if not stream_name: + continue + formats.append({ + 'url': stream_url, + 'play_path': stream_name, + 'page_url': url, + 'player_url': 'https://www.showroom-live.com/assets/swf/v3/ShowRoomLive.swf', + 'rtmp_live': True, + 'ext': 'flv', + 'format_id': 'rtmp', + 'format_note': stream.get('label'), + 'quality': int_or_none(stream.get('quality', 100)), + }) + self._sort_formats(formats) + + return { + 'id': compat_str(room.get('live_id') or broadcaster_id), + 'title': self._live_title(title), + 'description': room.get('description'), + 'timestamp': int_or_none(room.get('current_live_started_at')), + 'uploader': uploader, + 'uploader_id': broadcaster_id, + 'view_count': int_or_none(room.get('view_num')), + 'formats': formats, + 'is_live': True, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/sina.py youtube-dl-2019.05.20/youtube_dl/extractor/sina.py --- youtube-dl-2016.02.22/youtube_dl/extractor/sina.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/sina.py 2019-06-07 18:49:07.000000000 +0000 @@ -4,28 +4,35 @@ import re from .common import InfoExtractor -from ..compat 
import compat_urllib_parse -from ..utils import sanitized_Request +from ..utils import ( + HEADRequest, + ExtractorError, + int_or_none, + update_url_query, + qualities, + get_element_by_attribute, + clean_html, +) class SinaIE(InfoExtractor): - _VALID_URL = r'''(?x)https?://(.*?\.)?video\.sina\.com\.cn/ - ( - (.+?/(((?P<pseudo_id>\d+).html)|(.*?(\#|(vid=)|b/)(?P<id>\d+?)($|&|\-)))) - | + _VALID_URL = r'''(?x)https?://(?:.*?\.)?video\.sina\.com\.cn/ + (?: + (?:view/|.*\#)(?P<video_id>\d+)| + .+?/(?P<pseudo_id>[^/?#]+)(?:\.s?html)| # This is used by external sites like Weibo - (api/sinawebApi/outplay.php/(?P<token>.+?)\.swf) + api/sinawebApi/outplay.php/(?P<token>.+?)\.swf ) ''' _TESTS = [ { - 'url': 'http://video.sina.com.cn/news/vlist/zt/chczlj2013/?opsubject_id=top12#110028898', - 'md5': 'd65dd22ddcf44e38ce2bf58a10c3e71f', + 'url': 'http://video.sina.com.cn/news/spj/topvideoes20160504/?opsubject_id=top1#250576622', + 'md5': 'd38433e2fc886007729735650ae4b3e9', 'info_dict': { - 'id': '110028898', - 'ext': 'flv', - 'title': '《中国新闻》 朝鲜要求巴拿马立即释放被扣船员', + 'id': '250576622', + 'ext': 'mp4', + 'title': '现场:克鲁兹宣布退选 特朗普将稳获提名', } }, { @@ -35,37 +42,74 @@ 'ext': 'flv', 'title': '军方提高对朝情报监视级别', }, + 'skip': 'the page does not exist or has been deleted', + }, + { + 'url': 'http://video.sina.com.cn/view/250587748.html', + 'md5': '3d1807a25c775092aab3bc157fff49b4', + 'info_dict': { + 'id': '250587748', + 'ext': 'mp4', + 'title': '瞬间泪目:8年前汶川地震珍贵视频首曝光', + }, }, ] - def _extract_video(self, video_id): - data = compat_urllib_parse.urlencode({'vid': video_id}) - url_doc = self._download_xml('http://v.iask.com/v_play.php?%s' % data, - video_id, 'Downloading video url') - image_page = self._download_webpage( - 'http://interface.video.sina.com.cn/interface/common/getVideoImage.php?%s' % data, - video_id, 'Downloading thumbnail info') - - return {'id': video_id, - 'url': url_doc.find('./durl/url').text, - 'ext': 'flv', - 'title': url_doc.find('./vname').text, - 'thumbnail': image_page.split('=')[1], - } - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - if mobj.group('token') is not None: - # The video id is in the redirected url - self.to_screen('Getting video id') - request = sanitized_Request(url) - request.get_method = lambda: 'HEAD' - (_, urlh) = self._download_webpage_handle(request, 'NA', False) - return self._real_extract(urlh.geturl()) - elif video_id is None: - pseudo_id = mobj.group('pseudo_id') - webpage = self._download_webpage(url, pseudo_id) - video_id = self._search_regex(r'vid:\'(\d+?)\'', webpage, 'video id') - return self._extract_video(video_id) + video_id = mobj.group('video_id') + if not video_id: + if mobj.group('token') is not None: + # The video id is in the redirected url + self.to_screen('Getting video id') + request = HEADRequest(url) + _, urlh = self._download_webpage_handle(request, 'NA', False) + return self._real_extract(urlh.geturl()) + else: + pseudo_id = mobj.group('pseudo_id') + webpage = self._download_webpage(url, pseudo_id) + error = get_element_by_attribute('class', 'errtitle', webpage) + if error: + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, clean_html(error)), expected=True) + video_id = self._search_regex( + r"video_id\s*:\s*'(\d+)'", webpage, 'video id') + + video_data = self._download_json( + 'http://s.video.sina.com.cn/video/h5play', + video_id, query={'video_id': video_id}) + if video_data['code'] != 1: + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, video_data['message']), expected=True) + else: 
+ video_data = video_data['data'] + title = video_data['title'] + description = video_data.get('description') + if description: + description = description.strip() + + preference = qualities(['cif', 'sd', 'hd', 'fhd', 'ffd']) + formats = [] + for quality_id, quality in video_data.get('videos', {}).get('mp4', {}).items(): + file_api = quality.get('file_api') + file_id = quality.get('file_id') + if not file_api or not file_id: + continue + formats.append({ + 'format_id': quality_id, + 'url': update_url_query(file_api, {'vid': file_id}), + 'preference': preference(quality_id), + 'ext': 'mp4', + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': video_data.get('image'), + 'duration': int_or_none(video_data.get('length')), + 'timestamp': int_or_none(video_data.get('create_time')), + 'formats': formats, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/sixplay.py youtube-dl-2019.05.20/youtube_dl/extractor/sixplay.py --- youtube-dl-2016.02.22/youtube_dl/extractor/sixplay.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/sixplay.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,125 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_str, + compat_urllib_parse_urlparse, +) +from ..utils import ( + determine_ext, + int_or_none, + try_get, + qualities, +) + + +class SixPlayIE(InfoExtractor): + IE_NAME = '6play' + _VALID_URL = r'(?:6play:|https?://(?:www\.)?(?P<domain>6play\.fr|rtlplay\.be|play\.rtl\.hr)/.+?-c_)(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'https://www.6play.fr/minute-par-minute-p_9533/le-but-qui-a-marque-lhistoire-du-football-francais-c_12041051', + 'md5': '31fcd112637baa0c2ab92c4fcd8baf27', + 'info_dict': { + 'id': '12041051', + 'ext': 'mp4', + 'title': 'Le but qui a marqué l\'histoire du football français !', + 'description': 'md5:b59e7e841d646ef1eb42a7868eb6a851', + }, + }, { + 'url': 'https://www.rtlplay.be/rtl-info-13h-p_8551/les-titres-du-rtlinfo-13h-c_12045869', + 'only_matching': True, + }, { + 'url': 'https://play.rtl.hr/pj-masks-p_9455/epizoda-34-sezona-1-catboyevo-cudo-na-dva-kotaca-c_11984989', + 'only_matching': True, + }] + + def _real_extract(self, url): + domain, video_id = re.search(self._VALID_URL, url).groups() + service, consumer_name = { + '6play.fr': ('6play', 'm6web'), + 'rtlplay.be': ('rtlbe_rtl_play', 'rtlbe'), + 'play.rtl.hr': ('rtlhr_rtl_play', 'rtlhr'), + }.get(domain, ('6play', 'm6web')) + + data = self._download_json( + 'https://pc.middleware.6play.fr/6play/v2/platforms/m6group_web/services/%s/videos/clip_%s' % (service, video_id), + video_id, headers={ + 'x-customer-name': consumer_name + }, query={ + 'csa': 5, + 'with': 'clips', + }) + + clip_data = data['clips'][0] + title = clip_data['title'] + + urls = [] + quality_key = qualities(['lq', 'sd', 'hq', 'hd']) + formats = [] + subtitles = {} + assets = clip_data.get('assets') or [] + for asset in assets: + asset_url = asset.get('full_physical_path') + protocol = asset.get('protocol') + if not asset_url or ((protocol == 'primetime' or asset.get('type') == 'usp_hlsfp_h264') and not ('_drmnp.ism/' in asset_url or '_unpnp.ism/' in asset_url)) or asset_url in urls: + continue + urls.append(asset_url) + container = asset.get('video_container') + ext = determine_ext(asset_url) + if protocol == 'http_subtitle' or ext == 'vtt': + subtitles.setdefault('fr', []).append({'url': asset_url}) + 
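# ----- annotation (editor's note, not part of the upstream patch) -----
# Format ranking in the 6play extractor above leans on youtube-dl's
# qualities() helper: it maps a quality label to its index in a
# worst-to-best list, so unknown labels sort below everything else. A
# minimal standalone equivalent, assuming the same 'lq' < 'sd' < 'hq' <
# 'hd' ordering used above (make_quality_key is an illustrative name):
def make_quality_key(ordered_labels):
    def key(label):
        try:
            return ordered_labels.index(label)
        except ValueError:
            return -1  # unknown labels rank lowest
    return key

quality_key = make_quality_key(['lq', 'sd', 'hq', 'hd'])
assert quality_key('hd') > quality_key('sd') > quality_key('bogus')
# ----------------------------------------------------------------------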
continue + if container == 'm3u8' or ext == 'm3u8': + if protocol == 'usp': + if compat_parse_qs(compat_urllib_parse_urlparse(asset_url).query).get('token', [None])[0]: + urlh = self._request_webpage( + asset_url, video_id, fatal=False, + headers=self.geo_verification_headers()) + if not urlh: + continue + asset_url = urlh.geturl() + asset_url = asset_url.replace('_drmnp.ism/', '_unpnp.ism/') + for i in range(3, 0, -1): + asset_url = asset_url = asset_url.replace('_sd1/', '_sd%d/' % i) + m3u8_formats = self._extract_m3u8_formats( + asset_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False) + formats.extend(m3u8_formats) + formats.extend(self._extract_mpd_formats( + asset_url.replace('.m3u8', '.mpd'), + video_id, mpd_id='dash', fatal=False)) + if m3u8_formats: + break + else: + formats.extend(self._extract_m3u8_formats( + asset_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif container == 'mp4' or ext == 'mp4': + quality = asset.get('video_quality') + formats.append({ + 'url': asset_url, + 'format_id': quality, + 'quality': quality_key(quality), + 'ext': ext, + }) + self._sort_formats(formats) + + def get(getter): + for src in (data, clip_data): + v = try_get(src, getter, compat_str) + if v: + return v + + return { + 'id': video_id, + 'title': title, + 'description': get(lambda x: x['description']), + 'duration': int_or_none(clip_data.get('duration')), + 'series': get(lambda x: x['program']['title']), + 'formats': formats, + 'subtitles': subtitles, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/skylinewebcams.py youtube-dl-2019.05.20/youtube_dl/extractor/skylinewebcams.py --- youtube-dl-2016.02.22/youtube_dl/extractor/skylinewebcams.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/skylinewebcams.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,42 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class SkylineWebcamsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?skylinewebcams\.com/[^/]+/webcam/(?:[^/]+/)+(?P<id>[^/]+)\.html' + _TEST = { + 'url': 'https://www.skylinewebcams.com/it/webcam/italia/lazio/roma/scalinata-piazza-di-spagna-barcaccia.html', + 'info_dict': { + 'id': 'scalinata-piazza-di-spagna-barcaccia', + 'ext': 'mp4', + 'title': 're:^Live Webcam Scalinata di Piazza di Spagna - La Barcaccia [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': 'Roma, veduta sulla Scalinata di Piazza di Spagna e sulla Barcaccia', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + stream_url = self._search_regex( + r'(?:url|source)\s*:\s*(["\'])(?P<url>(?:https?:)?//.+?\.m3u8.*?)\1', webpage, + 'stream url', group='url') + + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) + + return { + 'id': video_id, + 'url': stream_url, + 'ext': 'mp4', + 'title': self._live_title(title), + 'description': description, + 'is_live': True, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/skynewsarabia.py youtube-dl-2019.05.20/youtube_dl/extractor/skynewsarabia.py --- youtube-dl-2016.02.22/youtube_dl/extractor/skynewsarabia.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/skynewsarabia.py 2019-06-07 18:49:07.000000000 +0000 @@ -67,7 +67,7 @@ class SkyNewsArabiaArticleIE(SkyNewsArabiaBaseIE): - IE_NAME = 'skynewsarabia:video' + IE_NAME 
= 'skynewsarabia:article' _VALID_URL = r'https?://(?:www\.)?skynewsarabia\.com/web/article/(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://www.skynewsarabia.com/web/article/794549/%D8%A7%D9%94%D8%AD%D8%AF%D8%A7%D8%AB-%D8%A7%D9%84%D8%B4%D8%B1%D9%82-%D8%A7%D9%84%D8%A7%D9%94%D9%88%D8%B3%D8%B7-%D8%AE%D8%B1%D9%8A%D8%B7%D8%A9-%D8%A7%D9%84%D8%A7%D9%94%D9%84%D8%B9%D8%A7%D8%A8-%D8%A7%D9%84%D8%B0%D9%83%D9%8A%D8%A9', diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/sky.py youtube-dl-2019.05.20/youtube_dl/extractor/sky.py --- youtube-dl-2016.02.22/youtube_dl/extractor/sky.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/sky.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,70 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + smuggle_url, + strip_or_none, + urljoin, +) + + +class SkyBaseIE(InfoExtractor): + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + video_data = extract_attributes(self._search_regex( + r'(<div.+?class="[^"]*sdc-article-video__media-ooyala[^"]*"[^>]+>)', + webpage, 'video data')) + + video_url = 'ooyala:%s' % video_data['data-video-id'] + if video_data.get('data-token-required') == 'true': + token_fetch_options = self._parse_json(video_data.get( + 'data-token-fetch-options', '{}'), video_id, fatal=False) or {} + token_fetch_url = token_fetch_options.get('url') + if token_fetch_url: + embed_token = self._download_webpage(urljoin( + url, token_fetch_url), video_id, fatal=False) + if embed_token: + video_url = smuggle_url( + video_url, {'embed_token': embed_token.strip('"')}) + + return { + '_type': 'url_transparent', + 'id': video_id, + 'url': video_url, + 'title': self._og_search_title(webpage), + 'description': strip_or_none(self._og_search_description(webpage)), + 'ie_key': 'Ooyala', + } + + +class SkySportsIE(SkyBaseIE): + _VALID_URL = r'https?://(?:www\.)?skysports\.com/watch/video/(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.skysports.com/watch/video/10328419/bale-its-our-time-to-shine', + 'md5': '77d59166cddc8d3cb7b13e35eaf0f5ec', + 'info_dict': { + 'id': 'o3eWJnNDE6l7kfNO8BOoBlRxXRQ4ANNQ', + 'ext': 'mp4', + 'title': 'Bale: It\'s our time to shine', + 'description': 'md5:e88bda94ae15f7720c5cb467e777bb6d', + }, + 'add_ie': ['Ooyala'], + } + + +class SkyNewsIE(SkyBaseIE): + _VALID_URL = r'https?://news\.sky\.com/video/[0-9a-z-]+-(?P<id>[0-9]+)' + _TEST = { + 'url': 'https://news.sky.com/video/russian-plane-inspected-after-deadly-fire-11712962', + 'md5': 'd6327e581473cea9976a3236ded370cd', + 'info_dict': { + 'id': '1ua21xaDE6lCtZDmbYfl8kwsKLooJbNM', + 'ext': 'mp4', + 'title': 'Russian plane inspected after deadly fire', + 'description': 'The Russian Investigative Committee has released video of the wreckage of a passenger plane which caught fire near Moscow.', + }, + 'add_ie': ['Ooyala'], + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/skysports.py youtube-dl-2019.05.20/youtube_dl/extractor/skysports.py --- youtube-dl-2016.02.22/youtube_dl/extractor/skysports.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/skysports.py 2019-06-07 18:47:18.000000000 +0000 @@ -0,0 +1,49 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + smuggle_url, + strip_or_none, + urljoin, +) + + +class SkySportsIE(InfoExtractor): + _VALID_URL = 
r'https?://(?:www\.)?skysports\.com/watch/video/(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.skysports.com/watch/video/10328419/bale-its-our-time-to-shine', + 'md5': '77d59166cddc8d3cb7b13e35eaf0f5ec', + 'info_dict': { + 'id': '10328419', + 'ext': 'mp4', + 'title': 'Bale: It\'s our time to shine', + 'description': 'md5:e88bda94ae15f7720c5cb467e777bb6d', + }, + 'add_ie': ['Ooyala'], + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + video_data = extract_attributes(self._search_regex( + r'(<div.+?class="sdc-article-video__media-ooyala"[^>]+>)', webpage, 'video data')) + + video_url = 'ooyala:%s' % video_data['data-video-id'] + if video_data.get('data-token-required') == 'true': + token_fetch_options = self._parse_json(video_data.get('data-token-fetch-options', '{}'), video_id, fatal=False) or {} + token_fetch_url = token_fetch_options.get('url') + if token_fetch_url: + embed_token = self._download_webpage(urljoin(url, token_fetch_url), video_id, fatal=False) + if embed_token: + video_url = smuggle_url(video_url, {'embed_token': embed_token.strip('"')}) + + return { + '_type': 'url_transparent', + 'id': video_id, + 'url': video_url, + 'title': self._og_search_title(webpage), + 'description': strip_or_none(self._og_search_description(webpage)), + 'ie_key': 'Ooyala', + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/slideshare.py youtube-dl-2019.05.20/youtube_dl/extractor/slideshare.py --- youtube-dl-2016.02.22/youtube_dl/extractor/slideshare.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/slideshare.py 2019-06-07 18:49:07.000000000 +0000 @@ -9,11 +9,12 @@ ) from ..utils import ( ExtractorError, + get_element_by_id, ) class SlideshareIE(InfoExtractor): - _VALID_URL = r'https?://www\.slideshare\.net/[^/]+?/(?P<title>.+?)($|\?)' + _VALID_URL = r'https?://(?:www\.)?slideshare\.net/[^/]+?/(?P<title>.+?)($|\?)' _TEST = { 'url': 'http://www.slideshare.net/Dataversity/keynote-presentation-managing-scale-and-complexity', @@ -30,7 +31,7 @@ page_title = mobj.group('title') webpage = self._download_webpage(url, page_title) slideshare_obj = self._search_regex( - r'\$\.extend\(slideshare_object,\s*(\{.*?\})\);', + r'\$\.extend\(.*?slideshare_object,\s*(\{.*?\})\);', webpage, 'slideshare object') info = json.loads(slideshare_obj) if info['slideshow']['type'] != 'video': @@ -40,7 +41,7 @@ bucket = info['jsplayer']['video_bucket'] ext = info['jsplayer']['video_extension'] video_url = compat_urlparse.urljoin(bucket, doc + '-SD.' 
+ ext) - description = self._html_search_regex( + description = get_element_by_id('slideshow-description-paragraph', webpage) or self._html_search_regex( r'(?s)<p[^>]+itemprop="description"[^>]*>(.+?)</p>', webpage, 'description', fatal=False) @@ -51,5 +52,5 @@ 'ext': ext, 'url': video_url, 'thumbnail': info['slideshow']['pin_image_url'], - 'description': description, + 'description': description.strip() if description else None, } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/slideslive.py youtube-dl-2019.05.20/youtube_dl/extractor/slideslive.py --- youtube-dl-2016.02.22/youtube_dl/extractor/slideslive.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/slideslive.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,39 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class SlidesLiveIE(InfoExtractor): + _VALID_URL = r'https?://slideslive\.com/(?P<id>[0-9]+)' + _TESTS = [{ + # video_service_name = YOUTUBE + 'url': 'https://slideslive.com/38902413/gcc-ia16-backend', + 'md5': 'b29fcd6c6952d0c79c5079b0e7a07e6f', + 'info_dict': { + 'id': 'LMtgR8ba0b0', + 'ext': 'mp4', + 'title': '38902413: external video', + 'description': '3890241320170925-9-1yd6ech.mp4', + 'uploader': 'SlidesLive Administrator', + 'uploader_id': 'UC62SdArr41t_-_fX40QCLRw', + 'upload_date': '20170925', + } + }, { + # video_service_name = youtube + 'url': 'https://slideslive.com/38903721/magic-a-scientific-resurrection-of-an-esoteric-legend', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + video_data = self._download_json( + url, video_id, headers={'Accept': 'application/json'}) + service_name = video_data['video_service_name'].lower() + if service_name == 'youtube': + yt_video_id = video_data['video_service_id'] + return self.url_result(yt_video_id, 'Youtube', video_id=yt_video_id) + else: + raise ExtractorError( + 'Unsupported service name: {0}'.format(service_name), expected=True) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/slutload.py youtube-dl-2019.05.20/youtube_dl/extractor/slutload.py --- youtube-dl-2016.02.22/youtube_dl/extractor/slutload.py 2016-02-14 22:26:17.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/slutload.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,44 +1,65 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor class SlutloadIE(InfoExtractor): - _VALID_URL = r'^https?://(?:\w+\.)?slutload\.com/video/[^/]+/(?P<id>[^/]+)/?$' - _TEST = { + _VALID_URL = r'https?://(?:\w+\.)?slutload\.com/(?:video/[^/]+|embed_player|watch)/(?P<id>[^/]+)' + _TESTS = [{ 'url': 'http://www.slutload.com/video/virginie-baisee-en-cam/TD73btpBqSxc/', - 'md5': '0cf531ae8006b530bd9df947a6a0df77', + 'md5': '868309628ba00fd488cf516a113fd717', 'info_dict': { 'id': 'TD73btpBqSxc', 'ext': 'mp4', 'title': 'virginie baisee en cam', 'age_limit': 18, - 'thumbnail': 're:https?://.*?\.jpg' - } - } + 'thumbnail': r're:https?://.*?\.jpg' + }, + }, { + # mobile site + 'url': 'http://mobile.slutload.com/video/masturbation-solo/fviFLmc6kzJ/', + 'only_matching': True, + }, { + 'url': 'http://www.slutload.com/embed_player/TD73btpBqSxc/', + 'only_matching': True, + }, { + 'url': 'http://www.slutload.com/watch/TD73btpBqSxc/Virginie-Baisee-En-Cam.html', + 'only_matching': True, + }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - webpage = self._download_webpage(url, 
video_id) - - video_title = self._html_search_regex(r'<h1><strong>([^<]+)</strong>', - webpage, 'title').strip() + video_id = self._match_id(url) - video_url = self._html_search_regex( - r'(?s)<div id="vidPlayer"\s+data-url="([^"]+)"', - webpage, 'video URL') - thumbnail = self._html_search_regex( - r'(?s)<div id="vidPlayer"\s+.*?previewer-file="([^"]+)"', - webpage, 'thumbnail', fatal=False) - - return { + embed_page = self._download_webpage( + 'http://www.slutload.com/embed_player/%s' % video_id, video_id, + 'Downloading embed page', fatal=False) + + if embed_page: + def extract(what): + return self._html_search_regex( + r'data-video-%s=(["\'])(?P<url>(?:(?!\1).)+)\1' % what, + embed_page, 'video %s' % what, default=None, group='url') + + video_url = extract('url') + if video_url: + title = self._html_search_regex( + r'<title>([^<]+)', embed_page, 'title', default=video_id) + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'thumbnail': extract('preview'), + 'age_limit': 18 + } + + webpage = self._download_webpage( + 'http://www.slutload.com/video/_/%s/' % video_id, video_id) + title = self._html_search_regex( + r'<h1><strong>([^<]+)</strong>', webpage, 'title').strip() + info = self._parse_html5_media_entries(url, webpage, video_id)[0] + info.update({ 'id': video_id, - 'url': video_url, - 'title': video_title, - 'thumbnail': thumbnail, - 'age_limit': 18 - } + 'title': title, + 'age_limit': 18, + }) + return info diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/smotri.py youtube-dl-2019.05.20/youtube_dl/extractor/smotri.py --- youtube-dl-2016.02.22/youtube_dl/extractor/smotri.py 2016-02-14 22:26:17.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/smotri.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re @@ -7,26 +7,27 @@ import uuid from .common import InfoExtractor -from ..compat import compat_urllib_parse from ..utils import ( ExtractorError, int_or_none, sanitized_Request, unified_strdate, + urlencode_postdata, + xpath_text, ) class SmotriIE(InfoExtractor): IE_DESC = 'Smotri.com' IE_NAME = 'smotri' - _VALID_URL = r'^https?://(?:www\.)?(?:smotri\.com/video/view/\?id=|pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=)(?P<id>v(?P<realvideoid>[0-9]+)[a-z0-9]{4})' + _VALID_URL = r'https?://(?:www\.)?(?:smotri\.com/video/view/\?id=|pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=)(?P<id>v(?P<realvideoid>[0-9]+)[a-z0-9]{4})' _NETRC_MACHINE = 'smotri' _TESTS = [ # real video id 2610366 { 'url': 'http://smotri.com/video/view/?id=v261036632ab', - 'md5': '2a7b08249e6f5636557579c368040eb9', + 'md5': '02c0dfab2102984e9c5bb585cc7cc321', 'info_dict': { 'id': 'v261036632ab', 'ext': 'mp4', @@ -80,7 +81,7 @@ 'uploader': 'psavari1', 'uploader_id': 'psavari1', 'upload_date': '20081103', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, 'params': { 'videopassword': '223322', @@ -116,7 +117,7 @@ 'uploader': 'вАся', 'uploader_id': 'asya_prosto', 'upload_date': '20081218', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'age_limit': 18, }, 'params': { @@ -174,11 +175,11 @@ if video_password: video_form['pass'] = hashlib.md5(video_password.encode('utf-8')).hexdigest() - request = sanitized_Request( - 'http://smotri.com/video/view/url/bot/', compat_urllib_parse.urlencode(video_form)) - request.add_header('Content-Type', 'application/x-www-form-urlencoded') - - video = self._download_json(request, 
video_id, 'Downloading video JSON') + video = self._download_json( + 'http://smotri.com/video/view/url/bot/', + video_id, 'Downloading video JSON', + data=urlencode_postdata(video_form), + headers={'Content-Type': 'application/x-www-form-urlencoded'}) video_url = video.get('_vidURL') or video.get('_vidURL_mp4') @@ -196,11 +197,11 @@ raise ExtractorError(msg, expected=True) title = video['title'] - thumbnail = video['_imgURL'] - upload_date = unified_strdate(video['added']) - uploader = video['userNick'] - uploader_id = video['userLogin'] - duration = int_or_none(video['duration']) + thumbnail = video.get('_imgURL') + upload_date = unified_strdate(video.get('added')) + uploader = video.get('userNick') + uploader_id = video.get('userLogin') + duration = int_or_none(video.get('duration')) # Video JSON does not provide enough meta data # We will extract some from the video web page instead @@ -209,7 +210,7 @@ # Warning if video is unavailable warning = self._html_search_regex( - r'<div class="videoUnModer">(.*?)</div>', webpage, + r'<div[^>]+class="videoUnModer"[^>]*>(.+?)</div>', webpage, 'warning message', default=None) if warning is not None: self._downloader.report_warning( @@ -217,20 +218,22 @@ (video_id, warning)) # Adult content - if re.search('EroConfirmText">', webpage) is not None: + if 'EroConfirmText">' in webpage: self.report_age_confirmation() confirm_string = self._html_search_regex( - r'<a href="/video/view/\?id=%s&confirm=([^"]+)" title="[^"]+">' % video_id, + r'<a[^>]+href="/video/view/\?id=%s&confirm=([^"]+)"' % video_id, webpage, 'confirm string') confirm_url = webpage_url + '&confirm=%s' % confirm_string - webpage = self._download_webpage(confirm_url, video_id, 'Downloading video page (age confirmed)') + webpage = self._download_webpage( + confirm_url, video_id, + 'Downloading video page (age confirmed)') adult_content = True else: adult_content = False view_count = self._html_search_regex( - 'Общее количество просмотров.*?<span class="Number">(\\d+)</span>', - webpage, 'view count', fatal=False, flags=re.MULTILINE | re.DOTALL) + r'(?s)Общее количество просмотров.*?<span class="Number">(\d+)</span>', + webpage, 'view count', fatal=False) return { 'id': video_id, @@ -249,37 +252,33 @@ class SmotriCommunityIE(InfoExtractor): IE_DESC = 'Smotri.com community videos' IE_NAME = 'smotri:community' - _VALID_URL = r'^https?://(?:www\.)?smotri\.com/community/video/(?P<communityid>[0-9A-Za-z_\'-]+)' + _VALID_URL = r'https?://(?:www\.)?smotri\.com/community/video/(?P<id>[0-9A-Za-z_\'-]+)' _TEST = { 'url': 'http://smotri.com/community/video/kommuna', 'info_dict': { 'id': 'kommuna', - 'title': 'КПРФ', }, 'playlist_mincount': 4, } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - community_id = mobj.group('communityid') + community_id = self._match_id(url) - url = 'http://smotri.com/export/rss/video/by/community/-/%s/video.xml' % community_id - rss = self._download_xml(url, community_id, 'Downloading community RSS') - - entries = [self.url_result(video_url.text, 'Smotri') - for video_url in rss.findall('./channel/item/link')] + rss = self._download_xml( + 'http://smotri.com/export/rss/video/by/community/-/%s/video.xml' % community_id, + community_id, 'Downloading community RSS') - description_text = rss.find('./channel/description').text - community_title = self._html_search_regex( - '^Видео сообщества "([^"]+)"$', description_text, 'community title') + entries = [ + self.url_result(video_url.text, SmotriIE.ie_key()) + for video_url in 
rss.findall('./channel/item/link')] - return self.playlist_result(entries, community_id, community_title) + return self.playlist_result(entries, community_id) class SmotriUserIE(InfoExtractor): IE_DESC = 'Smotri.com user videos' IE_NAME = 'smotri:user' - _VALID_URL = r'^https?://(?:www\.)?smotri\.com/user/(?P<userid>[0-9A-Za-z_\'-]+)' + _VALID_URL = r'https?://(?:www\.)?smotri\.com/user/(?P<id>[0-9A-Za-z_\'-]+)' _TESTS = [{ 'url': 'http://smotri.com/user/inspector', 'info_dict': { @@ -290,19 +289,19 @@ }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - user_id = mobj.group('userid') + user_id = self._match_id(url) - url = 'http://smotri.com/export/rss/user/video/-/%s/video.xml' % user_id - rss = self._download_xml(url, user_id, 'Downloading user RSS') + rss = self._download_xml( + 'http://smotri.com/export/rss/user/video/-/%s/video.xml' % user_id, + user_id, 'Downloading user RSS') entries = [self.url_result(video_url.text, 'Smotri') for video_url in rss.findall('./channel/item/link')] - description_text = rss.find('./channel/description').text - user_nickname = self._html_search_regex( - '^Видео режиссера (.*)$', description_text, - 'user nickname') + description_text = xpath_text(rss, './channel/description') or '' + user_nickname = self._search_regex( + '^Видео режиссера (.+)$', description_text, + 'user nickname', fatal=False) return self.playlist_result(entries, user_id, user_nickname) @@ -310,11 +309,12 @@ class SmotriBroadcastIE(InfoExtractor): IE_DESC = 'Smotri.com broadcasts' IE_NAME = 'smotri:broadcast' - _VALID_URL = r'^https?://(?:www\.)?(?P<url>smotri\.com/live/(?P<broadcastid>[^/]+))/?.*' + _VALID_URL = r'https?://(?:www\.)?(?P<url>smotri\.com/live/(?P<id>[^/]+))/?.*' + _NETRC_MACHINE = 'smotri' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - broadcast_id = mobj.group('broadcastid') + broadcast_id = mobj.group('id') broadcast_url = 'http://' + mobj.group('url') broadcast_page = self._download_webpage(broadcast_url, broadcast_id, 'Downloading broadcast page') @@ -328,7 +328,8 @@ (username, password) = self._get_login_info() if username is None: - self.raise_login_required('Erotic broadcasts allowed only for registered users') + self.raise_login_required( + 'Erotic broadcasts allowed only for registered users') login_form = { 'login-hint53': '1', @@ -338,30 +339,32 @@ } request = sanitized_Request( - broadcast_url + '/?no_redirect=1', compat_urllib_parse.urlencode(login_form)) + broadcast_url + '/?no_redirect=1', urlencode_postdata(login_form)) request.add_header('Content-Type', 'application/x-www-form-urlencoded') broadcast_page = self._download_webpage( request, broadcast_id, 'Logging in and confirming age') - if re.search('>Неверный логин или пароль<', broadcast_page) is not None: - raise ExtractorError('Unable to log in: bad username or password', expected=True) + if '>Неверный логин или пароль<' in broadcast_page: + raise ExtractorError( + 'Unable to log in: bad username or password', expected=True) adult_content = True else: adult_content = False ticket = self._html_search_regex( - r"window\.broadcast_control\.addFlashVar\('file'\s*,\s*'([^']+)'\)", - broadcast_page, 'broadcast ticket') + (r'data-user-file=(["\'])(?P<ticket>(?!\1).+)\1', + r"window\.broadcast_control\.addFlashVar\('file'\s*,\s*'(?P<ticket>[^']+)'\)"), + broadcast_page, 'broadcast ticket', group='ticket') - url = 'http://smotri.com/broadcast/view/url/?ticket=%s' % ticket + broadcast_url = 'http://smotri.com/broadcast/view/url/?ticket=%s' % ticket 
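The broadcast hunk continues below with smotri.com's video-password scheme: the password supplied via --video-password is never sent as plain text; only its MD5 hex digest is appended to the ticket URL as a pass parameter. A minimal standalone sketch of that flow (add_broadcast_password is a hypothetical helper name, not part of the patch):

import hashlib

def add_broadcast_password(broadcast_url, password):
    # Hypothetical helper mirroring the pattern in the surrounding hunk:
    # smotri.com expects the MD5 hex digest of the broadcast password
    # as a "pass" query parameter on the ticket URL.
    if password:
        broadcast_url += '&pass=%s' % hashlib.md5(
            password.encode('utf-8')).hexdigest()
    return broadcast_url

print(add_broadcast_password(
    'http://smotri.com/broadcast/view/url/?ticket=abc', '223322'))
# -> http://smotri.com/broadcast/view/url/?ticket=abc&pass=<md5 hexdigest>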
broadcast_password = self._downloader.params.get('videopassword') if broadcast_password: - url += '&pass=%s' % hashlib.md5(broadcast_password.encode('utf-8')).hexdigest() + broadcast_url += '&pass=%s' % hashlib.md5(broadcast_password.encode('utf-8')).hexdigest() broadcast_json_page = self._download_webpage( - url, broadcast_id, 'Downloading broadcast JSON') + broadcast_url, broadcast_id, 'Downloading broadcast JSON') try: broadcast_json = json.loads(broadcast_json_page) @@ -383,11 +386,11 @@ broadcast_playpath = broadcast_json['_streamName'] broadcast_app = '%s/%s' % (mobj.group('app'), broadcast_json['_vidURL']) - broadcast_thumbnail = broadcast_json['_imgURL'] + broadcast_thumbnail = broadcast_json.get('_imgURL') broadcast_title = self._live_title(broadcast_json['title']) - broadcast_description = broadcast_json['description'] - broadcaster_nick = broadcast_json['nick'] - broadcaster_login = broadcast_json['login'] + broadcast_description = broadcast_json.get('description') + broadcaster_nick = broadcast_json.get('nick') + broadcaster_login = broadcast_json.get('login') rtmp_conn = 'S:%s' % uuid.uuid4().hex except KeyError: if protected_broadcast: diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/snagfilms.py youtube-dl-2019.05.20/youtube_dl/extractor/snagfilms.py --- youtube-dl-2016.02.22/youtube_dl/extractor/snagfilms.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/snagfilms.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,181 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - clean_html, - determine_ext, - int_or_none, - js_to_json, - parse_duration, -) - - -class SnagFilmsEmbedIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|embed)\.)?snagfilms\.com/embed/player\?.*\bfilmId=(?P<id>[\da-f-]{36})' - _TESTS = [{ - 'url': 'http://embed.snagfilms.com/embed/player?filmId=74849a00-85a9-11e1-9660-123139220831&w=500', - 'md5': '2924e9215c6eff7a55ed35b72276bd93', - 'info_dict': { - 'id': '74849a00-85a9-11e1-9660-123139220831', - 'ext': 'mp4', - 'title': '#whilewewatch', - } - }, { - # invalid labels, 360p is better that 480p - 'url': 'http://www.snagfilms.com/embed/player?filmId=17ca0950-a74a-11e0-a92a-0026bb61d036', - 'md5': '882fca19b9eb27ef865efeeaed376a48', - 'info_dict': { - 'id': '17ca0950-a74a-11e0-a92a-0026bb61d036', - 'ext': 'mp4', - 'title': 'Life in Limbo', - } - }, { - 'url': 'http://www.snagfilms.com/embed/player?filmId=0000014c-de2f-d5d6-abcf-ffef58af0017', - 'only_matching': True, - }] - - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:embed\.)?snagfilms\.com/embed/player.+?)\1', - webpage) - if mobj: - return mobj.group('url') - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - if '>This film is not playable in your area.<' in webpage: - raise ExtractorError( - 'Film %s is not playable in your area.' 
% video_id, expected=True) - - formats = [] - for source in self._parse_json(js_to_json(self._search_regex( - r'(?s)sources:\s*(\[.+?\]),', webpage, 'json')), video_id): - file_ = source.get('file') - if not file_: - continue - type_ = source.get('type') - ext = determine_ext(file_) - format_id = source.get('label') or ext - if all(v == 'm3u8' for v in (type_, ext)): - formats.extend(self._extract_m3u8_formats( - file_, video_id, 'mp4', m3u8_id='hls')) - else: - bitrate = int_or_none(self._search_regex( - [r'(\d+)kbps', r'_\d{1,2}x\d{1,2}_(\d{3,})\.%s' % ext], - file_, 'bitrate', default=None)) - height = int_or_none(self._search_regex( - r'^(\d+)[pP]$', format_id, 'height', default=None)) - formats.append({ - 'url': file_, - 'format_id': format_id, - 'tbr': bitrate, - 'height': height, - }) - self._sort_formats(formats) - - title = self._search_regex( - [r"title\s*:\s*'([^']+)'", r'<title>([^<]+)'], - webpage, 'title') - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - } - - -class SnagFilmsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?snagfilms\.com/(?:films/title|show)/(?P<id>[^?#]+)' - _TESTS = [{ - 'url': 'http://www.snagfilms.com/films/title/lost_for_life', - 'md5': '19844f897b35af219773fd63bdec2942', - 'info_dict': { - 'id': '0000014c-de2f-d5d6-abcf-ffef58af0017', - 'display_id': 'lost_for_life', - 'ext': 'mp4', - 'title': 'Lost for Life', - 'description': 'md5:fbdacc8bb6b455e464aaf98bc02e1c82', - 'thumbnail': 're:^https?://.*\.jpg', - 'duration': 4489, - 'categories': ['Documentary', 'Crime', 'Award Winning', 'Festivals'] - } - }, { - 'url': 'http://www.snagfilms.com/show/the_world_cut_project/india', - 'md5': 'e6292e5b837642bbda82d7f8bf3fbdfd', - 'info_dict': { - 'id': '00000145-d75c-d96e-a9c7-ff5c67b20000', - 'display_id': 'the_world_cut_project/india', - 'ext': 'mp4', - 'title': 'India', - 'description': 'md5:5c168c5a8f4719c146aad2e0dfac6f5f', - 'thumbnail': 're:^https?://.*\.jpg', - 'duration': 979, - 'categories': ['Documentary', 'Sports', 'Politics'] - } - }, { - # Film is not playable in your area. - 'url': 'http://www.snagfilms.com/films/title/inside_mecca', - 'only_matching': True, - }, { - # Film is not available. - 'url': 'http://www.snagfilms.com/show/augie_alone/flirting', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - if ">Sorry, the Film you're looking for is not available.<" in webpage: - raise ExtractorError( - 'Film %s is not available.' % display_id, expected=True) - - film_id = self._search_regex(r'filmId=([\da-f-]{36})"', webpage, 'film id') - - snag = self._parse_json( - self._search_regex( - 'Snag\.page\.data\s*=\s*(\[.+?\]);', webpage, 'snag'), - display_id) - - for item in snag: - if item.get('data', {}).get('film', {}).get('id') == film_id: - data = item['data']['film'] - title = data['title'] - description = clean_html(data.get('synopsis')) - thumbnail = data.get('image') - duration = int_or_none(data.get('duration') or data.get('runtime')) - categories = [ - category['title'] for category in data.get('categories', []) - if category.get('title')] - break - else: - title = self._search_regex( - r'itemprop="title">([^<]+)<', webpage, 'title') - description = self._html_search_regex( - r'(?s)<div itemprop="description" class="film-synopsis-inner ">(.+?)</div>', - webpage, 'description', default=None) or self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) - duration = parse_duration(self._search_regex( - r'<span itemprop="duration" class="film-duration strong">([^<]+)<', - webpage, 'duration', fatal=False)) - categories = re.findall(r'<a href="/movies/[^"]+">([^<]+)</a>', webpage) - - return { - '_type': 'url_transparent', - 'url': 'http://embed.snagfilms.com/embed/player?filmId=%s' % film_id, - 'id': film_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'categories': categories, - }
diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/snotr.py youtube-dl-2019.05.20/youtube_dl/extractor/snotr.py --- youtube-dl-2016.02.22/youtube_dl/extractor/snotr.py 2016-02-14 22:26:17.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/snotr.py 2019-06-07 18:49:07.000000000 +0000 @@ -5,9 +5,9 @@ from .common import InfoExtractor from ..utils import ( - float_or_none, - str_to_int, parse_duration, + parse_filesize, + str_to_int, ) @@ -17,21 +17,24 @@ 'url': 'http://www.snotr.com/video/13708/Drone_flying_through_fireworks', 'info_dict': { 'id': '13708', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Drone flying through fireworks!', - 'duration': 247, - 'filesize_approx': 98566144, + 'duration': 248, + 'filesize_approx': 40700000, 'description': 'A drone flying through Fourth of July Fireworks', - } + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'expected_warnings': ['description'], }, { 'url': 'http://www.snotr.com/video/530/David_Letteman_-_George_W_Bush_Top_10', 'info_dict': { 'id': '530', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'David Letteman - George W. Bush Top 10', 'duration': 126, - 'filesize_approx': 8912896, + 'filesize_approx': 8500000, 'description': 'The top 10 George W. Bush moments, brought to you by David Letterman!', + 'thumbnail': r're:^https?://.*\.jpg$', } }] @@ -43,26 +46,28 @@ title = self._og_search_title(webpage) description = self._og_search_description(webpage) - video_url = 'http://cdn.videos.snotr.com/%s.flv' % video_id + info_dict = self._parse_html5_media_entries( + url, webpage, video_id, m3u8_entry_protocol='m3u8_native')[0] view_count = str_to_int(self._html_search_regex( - r'<p>\n<strong>Views:</strong>\n([\d,\.]+)</p>', + r'<p[^>]*>\s*<strong[^>]*>Views:</strong>\s*<span[^>]*>([\d,\.]+)', webpage, 'view count', fatal=False)) duration = parse_duration(self._html_search_regex( - r'<p>\n<strong>Length:</strong>\n\s*([0-9:]+).*?</p>', + r'<p[^>]*>\s*<strong[^>]*>Length:</strong>\s*<span[^>]*>([\d:]+)', webpage, 'duration', fatal=False)) - filesize_approx = float_or_none(self._html_search_regex( - r'<p>\n<strong>Filesize:</strong>\n\s*([0-9.]+)\s*megabyte</p>', - webpage, 'filesize', fatal=False), invscale=1024 * 1024) + filesize_approx = parse_filesize(self._html_search_regex( + r'<p[^>]*>\s*<strong[^>]*>Filesize:</strong>\s*<span[^>]*>([^<]+)', + webpage, 'filesize', fatal=False)) - return { + info_dict.update({ 'id': video_id, 'description': description, 'title': title, - 'url': video_url, 'view_count': view_count, 'duration': duration, 'filesize_approx': filesize_approx, - } + }) + + return info_dict
diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/sohu.py youtube-dl-2019.05.20/youtube_dl/extractor/sohu.py --- youtube-dl-2016.02.22/youtube_dl/extractor/sohu.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/sohu.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re @@ -6,21 +6,22 @@ from .common import InfoExtractor from ..compat import ( compat_str, - compat_urllib_parse, + compat_urllib_parse_urlencode, ) from ..utils import ( ExtractorError, - sanitized_Request, + int_or_none, + try_get, ) class SohuIE(InfoExtractor): _VALID_URL = r'https?://(?P<mytv>my\.)?tv\.sohu\.com/.+?/(?(mytv)|n)(?P<id>\d+)\.shtml.*?' + # Sohu videos give different MD5 sums on Travis CI and my machine _TESTS = [{ 'note': 'This video is available only in Mainland China', 'url': 'http://tv.sohu.com/20130724/n382479172.shtml#super', - 'md5': '29175c8cadd8b5cc4055001e85d6b372', 'info_dict': { 'id': '382479172', 'ext': 'mp4', @@ -29,7 +30,6 @@ 'skip': 'On available in China', }, { 'url': 'http://tv.sohu.com/20150305/n409385080.shtml', - 'md5': '699060e75cf58858dd47fb9c03c42cfb', 'info_dict': { 'id': '409385080', 'ext': 'mp4', } }, { 'url': 'http://my.tv.sohu.com/us/232799889/78693464.shtml', - 'md5': '9bf34be48f2f4dadcb226c74127e203c', 'info_dict': { 'id': '78693464', 'ext': 'mp4', @@ -51,7 +50,6 @@ 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', }, 'playlist': [{ - 'md5': 'bdbfb8f39924725e6589c146bc1883ad', 'info_dict': { 'id': '78910339_part1', 'ext': 'mp4', @@ -59,7 +57,6 @@ 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', } }, { - 'md5': '3e1f46aaeb95354fd10e7fca9fc1804e', 'info_dict': { 'id': '78910339_part2', 'ext': 'mp4', @@ -67,7 +64,6 @@ 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', } }, { - 'md5': '8407e634175fdac706766481b9443450', 'info_dict': { 'id': '78910339_part3', 'ext': 'mp4', @@ -96,15 +92,10 @@ else: base_data_url = 'http://hot.vrs.sohu.com/vrs_flash.action?vid=' - req = sanitized_Request(base_data_url + vid_id) - - cn_verification_proxy = self._downloader.params.get('cn_verification_proxy') - if cn_verification_proxy: - req.add_header('Ytdl-request-proxy', cn_verification_proxy) - return self._download_json( - req, video_id, - 'Downloading JSON data for %s' % vid_id) + return self._download_json( + base_data_url + vid_id, video_id, + 'Downloading JSON data for %s' % vid_id, + headers=self.geo_verification_headers()) mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') @@ -121,12 +112,11 @@ if vid_data['play'] != 1: if vid_data.get('status') == 12: raise ExtractorError( - 'Sohu said: There\'s something wrong in the video.', + '%s said: There\'s something wrong in the video.' % self.IE_NAME, expected=True) else: - raise ExtractorError( - 'Sohu said: The video is only licensed to users in Mainland China.', - expected=True) + self.raise_geo_restricted( + '%s said: The video is only licensed to users in Mainland China.' % self.IE_NAME)
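The Sohu format hunk just below swaps direct dictionary indexing (data['width'], data['clipsBytes'][i]) for try_get and int_or_none, so a partially filled API response degrades to None metadata fields instead of raising KeyError or IndexError. A small illustration, assuming youtube_dl is importable (the data dict is made up for the example):

from youtube_dl.utils import int_or_none, try_get

data = {'clipsBytes': [123456], 'width': '1280'}  # made-up API fragment

# try_get swallows the KeyError/IndexError/TypeError a missing field would
# raise; int_or_none coerces numeric strings and maps bad values to None.
print(int_or_none(try_get(data, lambda x: x['clipsBytes'][0])))  # 123456
print(int_or_none(data.get('width')))  # 1280
print(int_or_none(data.get('fps')))    # None: key absent, but no exception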
formats_json = {} for format_id in ('nor', 'high', 'super', 'ori', 'h2644k', 'h2654k'): @@ -170,7 +160,7 @@ if retries > 0: download_note += ' (retry #%d)' % retries part_info = self._parse_json(self._download_webpage( - 'http://%s/?%s' % (allot, compat_urllib_parse.urlencode(params)), + 'http://%s/?%s' % (allot, compat_urllib_parse_urlencode(params)), video_id, download_note), video_id) video_url = part_info['url'] @@ -183,10 +173,11 @@ formats.append({ 'url': video_url, 'format_id': format_id, - 'filesize': data['clipsBytes'][i], - 'width': data['width'], - 'height': data['height'], - 'fps': data['fps'], + 'filesize': int_or_none( + try_get(data, lambda x: x['clipsBytes'][i])), + 'width': int_or_none(data.get('width')), + 'height': int_or_none(data.get('height')), + 'fps': int_or_none(data.get('fps')), }) self._sort_formats(formats)
diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/sonyliv.py youtube-dl-2019.05.20/youtube_dl/extractor/sonyliv.py --- youtube-dl-2016.02.22/youtube_dl/extractor/sonyliv.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/sonyliv.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,40 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import smuggle_url + + +class SonyLIVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?sonyliv\.com/details/[^/]+/(?P<id>\d+)' + _TESTS = [{ + 'url': "http://www.sonyliv.com/details/episodes/5024612095001/Ep.-1---Achaari-Cheese-Toast---Bachelor's-Delight", + 'info_dict': { + 'title': "Ep. 1 - Achaari Cheese Toast - Bachelor's Delight", + 'id': 'ref:5024612095001', + 'ext': 'mp4', + 'upload_date': '20170923', + 'description': 'md5:7f28509a148d5be9d0782b4d5106410d', + 'uploader_id': '5182475815001', + 'timestamp': 1506200547, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['BrightcoveNew'], + }, { + 'url': 'http://www.sonyliv.com/details/full%20movie/4951168986001/Sei-Raat-(Bangla)', + 'only_matching': True, + }] + + # BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/4338955589001/default_default/index.html?videoId=%s' + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5182475815001/default_default/index.html?videoId=ref:%s' + + def _real_extract(self, url): + brightcove_id = self._match_id(url) + return self.url_result( + smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, { + 'geo_countries': ['IN'], + 'referrer': url, + }), + 'BrightcoveNew', brightcove_id)
diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/soundcloud.py youtube-dl-2019.05.20/youtube_dl/extractor/soundcloud.py --- youtube-dl-2016.02.22/youtube_dl/extractor/soundcloud.py 2016-02-14 22:26:17.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/soundcloud.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,8 +1,8 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals -import re import itertools +import re from .common import ( InfoExtractor, @@ -11,13 +11,20 @@ from ..compat import ( compat_str, compat_urlparse, - compat_urllib_parse, + compat_urllib_parse_urlencode, ) from ..utils import ( - encode_dict, ExtractorError, + float_or_none, int_or_none, - unified_strdate, + KNOWN_EXTENSIONS, + merge_dicts, + mimetype2ext, + str_or_none, + try_get, + unified_timestamp, + update_url_query, + url_or_none, )
(?:(?:(?:www\.|m\.)?soundcloud\.com/ + (?!stations/track) (?P[\w\d-]+)/ - (?!(?:tracks|sets(?:/[^/?#]+)?|reposts|likes|spotlight)/?(?:$|[?#])) + (?!(?:tracks|albums|sets(?:/.+?)?|reposts|likes|spotlight)/?(?:$|[?#])) (?P[\w\d-]+)/? (?P<token>[^?]+?)?(?:[?].*)?$) |(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+) @@ -49,11 +57,17 @@ 'info_dict': { 'id': '62986583', 'ext': 'mp3', - 'upload_date': '20121011', + 'title': 'Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1', 'description': 'No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o\'d', 'uploader': 'E.T. ExTerrestrial Music', - 'title': 'Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1', - 'duration': 143, + 'timestamp': 1349920598, + 'upload_date': '20121011', + 'duration': 143.216, + 'license': 'all-rights-reserved', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, } }, # not streamable song @@ -65,8 +79,14 @@ 'title': 'Goldrushed', 'description': 'From Stockholm Sweden\r\nPovel / Magnus / Filip / David\r\nwww.theroyalconcept.com', 'uploader': 'The Royal Concept', + 'timestamp': 1337635207, 'upload_date': '20120521', - 'duration': 227, + 'duration': 30, + 'license': 'all-rights-reserved', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, }, 'params': { # rtmp @@ -81,10 +101,16 @@ 'id': '123998367', 'ext': 'mp3', 'title': 'Youtube - Dl Test Video \'\' Ä↭', - 'uploader': 'jaimeMF', 'description': 'test chars: \"\'/\\ä↭', + 'uploader': 'jaimeMF', + 'timestamp': 1386604920, 'upload_date': '20131209', - 'duration': 9, + 'duration': 9.927, + 'license': 'all-rights-reserved', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, }, }, # private link (alt format) @@ -95,10 +121,16 @@ 'id': '123998367', 'ext': 'mp3', 'title': 'Youtube - Dl Test Video \'\' Ä↭', - 'uploader': 'jaimeMF', 'description': 'test chars: \"\'/\\ä↭', + 'uploader': 'jaimeMF', + 'timestamp': 1386604920, 'upload_date': '20131209', - 'duration': 9, + 'duration': 9.927, + 'license': 'all-rights-reserved', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, }, }, # downloadable song @@ -111,49 +143,141 @@ 'title': 'Bus Brakes', 'description': 'md5:0053ca6396e8d2fd7b7e1595ef12ab66', 'uploader': 'oddsamples', + 'timestamp': 1389232924, 'upload_date': '20140109', - 'duration': 17, + 'duration': 17.346, + 'license': 'cc-by-sa', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + }, + }, + # private link, downloadable format + { + 'url': 'https://soundcloud.com/oriuplift/uponly-238-no-talking-wav/s-AyZUd', + 'md5': '64a60b16e617d41d0bef032b7f55441e', + 'info_dict': { + 'id': '340344461', + 'ext': 'wav', + 'title': 'Uplifting Only 238 [No Talking] (incl. Alex Feed Guestmix) (Aug 31, 2017) [wav]', + 'description': 'md5:fa20ee0fca76a3d6df8c7e57f3715366', + 'uploader': 'Ori Uplift Music', + 'timestamp': 1504206263, + 'upload_date': '20170831', + 'duration': 7449.096, + 'license': 'all-rights-reserved', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + }, + }, + # no album art, use avatar pic for thumbnail + { + 'url': 'https://soundcloud.com/garyvee/sideways-prod-mad-real', + 'md5': '59c7872bc44e5d99b7211891664760c2', + 'info_dict': { + 'id': '309699954', + 'ext': 'mp3', + 'title': 'Sideways (Prod. 
Mad Real)', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'uploader': 'garyvee', + 'timestamp': 1488152409, + 'upload_date': '20170226', + 'duration': 207.012, + 'thumbnail': r're:https?://.*\.jpg', + 'license': 'all-rights-reserved', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + }, + 'params': { + 'skip_download': True, }, }, + # not avaialble via api.soundcloud.com/i1/tracks/id/streams + { + 'url': 'https://soundcloud.com/giovannisarani/mezzo-valzer', + 'md5': 'e22aecd2bc88e0e4e432d7dcc0a1abf7', + 'info_dict': { + 'id': '583011102', + 'ext': 'mp3', + 'title': 'Mezzo Valzer', + 'description': 'md5:4138d582f81866a530317bae316e8b61', + 'uploader': 'Giovanni Sarani', + 'timestamp': 1551394171, + 'upload_date': '20190228', + 'duration': 180.157, + 'thumbnail': r're:https?://.*\.jpg', + 'license': 'all-rights-reserved', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + }, + 'expected_warnings': ['Unable to download JSON metadata'], + } ] - _CLIENT_ID = '02gUJC0hH2ct1EGOcYXQIzRFU91c72Ea' - _IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf' + _CLIENT_ID = 'FweeGBOOEOYJWLJN3oEyToGLKhmSz0I7' - def report_resolve(self, video_id): - """Report information extraction.""" - self.to_screen('%s: Resolving id' % video_id) + @staticmethod + def _extract_urls(webpage): + return [m.group('url') for m in re.finditer( + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:w\.)?soundcloud\.com/player.+?)\1', + webpage)] @classmethod def _resolv_url(cls, url): - return 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID + return 'https://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID def _extract_info_dict(self, info, full_title=None, quiet=False, secret_token=None): track_id = compat_str(info['id']) + title = info['title'] name = full_title or track_id if quiet: self.report_extraction(name) - - thumbnail = info['artwork_url'] - if thumbnail is not None: + thumbnail = info.get('artwork_url') or info.get('user', {}).get('avatar_url') + if isinstance(thumbnail, compat_str): thumbnail = thumbnail.replace('-large', '-t500x500') - ext = 'mp3' + username = try_get(info, lambda x: x['user']['username'], compat_str) + + def extract_count(key): + return int_or_none(info.get('%s_count' % key)) + + like_count = extract_count('favoritings') + if like_count is None: + like_count = extract_count('likes') + result = { 'id': track_id, - 'uploader': info['user']['username'], - 'upload_date': unified_strdate(info['created_at']), - 'title': info['title'], - 'description': info['description'], + 'uploader': username, + 'timestamp': unified_timestamp(info.get('created_at')), + 'title': title, + 'description': info.get('description'), 'thumbnail': thumbnail, - 'duration': int_or_none(info.get('duration'), 1000), + 'duration': float_or_none(info.get('duration'), 1000), 'webpage_url': info.get('permalink_url'), + 'license': info.get('license'), + 'view_count': extract_count('playback'), + 'like_count': like_count, + 'comment_count': extract_count('comment'), + 'repost_count': extract_count('reposts'), + 'genre': info.get('genre'), } + + format_urls = set() formats = [] + query = {'client_id': self._CLIENT_ID} + if secret_token is not None: + query['secret_token'] = secret_token if info.get('downloadable', False): # We can build a direct link to the song - format_url = ( - 'https://api.soundcloud.com/tracks/{0}/download?client_id={1}'.format( - track_id, self._CLIENT_ID)) + 
format_url = update_url_query( + 'https://api.soundcloud.com/tracks/%s/download' % track_id, query) + format_urls.add(format_url) formats.append({ 'format_id': 'download', 'ext': info.get('original_format', 'mp3'), @@ -162,49 +286,105 @@ 'preference': 10, }) - # We have to retrieve the url - streams_url = ('http://api.soundcloud.com/i1/tracks/{0}/streams?' - 'client_id={1}&secret_token={2}'.format(track_id, self._IPHONE_CLIENT_ID, secret_token)) + # Old API, does not work for some tracks (e.g. + # https://soundcloud.com/giovannisarani/mezzo-valzer) format_dict = self._download_json( - streams_url, - track_id, 'Downloading track url') + 'https://api.soundcloud.com/i1/tracks/%s/streams' % track_id, + track_id, 'Downloading track url', query=query, fatal=False) + + if format_dict: + for key, stream_url in format_dict.items(): + if stream_url in format_urls: + continue + format_urls.add(stream_url) + ext, abr = 'mp3', None + mobj = re.search(r'_([^_]+)_(\d+)_url', key) + if mobj: + ext, abr = mobj.groups() + abr = int(abr) + if key.startswith('http'): + stream_formats = [{ + 'format_id': key, + 'ext': ext, + 'url': stream_url, + }] + elif key.startswith('rtmp'): + # The url doesn't have an rtmp app, we have to extract the playpath + url, path = stream_url.split('mp3:', 1) + stream_formats = [{ + 'format_id': key, + 'url': url, + 'play_path': 'mp3:' + path, + 'ext': 'flv', + }] + elif key.startswith('hls'): + stream_formats = self._extract_m3u8_formats( + stream_url, track_id, ext, entry_protocol='m3u8_native', + m3u8_id=key, fatal=False) + else: + continue + + if abr: + for f in stream_formats: + f['abr'] = abr + + formats.extend(stream_formats) + + # New API + transcodings = try_get( + info, lambda x: x['media']['transcodings'], list) or [] + for t in transcodings: + if not isinstance(t, dict): + continue + format_url = url_or_none(t.get('url')) + if not format_url: + continue + stream = self._download_json( + update_url_query(format_url, query), track_id, fatal=False) + if not isinstance(stream, dict): + continue + stream_url = url_or_none(stream.get('url')) + if not stream_url: + continue + if stream_url in format_urls: + continue + format_urls.add(stream_url) + protocol = try_get(t, lambda x: x['format']['protocol'], compat_str) + if protocol != 'hls' and '/hls' in format_url: + protocol = 'hls' + ext = None + preset = str_or_none(t.get('preset')) + if preset: + ext = preset.split('_')[0] + if ext not in KNOWN_EXTENSIONS: + mimetype = try_get( + t, lambda x: x['format']['mime_type'], compat_str) + ext = mimetype2ext(mimetype) or 'mp3' + format_id_list = [] + if protocol: + format_id_list.append(protocol) + format_id_list.append(ext) + format_id = '_'.join(format_id_list) + formats.append({ + 'url': stream_url, + 'format_id': format_id, + 'ext': ext, + 'protocol': 'm3u8_native' if protocol == 'hls' else 'http', + }) + + if not formats: + # We fallback to the stream_url in the original info, this + # cannot be always used, sometimes it can give an HTTP 404 error + formats.append({ + 'format_id': 'fallback', + 'url': update_url_query(info['stream_url'], query), + 'ext': 'mp3', + }) + self._check_formats(formats, track_id) - for key, stream_url in format_dict.items(): - if key.startswith('http'): - formats.append({ - 'format_id': key, - 'ext': ext, - 'url': stream_url, - 'vcodec': 'none', - }) - elif key.startswith('rtmp'): - # The url doesn't have an rtmp app, we have to extract the playpath - url, path = stream_url.split('mp3:', 1) - formats.append({ - 'format_id': key, - 'url': url, - 
'play_path': 'mp3:' + path, - 'ext': 'flv', - 'vcodec': 'none', - }) - - if not formats: - # We fallback to the stream_url in the original info, this - # cannot be always used, sometimes it can give an HTTP 404 error - formats.append({ - 'format_id': 'fallback', - 'url': info['stream_url'] + '?client_id=' + self._CLIENT_ID, - 'ext': ext, - 'vcodec': 'none', - }) - - for f in formats: - if f['format_id'].startswith('http'): - f['protocol'] = 'http' - if f['format_id'].startswith('rtmp'): - f['protocol'] = 'rtmp' + for f in formats: + f['vcodec'] = 'none' - self._check_formats(formats, track_id) self._sort_formats(formats) result['formats'] = formats @@ -216,9 +396,10 @@ raise ExtractorError('Invalid URL: %s' % url) track_id = mobj.group('track_id') - token = None + new_info = {} + if track_id is not None: - info_json_url = 'http://api.soundcloud.com/tracks/' + track_id + '.json?client_id=' + self._CLIENT_ID + info_json_url = 'https://api.soundcloud.com/tracks/' + track_id + '.json?client_id=' + self._CLIENT_ID full_title = track_id token = mobj.group('secret_token') if token: @@ -241,16 +422,47 @@ if token: resolve_title += '/%s' % token - self.report_resolve(full_title) - - url = 'http://soundcloud.com/%s' % resolve_title - info_json_url = self._resolv_url(url) - info = self._download_json(info_json_url, full_title, 'Downloading info JSON') + webpage = self._download_webpage(url, full_title, fatal=False) + if webpage: + entries = self._parse_json( + self._search_regex( + r'var\s+c\s*=\s*(\[.+?\])\s*,\s*o\s*=Date\b', webpage, + 'data', default='[]'), full_title, fatal=False) + if entries: + for e in entries: + if not isinstance(e, dict): + continue + if e.get('id') != 67: + continue + data = try_get(e, lambda x: x['data'][0], dict) + if data: + new_info = data + break + info_json_url = self._resolv_url( + 'https://soundcloud.com/%s' % resolve_title) + + # Contains some additional info missing from new_info + info = self._download_json( + info_json_url, full_title, 'Downloading info JSON') + + return self._extract_info_dict( + merge_dicts(info, new_info), full_title, secret_token=token) + + +class SoundcloudPlaylistBaseIE(SoundcloudIE): + @staticmethod + def _extract_id(e): + return compat_str(e['id']) if e.get('id') else None + + def _extract_track_entries(self, tracks): + return [ + self.url_result( + track['permalink_url'], SoundcloudIE.ie_key(), + video_id=self._extract_id(track)) + for track in tracks if track.get('permalink_url')] - return self._extract_info_dict(info, full_title, secret_token=token) - -class SoundcloudSetIE(SoundcloudIE): +class SoundcloudSetIE(SoundcloudPlaylistBaseIE): _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<uploader>[\w\d-]+)/sets/(?P<slug_title>[\w\d-]+)(?:/(?P<token>[^?/]+))?' 
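As a quick sanity check of the set URL pattern above (an illustrative snippet, not part of the patch or its test suite), the named groups capture everything _real_extract recombines into https://soundcloud.com/<uploader>/sets/<slug_title>[/<token>]:

import re

SET_URL = (r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<uploader>[\w\d-]+)'
           r'/sets/(?P<slug_title>[\w\d-]+)(?:/(?P<token>[^?/]+))?')

m = re.match(SET_URL,
             'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep/secret')
assert m.group('uploader') == 'the-concept-band'
assert m.group('slug_title') == 'the-royal-concept-ep'
assert m.group('token') == 'secret'  # None for a public set without a token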
IE_NAME = 'soundcloud:set' _TESTS = [{ @@ -259,7 +471,10 @@ 'id': '2284613', 'title': 'The Royal Concept EP', }, - 'playlist_mincount': 6, + 'playlist_mincount': 5, + }, { + 'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep/token', + 'only_matching': True, }] def _real_extract(self, url): @@ -270,15 +485,13 @@ # extract simple title (uploader + slug of song title) slug_title = mobj.group('slug_title') full_title = '%s/sets/%s' % (uploader, slug_title) - url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title) + url = 'https://soundcloud.com/%s/sets/%s' % (uploader, slug_title) token = mobj.group('token') if token: full_title += '/' + token url += '/' + token - self.report_resolve(full_title) - resolv_url = self._resolv_url(url) info = self._download_json(resolv_url, full_title) @@ -286,7 +499,7 @@ msgs = (compat_str(err['error_message']) for err in info['errors']) raise ExtractorError('unable to download video webpage: %s' % ','.join(msgs)) - entries = [self.url_result(track['permalink_url'], 'Soundcloud') for track in info['tracks']] + entries = self._extract_track_entries(info['tracks']) return { '_type': 'playlist', @@ -296,52 +509,123 @@ } -class SoundcloudUserIE(SoundcloudIE): +class SoundcloudPagedPlaylistBaseIE(SoundcloudPlaylistBaseIE): + _API_V2_BASE = 'https://api-v2.soundcloud.com' + + def _extract_playlist(self, base_url, playlist_id, playlist_title): + COMMON_QUERY = { + 'limit': 50, + 'client_id': self._CLIENT_ID, + 'linked_partitioning': '1', + } + + query = COMMON_QUERY.copy() + query['offset'] = 0 + + next_href = base_url + '?' + compat_urllib_parse_urlencode(query) + + entries = [] + for i in itertools.count(): + response = self._download_json( + next_href, playlist_id, 'Downloading track page %s' % (i + 1)) + + collection = response['collection'] + + if not isinstance(collection, list): + collection = [] + + # Empty collection may be returned, in this case we proceed + # straight to next_href + + def resolve_entry(candidates): + for cand in candidates: + if not isinstance(cand, dict): + continue + permalink_url = url_or_none(cand.get('permalink_url')) + if not permalink_url: + continue + return self.url_result( + permalink_url, + ie=SoundcloudIE.ie_key() if SoundcloudIE.suitable(permalink_url) else None, + video_id=self._extract_id(cand), + video_title=cand.get('title')) + + for e in collection: + entry = resolve_entry((e, e.get('track'), e.get('playlist'))) + if entry: + entries.append(entry) + + next_href = response.get('next_href') + if not next_href: + break + + parsed_next_href = compat_urlparse.urlparse(response['next_href']) + qs = compat_urlparse.parse_qs(parsed_next_href.query) + qs.update(COMMON_QUERY) + next_href = compat_urlparse.urlunparse( + parsed_next_href._replace(query=compat_urllib_parse_urlencode(qs, True))) + + return { + '_type': 'playlist', + 'id': playlist_id, + 'title': playlist_title, + 'entries': entries, + } + + +class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE): _VALID_URL = r'''(?x) https?:// (?:(?:www|m)\.)?soundcloud\.com/ (?P<user>[^/]+) (?:/ - (?P<rsrc>tracks|sets|reposts|likes|spotlight) + (?P<rsrc>tracks|albums|sets|reposts|likes|spotlight) )? 
/?(?:[?#].*)?$ ''' IE_NAME = 'soundcloud:user' _TESTS = [{ - 'url': 'https://soundcloud.com/the-akashic-chronicler', + 'url': 'https://soundcloud.com/soft-cell-official', + 'info_dict': { + 'id': '207965082', + 'title': 'Soft Cell (All)', + }, + 'playlist_mincount': 28, + }, { + 'url': 'https://soundcloud.com/soft-cell-official/tracks', 'info_dict': { - 'id': '114582580', - 'title': 'The Akashic Chronicler (All)', + 'id': '207965082', + 'title': 'Soft Cell (Tracks)', }, - 'playlist_mincount': 111, + 'playlist_mincount': 27, }, { - 'url': 'https://soundcloud.com/the-akashic-chronicler/tracks', + 'url': 'https://soundcloud.com/soft-cell-official/albums', 'info_dict': { - 'id': '114582580', - 'title': 'The Akashic Chronicler (Tracks)', + 'id': '207965082', + 'title': 'Soft Cell (Albums)', }, - 'playlist_mincount': 50, + 'playlist_mincount': 1, }, { - 'url': 'https://soundcloud.com/the-akashic-chronicler/sets', + 'url': 'https://soundcloud.com/jcv246/sets', 'info_dict': { - 'id': '114582580', - 'title': 'The Akashic Chronicler (Playlists)', + 'id': '12982173', + 'title': 'Jordi / cv (Playlists)', }, - 'playlist_mincount': 3, + 'playlist_mincount': 2, }, { - 'url': 'https://soundcloud.com/the-akashic-chronicler/reposts', + 'url': 'https://soundcloud.com/jcv246/reposts', 'info_dict': { - 'id': '114582580', - 'title': 'The Akashic Chronicler (Reposts)', + 'id': '12982173', + 'title': 'Jordi / cv (Reposts)', }, - 'playlist_mincount': 7, + 'playlist_mincount': 6, }, { - 'url': 'https://soundcloud.com/the-akashic-chronicler/likes', + 'url': 'https://soundcloud.com/clalberg/likes', 'info_dict': { - 'id': '114582580', - 'title': 'The Akashic Chronicler (Likes)', + 'id': '11817582', + 'title': 'clalberg (Likes)', }, - 'playlist_mincount': 321, + 'playlist_mincount': 5, }, { 'url': 'https://soundcloud.com/grynpyret/spotlight', 'info_dict': { @@ -351,21 +635,20 @@ 'playlist_mincount': 1, }] - _API_BASE = 'https://api.soundcloud.com' - _API_V2_BASE = 'https://api-v2.soundcloud.com' - _BASE_URL_MAP = { - 'all': '%s/profile/soundcloud:users:%%s' % _API_V2_BASE, - 'tracks': '%s/users/%%s/tracks' % _API_BASE, - 'sets': '%s/users/%%s/playlists' % _API_V2_BASE, - 'reposts': '%s/profile/soundcloud:users:%%s/reposts' % _API_V2_BASE, - 'likes': '%s/users/%%s/likes' % _API_V2_BASE, - 'spotlight': '%s/users/%%s/spotlight' % _API_V2_BASE, + 'all': '%s/stream/users/%%s' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, + 'tracks': '%s/users/%%s/tracks' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, + 'albums': '%s/users/%%s/albums' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, + 'sets': '%s/users/%%s/playlists' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, + 'reposts': '%s/stream/users/%%s/reposts' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, + 'likes': '%s/users/%%s/likes' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, + 'spotlight': '%s/users/%%s/spotlight' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, } _TITLE_MAP = { 'all': 'All', 'tracks': 'Tracks', + 'albums': 'Albums', 'sets': 'Playlists', 'reposts': 'Reposts', 'likes': 'Likes', @@ -376,69 +659,49 @@ mobj = re.match(self._VALID_URL, url) uploader = mobj.group('user') - url = 'http://soundcloud.com/%s/' % uploader + url = 'https://soundcloud.com/%s/' % uploader resolv_url = self._resolv_url(url) user = self._download_json( resolv_url, uploader, 'Downloading user info') resource = mobj.group('rsrc') or 'all' - base_url = self._BASE_URL_MAP[resource] % user['id'] - COMMON_QUERY = { - 'limit': 50, - 'client_id': self._CLIENT_ID, - 'linked_partitioning': '1', - } - - 
query = COMMON_QUERY.copy() - query['offset'] = 0 - - next_href = base_url + '?' + compat_urllib_parse.urlencode(query) - - entries = [] - for i in itertools.count(): - response = self._download_json( - next_href, uploader, 'Downloading track page %s' % (i + 1)) + return self._extract_playlist( + self._BASE_URL_MAP[resource] % user['id'], compat_str(user['id']), + '%s (%s)' % (user['username'], self._TITLE_MAP[resource])) - collection = response['collection'] - if not collection: - break - def resolve_permalink_url(candidates): - for cand in candidates: - if isinstance(cand, dict): - permalink_url = cand.get('permalink_url') - if permalink_url and permalink_url.startswith('http'): - return permalink_url +class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE): + _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/stations/track/[^/]+/(?P<id>[^/?#&]+)' + IE_NAME = 'soundcloud:trackstation' + _TESTS = [{ + 'url': 'https://soundcloud.com/stations/track/officialsundial/your-text', + 'info_dict': { + 'id': '286017854', + 'title': 'Track station: your-text', + }, + 'playlist_mincount': 47, + }] - for e in collection: - permalink_url = resolve_permalink_url((e, e.get('track'), e.get('playlist'))) - if permalink_url: - entries.append(self.url_result(permalink_url)) + def _real_extract(self, url): + track_name = self._match_id(url) - next_href = response.get('next_href') - if not next_href: - break + webpage = self._download_webpage(url, track_name) - parsed_next_href = compat_urlparse.urlparse(response['next_href']) - qs = compat_urlparse.parse_qs(parsed_next_href.query) - qs.update(COMMON_QUERY) - next_href = compat_urlparse.urlunparse( - parsed_next_href._replace(query=compat_urllib_parse.urlencode(qs, True))) + track_id = self._search_regex( + r'soundcloud:track-stations:(\d+)', webpage, 'track id') - return { - '_type': 'playlist', - 'id': compat_str(user['id']), - 'title': '%s (%s)' % (user['username'], self._TITLE_MAP[resource]), - 'entries': entries, - } + return self._extract_playlist( + '%s/stations/soundcloud:track-stations:%s/tracks' + % (self._API_V2_BASE, track_id), + track_id, 'Track station: %s' % track_name) -class SoundcloudPlaylistIE(SoundcloudIE): +class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE): _VALID_URL = r'https?://api\.soundcloud\.com/playlists/(?P<id>[0-9]+)(?:/?\?secret_token=(?P<token>[^&]+?))?$' IE_NAME = 'soundcloud:playlist' _TESTS = [{ - 'url': 'http://api.soundcloud.com/playlists/4110309', + 'url': 'https://api.soundcloud.com/playlists/4110309', 'info_dict': { 'id': '4110309', 'title': 'TILT Brass - Bowery Poetry Club, August \'03 [Non-Site SCR 02]', @@ -460,11 +723,11 @@ if token: data_dict['secret_token'] = token - data = compat_urllib_parse.urlencode(data_dict) + data = compat_urllib_parse_urlencode(data_dict) data = self._download_json( base_url + data, playlist_id, 'Downloading playlist') - entries = [self.url_result(track['permalink_url'], 'Soundcloud') for track in data['tracks']] + entries = self._extract_track_entries(data['tracks']) return { '_type': 'playlist', @@ -500,7 +763,7 @@ query['client_id'] = self._CLIENT_ID query['linked_partitioning'] = '1' query['offset'] = 0 - data = compat_urllib_parse.urlencode(encode_dict(query)) + data = compat_urllib_parse_urlencode(query) next_url = '{0}{1}?{2}'.format(self._API_V2_BASE, endpoint, data) collected_results = 0 diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/soundgasm.py youtube-dl-2019.05.20/youtube_dl/extractor/soundgasm.py --- youtube-dl-2016.02.22/youtube_dl/extractor/soundgasm.py 
2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/soundgasm.py 2019-06-07 18:49:07.000000000 +0000 @@ -8,36 +8,49 @@ class SoundgasmIE(InfoExtractor): IE_NAME = 'soundgasm' - _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<user>[0-9a-zA-Z_\-]+)/(?P<title>[0-9a-zA-Z_\-]+)' + _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<user>[0-9a-zA-Z_-]+)/(?P<display_id>[0-9a-zA-Z_-]+)' _TEST = { 'url': 'http://soundgasm.net/u/ytdl/Piano-sample', 'md5': '010082a2c802c5275bb00030743e75ad', 'info_dict': { 'id': '88abd86ea000cafe98f96321b23cc1206cbcbcc9', 'ext': 'm4a', - 'title': 'ytdl_Piano-sample', - 'description': 'Royalty Free Sample Music' + 'title': 'Piano sample', + 'description': 'Royalty Free Sample Music', + 'uploader': 'ytdl', } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('title') - audio_title = mobj.group('user') + '_' + mobj.group('title') + display_id = mobj.group('display_id') + webpage = self._download_webpage(url, display_id) + audio_url = self._html_search_regex( - r'(?s)m4a\:\s"([^"]+)"', webpage, 'audio URL') - audio_id = re.split('\/|\.', audio_url)[-2] + r'(?s)m4a\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'audio URL', group='url') + + title = self._search_regex( + r'<div[^>]+\bclass=["\']jp-title[^>]+>([^<]+)', + webpage, 'title', default=display_id) + description = self._html_search_regex( - r'(?s)<li>Description:\s(.*?)<\/li>', webpage, 'description', - fatal=False) + (r'(?s)<div[^>]+\bclass=["\']jp-description[^>]+>(.+?)</div>', + r'(?s)<li>Description:\s(.*?)<\/li>'), + webpage, 'description', fatal=False) + + audio_id = self._search_regex( + r'/([^/]+)\.m4a', audio_url, 'audio id', default=display_id) return { 'id': audio_id, 'display_id': display_id, 'url': audio_url, - 'title': audio_title, - 'description': description + 'vcodec': 'none', + 'title': title, + 'description': description, + 'uploader': mobj.group('user'), } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/southpark.py youtube-dl-2019.05.20/youtube_dl/extractor/southpark.py --- youtube-dl-2016.02.22/youtube_dl/extractor/southpark.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/southpark.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals from .mtv import MTVServicesInfoExtractor @@ -6,7 +6,7 @@ class SouthParkIE(MTVServicesInfoExtractor): IE_NAME = 'southpark.cc.com' - _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/(?:clips|full-episodes)/(?P<id>.+?)(\?|#|$))' + _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/(?:clips|(?:full-)?episodes|collections)/(?P<id>.+?)(\?|#|$))' _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss' @@ -17,7 +17,12 @@ 'ext': 'mp4', 'title': 'South Park|Bat Daded', 'description': 'Randy disqualifies South Park by getting into a fight with Bat Dad.', + 'timestamp': 1112760000, + 'upload_date': '20050406', }, + }, { + 'url': 'http://southpark.cc.com/collections/7758/fan-favorites/1', + 'only_matching': True, }] @@ -28,13 +33,18 @@ _TESTS = [{ 'url': 'http://southpark.cc.com/episodios-en-espanol/s01e01-cartman-consigue-una-sonda-anal#source=351c1323-0b96-402d-a8b9-40d01b2e9bde&position=1&sort=!airdate', + 'info_dict': { + 'title': 'Cartman Consigue Una Sonda Anal', + 'description': 'Cartman Consigue Una Sonda Anal', + }, 'playlist_count': 4, + 'skip': 'Geo-restricted', }] class SouthParkDeIE(SouthParkIE): IE_NAME = 
'southpark.de' - _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.de/(?:clips|alle-episoden)/(?P<id>.+?)(\?|#|$))' + _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.de/(?:clips|alle-episoden|collections)/(?P<id>.+?)(\?|#|$))' _FEED_URL = 'http://www.southpark.de/feeds/video-player/mrss/' _TESTS = [{ @@ -42,37 +52,64 @@ 'info_dict': { 'id': '85487c96-b3b9-4e39-9127-ad88583d9bf2', 'ext': 'mp4', - 'title': 'The Government Won\'t Respect My Privacy', + 'title': 'South Park|The Government Won\'t Respect My Privacy', 'description': 'Cartman explains the benefits of "Shitter" to Stan, Kyle and Craig.', + 'timestamp': 1380160800, + 'upload_date': '20130926', }, }, { # non-ASCII characters in initial URL 'url': 'http://www.southpark.de/alle-episoden/s18e09-hashtag-aufwärmen', - 'playlist_count': 4, + 'info_dict': { + 'title': 'Hashtag „Aufwärmen“', + 'description': 'Kyle will mit seinem kleinen Bruder Ike Videospiele spielen. Als der nicht mehr mit ihm spielen will, hat Kyle Angst, dass er die Kids von heute nicht mehr versteht.', + }, + 'playlist_count': 3, }, { # non-ASCII characters in redirect URL 'url': 'http://www.southpark.de/alle-episoden/s18e09', - 'playlist_count': 4, + 'info_dict': { + 'title': 'Hashtag „Aufwärmen“', + 'description': 'Kyle will mit seinem kleinen Bruder Ike Videospiele spielen. Als der nicht mehr mit ihm spielen will, hat Kyle Angst, dass er die Kids von heute nicht mehr versteht.', + }, + 'playlist_count': 3, + }, { + 'url': 'http://www.southpark.de/collections/2476/superhero-showdown/1', + 'only_matching': True, }] class SouthParkNlIE(SouthParkIE): IE_NAME = 'southpark.nl' - _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.nl/(?:clips|full-episodes)/(?P<id>.+?)(\?|#|$))' + _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.nl/(?:clips|(?:full-)?episodes|collections)/(?P<id>.+?)(\?|#|$))' _FEED_URL = 'http://www.southpark.nl/feeds/video-player/mrss/' _TESTS = [{ 'url': 'http://www.southpark.nl/full-episodes/s18e06-freemium-isnt-free', - 'playlist_count': 4, + 'info_dict': { + 'title': 'Freemium Isn\'t Free', + 'description': 'Stan is addicted to the new Terrance and Phillip mobile game.', + }, + 'playlist_mincount': 3, }] class SouthParkDkIE(SouthParkIE): IE_NAME = 'southparkstudios.dk' - _VALID_URL = r'https?://(?:www\.)?(?P<url>southparkstudios\.dk/(?:clips|full-episodes)/(?P<id>.+?)(\?|#|$))' + _VALID_URL = r'https?://(?:www\.)?(?P<url>southparkstudios\.(?:dk|nu)/(?:clips|full-episodes|collections)/(?P<id>.+?)(\?|#|$))' _FEED_URL = 'http://www.southparkstudios.dk/feeds/video-player/mrss/' _TESTS = [{ 'url': 'http://www.southparkstudios.dk/full-episodes/s18e07-grounded-vindaloop', - 'playlist_count': 4, + 'info_dict': { + 'title': 'Grounded Vindaloop', + 'description': 'Butters is convinced he\'s living in a virtual reality.', + }, + 'playlist_mincount': 3, + }, { + 'url': 'http://www.southparkstudios.dk/collections/2476/superhero-showdown/1', + 'only_matching': True, + }, { + 'url': 'http://www.southparkstudios.nu/collections/2476/superhero-showdown/1', + 'only_matching': True, }] diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/space.py youtube-dl-2019.05.20/youtube_dl/extractor/space.py --- youtube-dl-2016.02.22/youtube_dl/extractor/space.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/space.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from .brightcove import BrightcoveLegacyIE -from ..utils import 
RegexNotFoundError, ExtractorError - - -class SpaceIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|m)\.)?space\.com/\d+-(?P<title>[^/\.\?]*?)-video\.html' - _TEST = { - 'add_ie': ['BrightcoveLegacy'], - 'url': 'http://www.space.com/23373-huge-martian-landforms-detail-revealed-by-european-probe-video.html', - 'info_dict': { - 'id': '2780937028001', - 'ext': 'mp4', - 'title': 'Huge Martian Landforms\' Detail Revealed By European Probe | Video', - 'description': 'md5:db81cf7f3122f95ed234b631a6ea1e61', - 'uploader': 'TechMedia Networks', - }, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - title = mobj.group('title') - webpage = self._download_webpage(url, title) - try: - # Some videos require the playerKey field, which isn't define in - # the BrightcoveExperience object - brightcove_url = self._og_search_video_url(webpage) - except RegexNotFoundError: - # Other videos works fine with the info from the object - brightcove_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) - if brightcove_url is None: - raise ExtractorError( - 'The webpage does not contain a video', expected=True) - return self.url_result(brightcove_url, BrightcoveLegacyIE.ie_key()) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/spankbang.py youtube-dl-2019.05.20/youtube_dl/extractor/spankbang.py --- youtube-dl-2016.02.22/youtube_dl/extractor/spankbang.py 2016-02-09 11:57:41.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/spankbang.py 2019-06-07 18:49:07.000000000 +0000 @@ -3,10 +3,19 @@ import re from .common import InfoExtractor +from ..utils import ( + ExtractorError, + orderedSet, + parse_duration, + parse_resolution, + str_to_int, + url_or_none, + urlencode_postdata, +) class SpankBangIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|[a-z]{2})\.)?spankbang\.com/(?P<id>[\da-z]+)/video' + _VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P<id>[\da-z]+)/(?:video|play|embed)\b' _TESTS = [{ 'url': 'http://spankbang.com/3vvn/video/fantasy+solo', 'md5': '1cc433e1d6aa14bc376535b8679302f7', @@ -15,7 +24,7 @@ 'ext': 'mp4', 'title': 'fantasy solo', 'description': 'dillion harper masturbates on a bed', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'silly2587', 'age_limit': 18, } @@ -23,34 +32,99 @@ # 480p only 'url': 'http://spankbang.com/1vt0/video/solvane+gangbang', 'only_matching': True, + }, { + # no uploader + 'url': 'http://spankbang.com/lklg/video/sex+with+anyone+wedding+edition+2', + 'only_matching': True, + }, { + # mobile page + 'url': 'http://m.spankbang.com/1o2de/video/can+t+remember+her+name', + 'only_matching': True, + }, { + # 4k + 'url': 'https://spankbang.com/1vwqx/video/jade+kush+solo+4k', + 'only_matching': True, + }, { + 'url': 'https://m.spankbang.com/3vvn/play/fantasy+solo/480p/', + 'only_matching': True, + }, { + 'url': 'https://m.spankbang.com/3vvn/play', + 'only_matching': True, + }, { + 'url': 'https://spankbang.com/2y3td/embed/', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage( + url.replace('/%s/embed' % video_id, '/%s/video' % video_id), + video_id, headers={'Cookie': 'country=US'}) + + if re.search(r'<[^>]+\bid=["\']video_removed', webpage): + raise ExtractorError( + 'Video %s is not available' % video_id, expected=True) + + formats = [] + + def extract_format(format_id, format_url): + f_url = url_or_none(format_url) + if not f_url: + return + f = 
parse_resolution(format_id) + f.update({ + 'url': f_url, + 'format_id': format_id, + }) + formats.append(f) + + STREAM_URL_PREFIX = 'stream_url_' + + for mobj in re.finditer( + r'%s(?P<id>[^\s=]+)\s*=\s*(["\'])(?P<url>(?:(?!\2).)+)\2' + % STREAM_URL_PREFIX, webpage): + extract_format(*mobj.group('id', 'url')) + + if not formats: + stream_key = self._search_regex( + r'data-streamkey\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1', + webpage, 'stream key', group='value') + + sb_csrf_session = self._get_cookies( + 'https://spankbang.com')['sb_csrf_session'].value + + stream = self._download_json( + 'https://spankbang.com/api/videos/stream', video_id, + 'Downloading stream JSON', data=urlencode_postdata({ + 'id': stream_key, + 'data': 0, + 'sb_csrf_session': sb_csrf_session, + }), headers={ + 'Referer': url, + 'X-CSRFToken': sb_csrf_session, + }) + + for format_id, format_url in stream.items(): + if format_id.startswith(STREAM_URL_PREFIX): + extract_format( + format_id[len(STREAM_URL_PREFIX):], format_url) - stream_key = self._html_search_regex( - r'''var\s+stream_key\s*=\s*['"](.+?)['"]''', - webpage, 'stream key') - - formats = [{ - 'url': 'http://spankbang.com/_%s/%s/title/%sp__mp4' % (video_id, stream_key, height), - 'ext': 'mp4', - 'format_id': '%sp' % height, - 'height': int(height), - } for height in re.findall(r'<(?:span|li|p)[^>]+[qb]_(\d+)p', webpage)] - self._check_formats(formats, video_id) self._sort_formats(formats) title = self._html_search_regex( r'(?s)<h1[^>]*>(.+?)</h1>', webpage, 'title') description = self._search_regex( - r'class="desc"[^>]*>([^<]+)', - webpage, 'description', default=None) + r'<div[^>]+\bclass=["\']bottom[^>]+>\s*<p>[^<]*</p>\s*<p>([^<]+)', + webpage, 'description', fatal=False) thumbnail = self._og_search_thumbnail(webpage) uploader = self._search_regex( - r'class="user"[^>]*>([^<]+)', - webpage, 'uploader', fatal=False) + r'class="user"[^>]*><img[^>]+>([^<]+)', + webpage, 'uploader', default=None) + duration = parse_duration(self._search_regex( + r'<div[^>]+\bclass=["\']right_side[^>]+>\s*<span>([^<]+)', + webpage, 'duration', fatal=False)) + view_count = str_to_int(self._search_regex( + r'([\d,.]+)\s+plays', webpage, 'view count', fatal=False)) age_limit = self._rta_search(webpage) @@ -60,6 +134,38 @@ 'description': description, 'thumbnail': thumbnail, 'uploader': uploader, + 'duration': duration, + 'view_count': view_count, 'formats': formats, 'age_limit': age_limit, } + + +class SpankBangPlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P<id>[\da-z]+)/playlist/[^/]+' + _TEST = { + 'url': 'https://spankbang.com/ug0k/playlist/big+ass+titties', + 'info_dict': { + 'id': 'ug0k', + 'title': 'Big Ass Titties', + }, + 'playlist_mincount': 50, + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage( + url, playlist_id, headers={'Cookie': 'country=US; mobile=on'}) + + entries = [self.url_result( + 'https://spankbang.com/%s/video' % video_id, + ie=SpankBangIE.ie_key(), video_id=video_id) + for video_id in orderedSet(re.findall( + r'<a[^>]+\bhref=["\']/?([\da-z]+)/play/', webpage))] + + title = self._html_search_regex( + r'<h1>([^<]+)\s+playlist</h1>', webpage, 'playlist title', + fatal=False) + + return self.playlist_result(entries, playlist_id, title) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/spankwire.py youtube-dl-2019.05.20/youtube_dl/extractor/spankwire.py --- youtube-dl-2016.02.22/youtube_dl/extractor/spankwire.py 2015-12-30 19:30:33.000000000 +0000 +++
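The rewritten SpankBang path above works in two stages: scrape any stream_url_* variables straight from the page, and if none are present, read the stream key from the data-streamkey attribute and POST it to the /api/videos/stream endpoint, with the sb_csrf_session cookie replayed as both a form field and the X-CSRFToken header. A minimal standalone sketch of that fallback follows; the endpoint and field names come from the extractor above, while the stdlib-only session handling is illustrative:

import json
import re
from http.cookiejar import CookieJar
from urllib import parse, request

def fetch_stream_urls(page_url):
    jar = CookieJar()
    opener = request.build_opener(request.HTTPCookieProcessor(jar))
    webpage = opener.open(page_url).read().decode('utf-8')
    # The stream key is embedded in the player markup.
    stream_key = re.search(
        r'data-streamkey\s*=\s*["\']([^"\']+)', webpage).group(1)
    # The CSRF cookie doubles as form field and request header.
    csrf = next(c.value for c in jar if c.name == 'sb_csrf_session')
    data = parse.urlencode({
        'id': stream_key,
        'data': 0,
        'sb_csrf_session': csrf,
    }).encode()
    req = request.Request(
        'https://spankbang.com/api/videos/stream', data=data,
        headers={'Referer': page_url, 'X-CSRFToken': csrf})
    stream = json.loads(opener.open(req).read().decode('utf-8'))
    # stream_url_<format_id> keys map to playable media URLs.
    return {k[len('stream_url_'):]: v for k, v in stream.items()
            if k.startswith('stream_url_')}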
youtube-dl-2019.05.20/youtube_dl/extractor/spankwire.py 2019-06-07 18:49:07.000000000 +0000 @@ -85,7 +85,7 @@ r'playerData\.cdnPath([0-9]{3,})\s*=\s*(?:encodeURIComponent\()?["\']([^"\']+)["\']', webpage) heights = [int(video[0]) for video in videos] video_urls = list(map(compat_urllib_parse_unquote, [video[1] for video in videos])) - if webpage.find('flashvars\.encrypted = "true"') != -1: + if webpage.find(r'flashvars\.encrypted = "true"') != -1: password = self._search_regex( r'flashvars\.video_title = "([^"]+)', webpage, 'password').replace('+', ' ') @@ -96,20 +96,18 @@ formats = [] for height, video_url in zip(heights, video_urls): path = compat_urllib_parse_urlparse(video_url).path - _, quality = path.split('/')[4].split('_')[:2] - f = { + m = re.search(r'/(?P<height>\d+)[pP]_(?P<tbr>\d+)[kK]', path) + if m: + tbr = int(m.group('tbr')) + height = int(m.group('height')) + else: + tbr = None + formats.append({ 'url': video_url, + 'format_id': '%dp' % height, 'height': height, - } - tbr = self._search_regex(r'^(\d+)[Kk]$', quality, 'tbr', default=None) - if tbr: - f.update({ - 'tbr': int(tbr), - 'format_id': '%dp' % height, - }) - else: - f['format_id'] = quality - formats.append(f) + 'tbr': tbr, + }) self._sort_formats(formats) age_limit = self._rta_search(webpage) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/spiegel.py youtube-dl-2019.05.20/youtube_dl/extractor/spiegel.py --- youtube-dl-2016.02.22/youtube_dl/extractor/spiegel.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/spiegel.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,100 +1,97 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urlparse +from .nexx import ( + NexxIE, + NexxEmbedIE, +) from .spiegeltv import SpiegeltvIE +from ..compat import compat_urlparse +from ..utils import ( + parse_duration, + strip_or_none, + unified_timestamp, +) class SpiegelIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<id>[0-9]+)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$' _TESTS = [{ 'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html', - 'md5': '2c2754212136f35fb4b19767d242f66e', + 'md5': 'b57399839d055fccfeb9a0455c439868', 'info_dict': { - 'id': '1259285', + 'id': '563747', 'ext': 'mp4', 'title': 'Vulkanausbruch in Ecuador: Der "Feuerschlund" ist wieder aktiv', 'description': 'md5:8029d8310232196eb235d27575a8b9f4', 'duration': 49, + 'upload_date': '20130311', + 'timestamp': 1362994320, }, }, { 'url': 'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html', - 'md5': 'f2cdf638d7aa47654e251e1aee360af1', + 'md5': '5b6c2f4add9d62912ed5fc78a1faed80', 'info_dict': { - 'id': '1309159', + 'id': '580988', 'ext': 'mp4', 'title': 'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers', 'description': 'md5:c2322b65e58f385a820c10fa03b2d088', 'duration': 983, + 'upload_date': '20131115', + 'timestamp': 1384546642, }, }, { 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-embed.html', - 'md5': 'd8eeca6bfc8f1cd6f490eb1f44695d51', + 'md5': '97b91083a672d72976faa8433430afb9', 'info_dict': { - 'id': '1519126', + 'id': '601883', 'ext': 'mp4', 'description': 'SPIEGEL ONLINE-Nutzer durften den deutschen Astronauten Alexander Gerst über sein Leben auf der ISS-Station befragen. 
Hier kommen seine Antworten auf die besten sechs Fragen.', 'title': 'Fragen an Astronaut Alexander Gerst: "Bekommen Sie die Tageszeiten mit?"', + 'upload_date': '20140904', + 'timestamp': 1409834160, } }, { 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-iframe.html', 'only_matching': True, + }, { + # nexx video + 'url': 'http://www.spiegel.de/video/spiegel-tv-magazin-ueber-guellekrise-in-schleswig-holstein-video-99012776.html', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - webpage, handle = self._download_webpage_handle(url, video_id) + metadata_url = 'http://www.spiegel.de/video/metadata/video-%s.json' % video_id + handle = self._request_webpage(metadata_url, video_id) # 302 to spiegel.tv, like http://www.spiegel.de/video/der-film-zum-wochenende-die-wahrheit-ueber-maenner-video-99003272.html if SpiegeltvIE.suitable(handle.geturl()): return self.url_result(handle.geturl(), 'Spiegeltv') - title = re.sub(r'\s+', ' ', self._html_search_regex( - r'(?s)<(?:h1|div) class="module-title"[^>]*>(.*?)</(?:h1|div)>', - webpage, 'title')) - description = self._html_search_meta('description', webpage, 'description') - - base_url = self._search_regex( - [r'server\s*:\s*(["\'])(?P<url>.+?)\1', r'var\s+server\s*=\s*"(?P<url>[^"]+)\"'], - webpage, 'server URL', group='url') - - xml_url = base_url + video_id + '.xml' - idoc = self._download_xml(xml_url, video_id) - - formats = [] - for n in list(idoc): - if n.tag.startswith('type') and n.tag != 'type6': - format_id = n.tag.rpartition('type')[2] - video_url = base_url + n.find('./filename').text - formats.append({ - 'format_id': format_id, - 'url': video_url, - 'width': int(n.find('./width').text), - 'height': int(n.find('./height').text), - 'abr': int(n.find('./audiobitrate').text), - 'vbr': int(n.find('./videobitrate').text), - 'vcodec': n.find('./codec').text, - 'acodec': 'MP4A', - }) - duration = float(idoc[0].findall('./duration')[0].text) - - self._check_formats(formats, video_id) - self._sort_formats(formats) + video_data = self._parse_json(self._webpage_read_content( + handle, metadata_url, video_id), video_id) + title = video_data['title'] + nexx_id = video_data['nexxOmniaId'] + domain_id = video_data.get('nexxOmniaDomain') or '748' return { + '_type': 'url_transparent', 'id': video_id, + 'url': 'nexx:%s:%s' % (domain_id, nexx_id), 'title': title, - 'description': description, - 'duration': duration, - 'formats': formats, + 'description': strip_or_none(video_data.get('teaser')), + 'duration': parse_duration(video_data.get('duration')), + 'timestamp': unified_timestamp(video_data.get('datum')), + 'ie_key': NexxIE.ie_key(), } class SpiegelArticleIE(InfoExtractor): - _VALID_URL = 'https?://www\.spiegel\.de/(?!video/)[^?#]*?-(?P<id>[0-9]+)\.html' + _VALID_URL = r'https?://(?:www\.)?spiegel\.de/(?!video/)[^?#]*?-(?P<id>[0-9]+)\.html' IE_NAME = 'Spiegel:Article' IE_DESC = 'Articles on spiegel.de' _TESTS = [{ @@ -104,6 +101,7 @@ 'ext': 'mp4', 'title': 'Faszination Badminton: Nennt es bloß nicht Federball', 'description': 're:^Patrick Kämnitz gehört.{100,}', + 'upload_date': '20140825', }, }, { 'url': 'http://www.spiegel.de/wissenschaft/weltall/astronaut-alexander-gerst-antwortet-spiegel-online-lesern-a-989876.html', @@ -111,6 +109,26 @@ }, 'playlist_count': 6, + }, { + # Nexx iFrame embed + 'url': 'http://www.spiegel.de/sptv/spiegeltv/spiegel-tv-ueber-schnellste-katapult-achterbahn-der-welt-taron-a-1137884.html', + 'info_dict': { + 'id': 
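The rewritten SpiegelIE above no longer scrapes the page at all: it fetches a small per-video metadata JSON and delegates playback to the Nexx extractor through a nexx:<domain>:<id> pseudo-URL, returned as url_transparent so the Spiegel-side title, teaser and timestamp still take precedence. A compact sketch of that resolution step, stdlib only with error handling omitted:

import json
from urllib import request

def spiegel_to_nexx_url(video_id):
    metadata_url = ('http://www.spiegel.de/video/metadata/video-%s.json'
                    % video_id)
    video_data = json.loads(
        request.urlopen(metadata_url).read().decode('utf-8'))
    # '748' is the fallback Nexx domain id used by the extractor above.
    domain_id = video_data.get('nexxOmniaDomain') or '748'
    return 'nexx:%s:%s' % (domain_id, video_data['nexxOmniaId'])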
'161464', + 'ext': 'mp4', + 'title': 'Nervenkitzel Achterbahn', + 'alt_title': 'Karussellbauer in Deutschland', + 'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc', + 'release_year': 2005, + 'creator': 'SPIEGEL TV', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 2761, + 'timestamp': 1394021479, + 'upload_date': '20140305', + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, }] def _real_extract(self, url): @@ -133,6 +151,9 @@ entries = [ self.url_result(compat_urlparse.urljoin( self.http_scheme() + '//spiegel.de/', embed_path)) - for embed_path in embeds - ] - return self.playlist_result(entries) + for embed_path in embeds] + if embeds: + return self.playlist_result(entries) + + return self.playlist_from_matches( + NexxEmbedIE._extract_urls(webpage), ie=NexxEmbedIE.ie_key()) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/spiegeltv.py youtube-dl-2019.05.20/youtube_dl/extractor/spiegeltv.py --- youtube-dl-2016.02.22/youtube_dl/extractor/spiegeltv.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/spiegeltv.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,114 +1,17 @@ -# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse_urlparse -from ..utils import ( - determine_ext, - float_or_none, -) +from .nexx import NexxIE class SpiegeltvIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?spiegel\.tv/(?:#/)?filme/(?P<id>[\-a-z0-9]+)' - _TESTS = [{ - 'url': 'http://www.spiegel.tv/filme/flug-mh370/', - 'info_dict': { - 'id': 'flug-mh370', - 'ext': 'm4v', - 'title': 'Flug MH370', - 'description': 'Das Rätsel um die Boeing 777 der Malaysia-Airlines', - 'thumbnail': 're:http://.*\.jpg$', - }, - 'params': { - # m3u8 download - 'skip_download': True, - } - }, { - 'url': 'http://www.spiegel.tv/#/filme/alleskino-die-wahrheit-ueber-maenner/', + _VALID_URL = r'https?://(?:www\.)?spiegel\.tv/videos/(?P<id>\d+)' + _TEST = { + 'url': 'http://www.spiegel.tv/videos/161681-flug-mh370/', 'only_matching': True, - }] + } def _real_extract(self, url): - if '/#/' in url: - url = url.replace('/#/', '/') - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'<h1.*?>(.*?)</h1>', webpage, 'title') - - apihost = 'http://spiegeltv-ivms2-restapi.s3.amazonaws.com' - version_json = self._download_json( - '%s/version.json' % apihost, video_id, - note='Downloading version information') - version_name = version_json['version_name'] - - slug_json = self._download_json( - '%s/%s/restapi/slugs/%s.json' % (apihost, version_name, video_id), - video_id, - note='Downloading object information') - oid = slug_json['object_id'] - - media_json = self._download_json( - '%s/%s/restapi/media/%s.json' % (apihost, version_name, oid), - video_id, note='Downloading media information') - uuid = media_json['uuid'] - is_wide = media_json['is_wide'] - - server_json = self._download_json( - 'http://spiegeltv-prod-static.s3.amazonaws.com/projectConfigs/projectConfig.json', - video_id, note='Downloading server information') - - format = '16x9' if is_wide else '4x3' - - formats = [] - for streamingserver in server_json['streamingserver']: - endpoint = streamingserver.get('endpoint') - if not endpoint: - continue - play_path = 'mp4:%s_spiegeltv_0500_%s.m4v' % (uuid, format) - if endpoint.startswith('rtmp'): - formats.append({ - 'url': endpoint, - 'format_id': 'rtmp', - 'app': compat_urllib_parse_urlparse(endpoint).path[1:], - 'play_path': 
play_path, - 'player_path': 'http://prod-static.spiegel.tv/frontend-076.swf', - 'ext': 'flv', - 'rtmp_live': True, - }) - elif determine_ext(endpoint) == 'm3u8': - formats.append({ - 'url': endpoint.replace('[video]', play_path), - 'ext': 'm4v', - 'format_id': 'hls', # Prefer hls since it allows to workaround georestriction - 'protocol': 'm3u8', - 'preference': 1, - 'http_headers': { - 'Accept-Encoding': 'deflate', # gzip causes trouble on the server side - }, - }) - else: - formats.append({ - 'url': endpoint, - }) - self._check_formats(formats, video_id) - - thumbnails = [] - for image in media_json['images']: - thumbnails.append({ - 'url': image['url'], - 'width': image['width'], - 'height': image['height'], - }) - - description = media_json['subtitle'] - duration = float_or_none(media_json.get('duration_in_ms'), scale=1000) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'thumbnails': thumbnails, - 'formats': formats, - } + return self.url_result( + 'https://api.nexx.cloud/v3/748/videos/byid/%s' + % self._match_id(url), ie=NexxIE.ie_key()) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/spike.py youtube-dl-2019.05.20/youtube_dl/extractor/spike.py --- youtube-dl-2016.02.22/youtube_dl/extractor/spike.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/spike.py 2019-06-07 18:49:07.000000000 +0000 @@ -3,27 +3,55 @@ from .mtv import MTVServicesInfoExtractor -class SpikeIE(MTVServicesInfoExtractor): - _VALID_URL = r'''(?x)https?:// - (?:www\.spike\.com/(?:video-(?:clips|playlists)|(?:full-)?episodes)/.+| - m\.spike\.com/videos/video\.rbml\?id=(?P<id>[^&]+)) - ''' - _TEST = { - 'url': 'http://www.spike.com/video-clips/lhtu8m/auction-hunters-can-allen-ride-a-hundred-year-old-motorcycle', - 'md5': '1a9265f32b0c375793d6c4ce45255256', +class BellatorIE(MTVServicesInfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bellator\.com/[^/]+/[\da-z]{6}(?:[/?#&]|$)' + _TESTS = [{ + 'url': 'http://www.bellator.com/fight/atwr7k/bellator-158-michael-page-vs-evangelista-cyborg', 'info_dict': { - 'id': 'b9c8221a-4e50-479a-b86d-3333323e38ba', + 'id': 'b55e434e-fde1-4a98-b7cc-92003a034de4', 'ext': 'mp4', - 'title': 'Auction Hunters|Can Allen Ride A Hundred Year-Old Motorcycle?', - 'description': 'md5:fbed7e82ed5fad493615b3094a9499cb', + 'title': 'Douglas Lima vs. Paul Daley - Round 1', + 'description': 'md5:805a8dd29310fd611d32baba2f767885', }, - } + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://www.bellator.com/video-clips/bw6k7n/bellator-158-foundations-michael-venom-page', + 'only_matching': True, + }] _FEED_URL = 'http://www.spike.com/feeds/mrss/' - _MOBILE_TEMPLATE = 'http://m.spike.com/videos/video.rbml?id=%s' + _GEO_COUNTRIES = ['US'] + + +class ParamountNetworkIE(MTVServicesInfoExtractor): + _VALID_URL = r'https?://(?:www\.)?paramountnetwork\.com/[^/]+/[\da-z]{6}(?:[/?#&]|$)' + _TESTS = [{ + 'url': 'http://www.paramountnetwork.com/episodes/j830qm/lip-sync-battle-joel-mchale-vs-jim-rash-season-2-ep-13', + 'info_dict': { + 'id': '37ace3a8-1df6-48be-85b8-38df8229e241', + 'ext': 'mp4', + 'title': 'Lip Sync Battle|April 28, 2016|2|209|Joel McHale Vs. 
Jim Rash|Act 1', + 'description': 'md5:a739ca8f978a7802f67f8016d27ce114', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }] + + _FEED_URL = 'http://www.paramountnetwork.com/feeds/mrss/' + _GEO_COUNTRIES = ['US'] + + def _extract_mgid(self, webpage): + root_data = self._parse_json(self._search_regex( + r'window\.__DATA__\s*=\s*({.+})', + webpage, 'data'), None) + + def find_sub_data(data, data_type): + return next(c for c in data['children'] if c.get('type') == data_type) - def _real_extract(self, url): - mobile_id = self._match_id(url) - if mobile_id: - url = 'http://www.spike.com/video-clips/%s' % mobile_id - return super(SpikeIE, self)._real_extract(url) + c = find_sub_data(find_sub_data(root_data, 'MainContainer'), 'VideoPlayer') + return c['props']['media']['video']['config']['uri'] diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/sport5.py youtube-dl-2019.05.20/youtube_dl/extractor/sport5.py --- youtube-dl-2016.02.22/youtube_dl/extractor/sport5.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/sport5.py 2019-06-07 18:49:07.000000000 +0000 @@ -8,7 +8,7 @@ class Sport5IE(InfoExtractor): - _VALID_URL = r'http://(?:www|vod)?\.sport5\.co\.il/.*\b(?:Vi|docID)=(?P<id>\d+)' + _VALID_URL = r'https?://(?:www|vod)?\.sport5\.co\.il/.*\b(?:Vi|docID)=(?P<id>\d+)' _TESTS = [ { 'url': 'http://vod.sport5.co.il/?Vc=147&Vi=176331&Page=1', @@ -41,7 +41,7 @@ webpage = self._download_webpage(url, media_id) - video_id = self._html_search_regex('clipId=([\w-]+)', webpage, 'video id') + video_id = self._html_search_regex(r'clipId=([\w-]+)', webpage, 'video id') metadata = self._download_xml( 'http://sport5-metadata-rr-d.nsacdn.com/vod/vod/%s/HDS/metadata.xml' % video_id, diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/sportbox.py youtube-dl-2019.05.20/youtube_dl/extractor/sportbox.py --- youtube-dl-2016.02.22/youtube_dl/extractor/sportbox.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/sportbox.py 2019-06-07 18:49:07.000000000 +0000 @@ -4,89 +4,51 @@ import re from .common import InfoExtractor -from ..compat import compat_urlparse from ..utils import ( - unified_strdate, + determine_ext, + int_or_none, + js_to_json, + merge_dicts, ) class SportBoxIE(InfoExtractor): - _VALID_URL = r'https?://news\.sportbox\.ru/(?:[^/]+/)+spbvideo_NI\d+_(?P<display_id>.+)' + _VALID_URL = r'https?://(?:news\.sportbox|matchtv)\.ru/vdl/player(?:/[^/]+/|\?.*?\bn?id=)(?P<id>\d+)' _TESTS = [{ - 'url': 'http://news.sportbox.ru/Vidy_sporta/Avtosport/Rossijskij/spbvideo_NI483529_Gonka-2-zaezd-Obyedinenniy-2000-klassi-Turing-i-S', - 'md5': 'ff56a598c2cf411a9a38a69709e97079', + 'url': 'http://news.sportbox.ru/vdl/player/ci/211355', 'info_dict': { - 'id': '80822', + 'id': '109158', 'ext': 'mp4', - 'title': 'Гонка 2 заезд ««Объединенный 2000»: классы Туринг и Супер-продакшн', - 'description': 'md5:3d72dc4a006ab6805d82f037fdc637ad', - 'thumbnail': 're:^https?://.*\.jpg$', - 'upload_date': '20140928', + 'title': 'В Новороссийске прошел детский турнир «Поле славы боевой»', + 'description': 'В Новороссийске прошел детский турнир «Поле славы боевой»', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 292, + 'view_count': int, + 'timestamp': 1426237001, + 'upload_date': '20150313', }, 'params': { # m3u8 download 'skip_download': True, }, }, { - 'url': 'http://news.sportbox.ru/Vidy_sporta/billiard/spbvideo_NI486287_CHempionat-mira-po-dinamichnoy-piramide-4', + 'url': 
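ParamountNetworkIE above recovers the mgid by walking the JSON tree the site serializes into window.__DATA__: every node carries a type and a children list, and the player URI hangs off the VideoPlayer node under MainContainer. The toy tree below is invented purely to show the shape the traversal expects:

def find_sub_data(data, data_type):
    # First child whose 'type' matches, same helper as in the extractor.
    return next(c for c in data['children'] if c.get('type') == data_type)

root_data = {
    'children': [{
        'type': 'MainContainer',
        'children': [{
            'type': 'VideoPlayer',
            'props': {'media': {'video': {'config': {
                'uri': 'mgid:arc:episode:paramountnetwork.com:example',
            }}}},
        }],
    }],
}

player = find_sub_data(find_sub_data(root_data, 'MainContainer'), 'VideoPlayer')
assert player['props']['media']['video']['config']['uri'].startswith('mgid:')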
'http://news.sportbox.ru/vdl/player?nid=370908&only_player=1&autostart=false&playeri=2&height=340&width=580', 'only_matching': True, }, { - 'url': 'http://news.sportbox.ru/video/no_ads/spbvideo_NI536574_V_Novorossijske_proshel_detskij_turnir_Pole_slavy_bojevoj?ci=211355', + 'url': 'https://news.sportbox.ru/vdl/player/media/193095', 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('display_id') - - webpage = self._download_webpage(url, display_id) - - player = self._search_regex( - r'src="/?(vdl/player/[^"]+)"', webpage, 'player') - - title = self._html_search_regex( - [r'"nodetitle"\s*:\s*"([^"]+)"', r'class="node-header_{1,2}title">([^<]+)'], - webpage, 'title') - description = self._og_search_description(webpage) or self._html_search_meta( - 'description', webpage, 'description') - thumbnail = self._og_search_thumbnail(webpage) - upload_date = unified_strdate(self._html_search_meta( - 'dateCreated', webpage, 'upload date')) - - return { - '_type': 'url_transparent', - 'url': compat_urlparse.urljoin(url, '/%s' % player), - 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'upload_date': upload_date, - } - - -class SportBoxEmbedIE(InfoExtractor): - _VALID_URL = r'https?://news\.sportbox\.ru/vdl/player(?:/[^/]+/|\?.*?\bn?id=)(?P<id>\d+)' - _TESTS = [{ - 'url': 'http://news.sportbox.ru/vdl/player/ci/211355', - 'info_dict': { - 'id': '211355', - 'ext': 'mp4', - 'title': 'В Новороссийске прошел детский турнир «Поле славы боевой»', - 'thumbnail': 're:^https?://.*\.jpg$', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, }, { - 'url': 'http://news.sportbox.ru/vdl/player?nid=370908&only_player=1&autostart=false&playeri=2&height=340&width=580', + 'url': 'https://news.sportbox.ru/vdl/player/media/109158', + 'only_matching': True, + }, { + 'url': 'https://matchtv.ru/vdl/player/media/109158', 'only_matching': True, }] @staticmethod def _extract_urls(webpage): return re.findall( - r'<iframe[^>]+src="(https?://news\.sportbox\.ru/vdl/player[^"]+)"', + r'<iframe[^>]+src="(https?://(?:news\.sportbox|matchtv)\.ru/vdl/player[^"]+)"', webpage) def _real_extract(self, url): @@ -94,22 +56,44 @@ webpage = self._download_webpage(url, video_id) - hls = self._search_regex( - r"sportboxPlayer\.jwplayer_common_params\.file\s*=\s*['\"]([^'\"]+)['\"]", - webpage, 'hls file') - - formats = self._extract_m3u8_formats(hls, video_id, 'mp4') - - title = self._search_regex( - r'sportboxPlayer\.node_title\s*=\s*"([^"]+)"', webpage, 'title') - - thumbnail = self._search_regex( - r'sportboxPlayer\.jwplayer_common_params\.image\s*=\s*"([^"]+)"', - webpage, 'thumbnail', default=None) - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, + sources = self._parse_json( + self._search_regex( + r'(?s)playerOptions\.sources(?:WithRes)?\s*=\s*(\[.+?\])\s*;\s*\n', + webpage, 'sources'), + video_id, transform_source=js_to_json) + + formats = [] + for source in sources: + src = source.get('src') + if not src: + continue + if determine_ext(src) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + src, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': src, + }) + self._sort_formats(formats) + + player = self._parse_json( + self._search_regex( + r'(?s)playerOptions\s*=\s*({.+?})\s*;\s*\n', webpage, + 'player options', default='{}'), + video_id, transform_source=js_to_json) + media_id = player['mediaId'] + + 
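The new SportBox extraction above hinges on two page-level JavaScript objects: playerOptions.sources (or sourcesWithRes), a list of src entries, and playerOptions itself, which carries mediaId, poster and duration. Once normalized to JSON, each source is routed by extension, with .m3u8 manifests expanded into HLS formats and anything else kept as a progressive URL. A reduced sketch against a toy page; the sample is already valid JSON, so youtube-dl's js_to_json is not needed here:

import json
import re

webpage = '''
playerOptions.sources = [{"src": "https://video.example.com/109158.m3u8"},
                         {"src": "https://video.example.com/109158.mp4"}] ;
'''

sources = json.loads(re.search(
    r'(?s)playerOptions\.sources(?:WithRes)?\s*=\s*(\[.+?\])\s*;',
    webpage).group(1))

formats = []
for source in sources:
    src = source.get('src')
    if not src:
        continue
    if src.rpartition('.')[2] == 'm3u8':
        # The real code expands the manifest; here we only mark the protocol.
        formats.append({'url': src, 'protocol': 'm3u8_native'})
    else:
        formats.append({'url': src})
assert len(formats) == 2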
info = self._search_json_ld(webpage, media_id, default={}) + + view_count = int_or_none(self._search_regex( + r'Просмотров\s*:\s*(\d+)', webpage, 'view count', default=None)) + + return merge_dicts(info, { + 'id': media_id, + 'title': self._og_search_title(webpage, default=None) or media_id, + 'thumbnail': player.get('poster'), + 'duration': int_or_none(player.get('duration')), + 'view_count': view_count, 'formats': formats, - } + }) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/sportdeutschland.py youtube-dl-2019.05.20/youtube_dl/extractor/sportdeutschland.py --- youtube-dl-2016.02.22/youtube_dl/extractor/sportdeutschland.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/sportdeutschland.py 2019-06-07 18:49:07.000000000 +0000 @@ -20,8 +20,8 @@ 'title': 're:Li-Ning Badminton Weltmeisterschaft 2014 Kopenhagen', 'categories': ['Badminton'], 'view_count': int, - 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 're:Die Badminton-WM 2014 aus Kopenhagen bei Sportdeutschland\.TV', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': r're:Die Badminton-WM 2014 aus Kopenhagen bei Sportdeutschland\.TV', 'timestamp': int, 'upload_date': 're:^201408[23][0-9]$', }, @@ -38,7 +38,7 @@ 'timestamp': 1408976060, 'duration': 2732, 'title': 'Li-Ning Badminton Weltmeisterschaft 2014 Kopenhagen: Herren Einzel, Wei Lee vs. Keun Lee', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'view_count': int, 'categories': ['Li-Ning Badminton WM 2014'], diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/springboardplatform.py youtube-dl-2019.05.20/youtube_dl/extractor/springboardplatform.py --- youtube-dl-2016.02.22/youtube_dl/extractor/springboardplatform.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/springboardplatform.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,125 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + xpath_attr, + xpath_text, + xpath_element, + unescapeHTML, + unified_timestamp, +) + + +class SpringboardPlatformIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + cms\.springboardplatform\.com/ + (?: + (?:previews|embed_iframe)/(?P<index>\d+)/video/(?P<id>\d+)| + xml_feeds_advanced/index/(?P<index_2>\d+)/rss3/(?P<id_2>\d+) + ) + ''' + _TESTS = [{ + 'url': 'http://cms.springboardplatform.com/previews/159/video/981017/0/0/1', + 'md5': '5c3cb7b5c55740d482561099e920f192', + 'info_dict': { + 'id': '981017', + 'ext': 'mp4', + 'title': 'Redman "BUD like YOU" "Usher Good Kisser" REMIX', + 'description': 'Redman "BUD like YOU" "Usher Good Kisser" REMIX', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1409132328, + 'upload_date': '20140827', + 'duration': 193, + }, + }, { + 'url': 'http://cms.springboardplatform.com/embed_iframe/159/video/981017/rab007/rapbasement.com/1/1', + 'only_matching': True, + }, { + 'url': 'http://cms.springboardplatform.com/embed_iframe/20/video/1731611/ki055/kidzworld.com/10', + 'only_matching': True, + }, { + 'url': 'http://cms.springboardplatform.com/xml_feeds_advanced/index/159/rss3/981017/0/0/1/', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//cms\.springboardplatform\.com/embed_iframe/\d+/video/\d+.*?)\1', + webpage)] + + def _real_extract(self, url): + mobj = 
re.match(self._VALID_URL, url) + video_id = mobj.group('id') or mobj.group('id_2') + index = mobj.group('index') or mobj.group('index_2') + + video = self._download_xml( + 'http://cms.springboardplatform.com/xml_feeds_advanced/index/%s/rss3/%s' + % (index, video_id), video_id) + + item = xpath_element(video, './/item', 'item', fatal=True) + + content = xpath_element( + item, './{http://search.yahoo.com/mrss/}content', 'content', + fatal=True) + title = unescapeHTML(xpath_text(item, './title', 'title', fatal=True)) + + video_url = content.attrib['url'] + + if 'error_video.mp4' in video_url: + raise ExtractorError( + 'Video %s no longer exists' % video_id, expected=True) + + duration = int_or_none(content.get('duration')) + tbr = int_or_none(content.get('bitrate')) + filesize = int_or_none(content.get('fileSize')) + width = int_or_none(content.get('width')) + height = int_or_none(content.get('height')) + + description = unescapeHTML(xpath_text( + item, './description', 'description')) + thumbnail = xpath_attr( + item, './{http://search.yahoo.com/mrss/}thumbnail', 'url', + 'thumbnail') + + timestamp = unified_timestamp(xpath_text( + item, './{http://cms.springboardplatform.com/namespaces.html}created', + 'timestamp')) + + formats = [{ + 'url': video_url, + 'format_id': 'http', + 'tbr': tbr, + 'filesize': filesize, + 'width': width, + 'height': height, + }] + + m3u8_format = formats[0].copy() + m3u8_format.update({ + 'url': re.sub(r'(https?://)cdn\.', r'\1hls.', video_url) + '.m3u8', + 'ext': 'mp4', + 'format_id': 'hls', + 'protocol': 'm3u8_native', + }) + formats.append(m3u8_format) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'formats': formats, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/sprout.py youtube-dl-2019.05.20/youtube_dl/extractor/sprout.py --- youtube-dl-2016.02.22/youtube_dl/extractor/sprout.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/sprout.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,52 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .adobepass import AdobePassIE +from ..utils import ( + extract_attributes, + update_url_query, + smuggle_url, +) + + +class SproutIE(AdobePassIE): + _VALID_URL = r'https?://(?:www\.)?sproutonline\.com/watch/(?P<id>[^/?#]+)' + _TEST = { + 'url': 'http://www.sproutonline.com/watch/cowboy-adventure', + 'md5': '74bf14128578d1e040c3ebc82088f45f', + 'info_dict': { + 'id': '9dexnwtmh8_X', + 'ext': 'mp4', + 'title': 'A Cowboy Adventure', + 'description': 'Ruff-Ruff, Tweet and Dave get to be cowboys for the day at Six Cow Corral.', + 'timestamp': 1437758640, + 'upload_date': '20150724', + 'uploader': 'NBCU-SPROUT-NEW', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + video_component = self._search_regex( + r'(?s)(<div[^>]+data-component="video"[^>]*?>)', + webpage, 'video component', default=None) + if video_component: + options = self._parse_json(extract_attributes( + video_component)['data-options'], video_id) + theplatform_url = options['video'] + query = { + 'mbr': 'true', + 'manifest': 'm3u', + } + if options.get('protected'): + query['auth'] = self._extract_mvpd_auth(url, options['pid'], 'sprout', 'sprout') + theplatform_url = smuggle_url(update_url_query( + theplatform_url, query), {'force_smil_url': True}) + else: + iframe = self._search_regex( + 
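SpringboardPlatformIE above derives its second format from the first: the HLS variant is just the progressive URL with the cdn. host swapped for hls. and .m3u8 appended. That rewrite in isolation, with an illustrative URL:

import re

def hls_variant(video_url):
    # Swap the host prefix and append the manifest suffix.
    return re.sub(r'(https?://)cdn\.', r'\1hls.', video_url) + '.m3u8'

assert (hls_variant('http://cdn.springboardplatform.com/storage/video.mp4')
        == 'http://hls.springboardplatform.com/storage/video.mp4.m3u8')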
r'(<iframe[^>]+id="sproutVideoIframe"[^>]*?>)', + webpage, 'iframe') + theplatform_url = extract_attributes(iframe)['src'] + + return self.url_result(theplatform_url, 'ThePlatform') diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/srgssr.py youtube-dl-2019.05.20/youtube_dl/extractor/srgssr.py --- youtube-dl-2016.02.22/youtube_dl/extractor/srgssr.py 2016-02-04 12:39:20.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/srgssr.py 2019-06-07 18:49:07.000000000 +0000 @@ -4,6 +4,7 @@ import re from .common import InfoExtractor +from ..compat import compat_urllib_parse_urlparse from ..utils import ( ExtractorError, parse_iso8601, @@ -13,6 +14,8 @@ class SRGSSRIE(InfoExtractor): _VALID_URL = r'(?:https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn|srgssr):(?P<bu>srf|rts|rsi|rtr|swi):(?:[^:]+:)?(?P<type>video|audio):(?P<id>[0-9a-f\-]{36}|\d+)' + _GEO_BYPASS = False + _GEO_COUNTRIES = ['CH'] _ERRORS = { 'AGERATING12': 'To protect children under the age of 12, this video is only available between 8 p.m. and 6 a.m.', @@ -23,23 +26,34 @@ 'STARTDATE': 'This video is not yet available. Please try again later.', } + def _get_tokenized_src(self, url, video_id, format_id): + sp = compat_urllib_parse_urlparse(url).path.split('/') + token = self._download_json( + 'http://tp.srgssr.ch/akahd/token?acl=/%s/%s/*' % (sp[1], sp[2]), + video_id, 'Downloading %s token' % format_id, fatal=False) or {} + auth_params = token.get('token', {}).get('authparams') + if auth_params: + url += '?' + auth_params + return url + def get_media_data(self, bu, media_type, media_id): media_data = self._download_json( 'http://il.srgssr.ch/integrationlayer/1.0/ue/%s/%s/play/%s.json' % (bu, media_type, media_id), media_id)[media_type.capitalize()] if media_data.get('block') and media_data['block'] in self._ERRORS: - raise ExtractorError('%s said: %s' % ( - self.IE_NAME, self._ERRORS[media_data['block']]), expected=True) + message = self._ERRORS[media_data['block']] + if media_data['block'] == 'GEOBLOCK': + self.raise_geo_restricted( + msg=message, countries=self._GEO_COUNTRIES) + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, message), expected=True) return media_data def _real_extract(self, url): bu, media_type, media_id = re.match(self._VALID_URL, url).groups() - if bu == 'rts': - return self.url_result('rts:%s' % media_id, 'RTS') - media_data = self.get_media_data(bu, media_type, media_id) metadata = media_data['AssetMetadatas']['AssetMetadata'][0] @@ -61,14 +75,16 @@ asset_url = asset['text'] quality = asset['@quality'] format_id = '%s-%s' % (protocol, quality) - if protocol == 'HTTP-HDS': - formats.extend(self._extract_f4m_formats( - asset_url + '?hdcore=3.4.0', media_id, - f4m_id=format_id, fatal=False)) - elif protocol == 'HTTP-HLS': - formats.extend(self._extract_m3u8_formats( - asset_url, media_id, 'mp4', 'm3u8_native', - m3u8_id=format_id, fatal=False)) + if protocol.startswith('HTTP-HDS') or protocol.startswith('HTTP-HLS'): + asset_url = self._get_tokenized_src(asset_url, media_id, format_id) + if protocol.startswith('HTTP-HDS'): + formats.extend(self._extract_f4m_formats( + asset_url + ('?' if '?' 
not in asset_url else '&') + 'hdcore=3.4.0', + media_id, f4m_id=format_id, fatal=False)) + elif protocol.startswith('HTTP-HLS'): + formats.extend(self._extract_m3u8_formats( + asset_url, media_id, 'mp4', 'm3u8_native', + m3u8_id=format_id, fatal=False)) else: formats.append({ 'format_id': format_id, @@ -90,14 +106,23 @@ class SRGSSRPlayIE(InfoExtractor): IE_DESC = 'srf.ch, rts.ch, rsi.ch, rtr.ch and swissinfo.ch play sites' - _VALID_URL = r'https?://(?:(?:www|play)\.)?(?P<bu>srf|rts|rsi|rtr|swissinfo)\.ch/play/(?:tv|radio)/[^/]+/(?P<type>video|audio)/[^?]+\?id=(?P<id>[0-9a-f\-]{36}|\d+)' + _VALID_URL = r'''(?x) + https?:// + (?:(?:www|play)\.)? + (?P<bu>srf|rts|rsi|rtr|swissinfo)\.ch/play/(?:tv|radio)/ + (?: + [^/]+/(?P<type>video|audio)/[^?]+| + popup(?P<type_2>video|audio)player + ) + \?id=(?P<id>[0-9a-f\-]{36}|\d+) + ''' _TESTS = [{ 'url': 'http://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5', - 'md5': '4cd93523723beff51bb4bee974ee238d', + 'md5': 'da6b5b3ac9fa4761a942331cef20fcb3', 'info_dict': { 'id': '28e1a57d-5b76-4399-8ab3-9097f071e6c5', - 'ext': 'm4v', + 'ext': 'mp4', 'upload_date': '20130701', 'title': 'Snowden beantragt Asyl in Russland', 'timestamp': 1372713995, @@ -140,16 +165,22 @@ 'uploader': '19h30', 'upload_date': '20141201', 'timestamp': 1417458600, - 'thumbnail': 're:^https?://.*\.image', + 'thumbnail': r're:^https?://.*\.image', 'view_count': int, }, 'params': { # m3u8 download 'skip_download': True, } + }, { + 'url': 'https://www.srf.ch/play/tv/popupvideoplayer?id=c4dba0ca-e75b-43b2-a34f-f708a4932e01', + 'only_matching': True, }] def _real_extract(self, url): - bu, media_type, media_id = re.match(self._VALID_URL, url).groups() + mobj = re.match(self._VALID_URL, url) + bu = mobj.group('bu') + media_type = mobj.group('type') or mobj.group('type_2') + media_id = mobj.group('id') # other info can be extracted from url + '&layout=json' return self.url_result('srgssr:%s:%s:%s' % (bu[:3], media_type, media_id), 'SRGSSR') diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/srmediathek.py youtube-dl-2019.05.20/youtube_dl/extractor/srmediathek.py --- youtube-dl-2016.02.22/youtube_dl/extractor/srmediathek.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/srmediathek.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals from .ard import ARDMediathekIE @@ -9,8 +9,9 @@ class SRMediathekIE(ARDMediathekIE): + IE_NAME = 'sr:mediathek' IE_DESC = 'Saarländischer Rundfunk' - _VALID_URL = r'https?://sr-mediathek\.sr-online\.de/index\.php\?.*?&id=(?P<id>[0-9]+)' + _VALID_URL = r'https?://sr-mediathek(?:\.sr-online)?\.de/index\.php\?.*?&id=(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://sr-mediathek.sr-online.de/index.php?seite=7&id=28455', @@ -19,7 +20,7 @@ 'ext': 'mp4', 'title': 'sportarena (26.10.2014)', 'description': 'Ringen: KSV Köllerbach gegen Aachen-Walheim; Frauen-Fußball: 1. 
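The _get_tokenized_src helper above implements a standard Akamai token handshake: build an ACL from the first two path segments of the stream URL, ask tp.srgssr.ch/akahd/token for it, and append the returned authparams as the query string. A standalone sketch, stdlib only with simplified error handling:

import json
from urllib import parse, request

def tokenize_src(stream_url):
    path = parse.urlparse(stream_url).path.split('/')
    acl = '/%s/%s/*' % (path[1], path[2])
    token_url = ('http://tp.srgssr.ch/akahd/token?'
                 + parse.urlencode({'acl': acl}))
    token = json.loads(request.urlopen(token_url).read().decode('utf-8'))
    auth_params = token.get('token', {}).get('authparams')
    # Without the auth query string the CDN rejects the manifest request.
    return stream_url + '?' + auth_params if auth_params else stream_url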
FC Saarbrücken gegen Sindelfingen; Motorsport: Rallye in Losheim; dazu: Interview mit Timo Bernhard; Turnen: TG Saar; Reitsport: Deutscher Voltigier-Pokal; Badminton: Interview mit Michael Fuchs ', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, 'skip': 'no longer available', }, { @@ -34,7 +35,9 @@ # m3u8 download 'skip_download': True, }, - 'expected_warnings': ['Unable to download f4m manifest'] + }, { + 'url': 'http://sr-mediathek.de/index.php?seite=7&id=7480', + 'only_matching': True, }] def _real_extract(self, url): diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/ssa.py youtube-dl-2019.05.20/youtube_dl/extractor/ssa.py --- youtube-dl-2016.02.22/youtube_dl/extractor/ssa.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/ssa.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,58 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - unescapeHTML, - parse_duration, -) - - -class SSAIE(InfoExtractor): - _VALID_URL = r'http://ssa\.nls\.uk/film/(?P<id>\d+)' - _TEST = { - 'url': 'http://ssa.nls.uk/film/3561', - 'info_dict': { - 'id': '3561', - 'ext': 'flv', - 'title': 'SHETLAND WOOL', - 'description': 'md5:c5afca6871ad59b4271e7704fe50ab04', - 'duration': 900, - 'thumbnail': 're:^https?://.*\.jpg$', - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - streamer = self._search_regex( - r"'streamer'\s*,\S*'(rtmp[^']+)'", webpage, 'streamer') - play_path = self._search_regex( - r"'file'\s*,\s*'([^']+)'", webpage, 'file').rpartition('.')[0] - - def search_field(field_name, fatal=False): - return self._search_regex( - r'<span\s+class="field_title">%s:</span>\s*<span\s+class="field_content">([^<]+)</span>' % field_name, - webpage, 'title', fatal=fatal) - - title = unescapeHTML(search_field('Title', fatal=True)).strip('()[]') - description = unescapeHTML(search_field('Description')) - duration = parse_duration(search_field('Running time')) - thumbnail = self._search_regex( - r"'image'\s*,\s*'([^']+)'", webpage, 'thumbnails', fatal=False) - - return { - 'id': video_id, - 'url': streamer, - 'play_path': play_path, - 'ext': 'flv', - 'title': title, - 'description': description, - 'duration': duration, - 'thumbnail': thumbnail, - } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/stanfordoc.py youtube-dl-2019.05.20/youtube_dl/extractor/stanfordoc.py --- youtube-dl-2016.02.22/youtube_dl/extractor/stanfordoc.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/stanfordoc.py 2019-06-07 18:49:07.000000000 +0000 @@ -66,7 +66,7 @@ r'(?s)<description>([^<]+)</description>', coursepage, 'description', fatal=False) - links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage)) + links = orderedSet(re.findall(r'<a href="(VideoPage\.php\?[^"]+)">', coursepage)) info['entries'] = [self.url_result( 'http://openclassroom.stanford.edu/MainFolder/%s' % unescapeHTML(l) ) for l in links] @@ -84,7 +84,7 @@ rootpage = self._download_webpage(rootURL, info['id'], errnote='Unable to download course info page') - links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage)) + links = orderedSet(re.findall(r'<a href="(CoursePage\.php\?[^"]+)">', rootpage)) info['entries'] = [self.url_result( 'http://openclassroom.stanford.edu/MainFolder/%s' % unescapeHTML(l) ) for l in links] diff -Nru 
youtube-dl-2016.02.22/youtube_dl/extractor/steam.py youtube-dl-2019.05.20/youtube_dl/extractor/steam.py --- youtube-dl-2016.02.22/youtube_dl/extractor/steam.py 2016-02-14 22:26:17.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/steam.py 2019-06-07 18:49:07.000000000 +0000 @@ -4,8 +4,10 @@ from .common import InfoExtractor from ..utils import ( + extract_attributes, ExtractorError, - unescapeHTML, + get_element_by_class, + js_to_json, ) @@ -25,35 +27,39 @@ 'url': 'http://store.steampowered.com/video/105600/', 'playlist': [ { - 'md5': 'f870007cee7065d7c76b88f0a45ecc07', + 'md5': '6a294ee0c4b1f47f5bb76a65e31e3592', 'info_dict': { - 'id': '81300', - 'ext': 'flv', - 'title': 'Terraria 1.1 Trailer', + 'id': '2040428', + 'ext': 'mp4', + 'title': 'Terraria 1.3 Trailer', 'playlist_index': 1, } }, { - 'md5': '61aaf31a5c5c3041afb58fb83cbb5751', + 'md5': '911672b20064ca3263fa89650ba5a7aa', 'info_dict': { - 'id': '80859', - 'ext': 'flv', - 'title': 'Terraria Trailer', + 'id': '2029566', + 'ext': 'mp4', + 'title': 'Terraria 1.2 Trailer', 'playlist_index': 2, } } ], + 'info_dict': { + 'id': '105600', + 'title': 'Terraria', + }, 'params': { 'playlistend': 2, } }, { 'url': 'http://steamcommunity.com/sharedfiles/filedetails/?id=242472205', 'info_dict': { - 'id': 'WB5DvDOOvAY', + 'id': 'X8kpJBlzD2E', 'ext': 'mp4', - 'upload_date': '20140329', - 'title': 'FRONTIERS - Final Greenlight Trailer', - 'description': 'md5:dc96a773669d0ca1b36c13c1f30250d9', + 'upload_date': '20140617', + 'title': 'FRONTIERS - Trapping', + 'description': 'md5:bf6f7f773def614054089e5769c12a6e', 'uploader': 'AAD Productions', 'uploader_id': 'AtomicAgeDogGames', } @@ -69,6 +75,9 @@ gameID = m.group('gameID') playlist_id = gameID videourl = self._VIDEO_PAGE_TEMPLATE % playlist_id + + self._set_cookie('steampowered.com', 'mature_content', '1') + webpage = self._download_webpage(videourl, playlist_id) if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None: @@ -76,48 +85,65 @@ self.report_age_confirmation() webpage = self._download_webpage(videourl, playlist_id) + flash_vars = self._parse_json(self._search_regex( + r'(?s)rgMovieFlashvars\s*=\s*({.+?});', webpage, + 'flash vars'), playlist_id, js_to_json) + + playlist_title = None + entries = [] if fileID: - playlist_title = self._html_search_regex( - r'<div class="workshopItemTitle">(.+)</div>', webpage, 'title') - mweb = re.finditer(r'''(?x) - 'movie_(?P<videoID>[0-9]+)':\s*\{\s* - YOUTUBE_VIDEO_ID:\s*"(?P<youtube_id>[^"]+)", - ''', webpage) - videos = [{ - '_type': 'url', - 'url': vid.group('youtube_id'), - 'ie_key': 'Youtube', - } for vid in mweb] + playlist_title = get_element_by_class('workshopItemTitle', webpage) + for movie in flash_vars.values(): + if not movie: + continue + youtube_id = movie.get('YOUTUBE_VIDEO_ID') + if not youtube_id: + continue + entries.append({ + '_type': 'url', + 'url': youtube_id, + 'ie_key': 'Youtube', + }) else: - playlist_title = self._html_search_regex( - r'<h2 class="pageheader">(.*?)</h2>', webpage, 'game title') - - mweb = re.finditer(r'''(?x) - 'movie_(?P<videoID>[0-9]+)':\s*\{\s* - FILENAME:\s*"(?P<videoURL>[\w:/\.\?=]+)" - (,\s*MOVIE_NAME:\s*\"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\}, - ''', webpage) - titles = re.finditer( - r'<span class="title">(?P<videoName>.+?)</span>', webpage) - thumbs = re.finditer( - r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">', webpage) - videos = [] - - for vid, vtitle, thumb in zip(mweb, titles, thumbs): - video_id = vid.group('videoID') - title = 
vtitle.group('videoName') - video_url = vid.group('videoURL') - video_thumb = thumb.group('thumbnail') - if not video_url: - raise ExtractorError('Cannot find video url for %s' % video_id) - videos.append({ + playlist_title = get_element_by_class('apphub_AppName', webpage) + for movie_id, movie in flash_vars.items(): + if not movie: + continue + video_id = self._search_regex(r'movie_(\d+)', movie_id, 'video id', fatal=False) + title = movie.get('MOVIE_NAME') + if not title or not video_id: + continue + entry = { 'id': video_id, - 'url': video_url, - 'ext': 'flv', - 'title': unescapeHTML(title), - 'thumbnail': video_thumb - }) - if not videos: + 'title': title.replace('+', ' '), + } + formats = [] + flv_url = movie.get('FILENAME') + if flv_url: + formats.append({ + 'format_id': 'flv', + 'url': flv_url, + }) + highlight_element = self._search_regex( + r'(<div[^>]+id="highlight_movie_%s"[^>]+>)' % video_id, + webpage, 'highlight element', fatal=False) + if highlight_element: + highlight_attribs = extract_attributes(highlight_element) + if highlight_attribs: + entry['thumbnail'] = highlight_attribs.get('data-poster') + for quality in ('', '-hd'): + for ext in ('webm', 'mp4'): + video_url = highlight_attribs.get('data-%s%s-source' % (ext, quality)) + if video_url: + formats.append({ + 'format_id': ext + quality, + 'url': video_url, + }) + if not formats: + continue + entry['formats'] = formats + entries.append(entry) + if not entries: raise ExtractorError('Could not find any videos') - return self.playlist_result(videos, playlist_id, playlist_title) + return self.playlist_result(entries, playlist_id, playlist_title) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/stitcher.py youtube-dl-2019.05.20/youtube_dl/extractor/stitcher.py --- youtube-dl-2016.02.22/youtube_dl/extractor/stitcher.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/stitcher.py 2019-06-07 18:49:07.000000000 +0000 @@ -22,7 +22,7 @@ 'title': 'Machine Learning Mastery and Cancer Clusters', 'description': 'md5:55163197a44e915a14a1ac3a1de0f2d3', 'duration': 1604, - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', }, }, { 'url': 'http://www.stitcher.com/podcast/panoply/vulture-tv/e/the-rare-hourlong-comedy-plus-40846275?autoplay=true', @@ -33,7 +33,7 @@ 'title': "The CW's 'Crazy Ex-Girlfriend'", 'description': 'md5:04f1e2f98eb3f5cbb094cea0f9e19b17', 'duration': 2235, - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', }, 'params': { 'skip_download': True, @@ -56,7 +56,7 @@ episode = self._parse_json( js_to_json(self._search_regex( - r'(?s)var\s+stitcher\s*=\s*({.+?});\n', webpage, 'episode config')), + r'(?s)var\s+stitcher(?:Config)?\s*=\s*({.+?});\n', webpage, 'episode config')), display_id)['config']['episode'] title = unescapeHTML(episode['title']) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/streamable.py youtube-dl-2019.05.20/youtube_dl/extractor/streamable.py --- youtube-dl-2016.02.22/youtube_dl/extractor/streamable.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/streamable.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,112 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, +) + + +class StreamableIE(InfoExtractor): + _VALID_URL = r'https?://streamable\.com/(?:[es]/)?(?P<id>\w+)' + _TESTS = [ + { + 'url': 'https://streamable.com/dnd1', + 'md5': 
'3e3bc5ca088b48c2d436529b64397fef', + 'info_dict': { + 'id': 'dnd1', + 'ext': 'mp4', + 'title': 'Mikel Oiarzabal scores to make it 0-3 for La Real against Espanyol', + 'thumbnail': r're:https?://.*\.jpg$', + 'uploader': 'teabaker', + 'timestamp': 1454964157.35115, + 'upload_date': '20160208', + 'duration': 61.516, + 'view_count': int, + } + }, + # older video without bitrate, width/height, etc. info + { + 'url': 'https://streamable.com/moo', + 'md5': '2cf6923639b87fba3279ad0df3a64e73', + 'info_dict': { + 'id': 'moo', + 'ext': 'mp4', + 'title': '"Please don\'t eat me!"', + 'thumbnail': r're:https?://.*\.jpg$', + 'timestamp': 1426115495, + 'upload_date': '20150311', + 'duration': 12, + 'view_count': int, + } + }, + { + 'url': 'https://streamable.com/e/dnd1', + 'only_matching': True, + }, + { + 'url': 'https://streamable.com/s/okkqk/drxjds', + 'only_matching': True, + } + ] + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+src=(?P<q1>[\'"])(?P<src>(?:https?:)?//streamable\.com/(?:(?!\1).+))(?P=q1)', + webpage) + if mobj: + return mobj.group('src') + + def _real_extract(self, url): + video_id = self._match_id(url) + + # Note: Using the ajax API, as the public Streamable API doesn't seem + # to return video info like the title properly sometimes, and doesn't + # include info like the video duration + video = self._download_json( + 'https://ajax.streamable.com/videos/%s' % video_id, video_id) + + # Format IDs: + # 0 The video is being uploaded + # 1 The video is being processed + # 2 The video has at least one file ready + # 3 The video is unavailable due to an error + status = video.get('status') + if status != 2: + raise ExtractorError( + 'This video is currently unavailable. It may still be uploading or processing.', + expected=True) + + title = video.get('reddit_title') or video['title'] + + formats = [] + for key, info in video['files'].items(): + if not info.get('url'): + continue + formats.append({ + 'format_id': key, + 'url': self._proto_relative_url(info['url']), + 'width': int_or_none(info.get('width')), + 'height': int_or_none(info.get('height')), + 'filesize': int_or_none(info.get('size')), + 'fps': int_or_none(info.get('framerate')), + 'vbr': float_or_none(info.get('bitrate'), 1000) + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': video.get('description'), + 'thumbnail': self._proto_relative_url(video.get('thumbnail_url')), + 'uploader': video.get('owner', {}).get('user_name'), + 'timestamp': float_or_none(video.get('date_added')), + 'duration': float_or_none(video.get('duration')), + 'view_count': int_or_none(video.get('plays')), + 'formats': formats + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/streamango.py youtube-dl-2019.05.20/youtube_dl/extractor/streamango.py --- youtube-dl-2016.02.22/youtube_dl/extractor/streamango.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/streamango.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,128 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_chr +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + js_to_json, +) + + +class StreamangoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?:streamango\.com|fruithosts\.net|streamcherry\.com)/(?:f|embed)/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://streamango.com/f/clapasobsptpkdfe/20170315_150006_mp4', + 'md5': 'e992787515a182f55e38fc97588d802a', 
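StreamableIE above deliberately talks to the ajax endpoint rather than the public API, and treats status 2 as the only playable state (0 uploading, 1 processing, 3 error, per the comment in the extractor). Each entry of the files dict then becomes one format, with protocol-relative URLs and bitrates in bits per second. A condensed sketch against an invented payload:

video = {
    'status': 2,
    'title': 'clip',
    'files': {
        'mp4': {'url': '//cdn.example.com/video.mp4',
                'width': 1280, 'height': 720, 'bitrate': 2500000},
        'mp4-mobile': {'url': '//cdn.example.com/video-mobile.mp4',
                       'width': 640, 'height': 360, 'bitrate': 800000},
    },
}

if video.get('status') != 2:
    raise RuntimeError('video is still uploading/processing or failed')

formats = [{
    'format_id': key,
    'url': 'https:' + info['url'],  # protocol-relative in the payload
    'width': info.get('width'),
    'height': info.get('height'),
    'vbr': info['bitrate'] / 1000.0,  # kbit/s, as youtube-dl expects
} for key, info in video['files'].items() if info.get('url')]
assert {f['format_id'] for f in formats} == {'mp4', 'mp4-mobile'}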
+ 'info_dict': { + 'id': 'clapasobsptpkdfe', + 'ext': 'mp4', + 'title': '20170315_150006.mp4', + } + }, { + # no og:title + 'url': 'https://streamango.com/embed/foqebrpftarclpob/asdf_asd_2_mp4', + 'info_dict': { + 'id': 'foqebrpftarclpob', + 'ext': 'mp4', + 'title': 'foqebrpftarclpob', + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'gone', + }, { + 'url': 'https://streamango.com/embed/clapasobsptpkdfe/20170315_150006_mp4', + 'only_matching': True, + }, { + 'url': 'https://fruithosts.net/f/mreodparcdcmspsm/w1f1_r4lph_2018_brrs_720p_latino_mp4', + 'only_matching': True, + }, { + 'url': 'https://streamcherry.com/f/clapasobsptpkdfe/', + 'only_matching': True, + }] + + def _real_extract(self, url): + def decrypt_src(encoded, val): + ALPHABET = '=/+9876543210zyxwvutsrqponmlkjihgfedcbaZYXWVUTSRQPONMLKJIHGFEDCBA' + encoded = re.sub(r'[^A-Za-z0-9+/=]', '', encoded) + decoded = '' + sm = [None] * 4 + i = 0 + str_len = len(encoded) + while i < str_len: + for j in range(4): + sm[j % 4] = ALPHABET.index(encoded[i]) + i += 1 + char_code = ((sm[0] << 0x2) | (sm[1] >> 0x4)) ^ val + decoded += compat_chr(char_code) + if sm[2] != 0x40: + char_code = ((sm[1] & 0xf) << 0x4) | (sm[2] >> 0x2) + decoded += compat_chr(char_code) + if sm[3] != 0x40: + char_code = ((sm[2] & 0x3) << 0x6) | sm[3] + decoded += compat_chr(char_code) + return decoded + + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = self._og_search_title(webpage, default=video_id) + + formats = [] + for format_ in re.findall(r'({[^}]*\bsrc\s*:\s*[^}]*})', webpage): + mobj = re.search(r'(src\s*:\s*[^(]+\(([^)]*)\)[\s,]*)', format_) + if mobj is None: + continue + + format_ = format_.replace(mobj.group(0), '') + + video = self._parse_json( + format_, video_id, transform_source=js_to_json, + fatal=False) or {} + + mobj = re.search( + r'([\'"])(?P<src>(?:(?!\1).)+)\1\s*,\s*(?P<val>\d+)', + mobj.group(1)) + if mobj is None: + continue + + src = decrypt_src(mobj.group('src'), int_or_none(mobj.group('val'))) + if not src: + continue + + ext = determine_ext(src, default_ext=None) + if video.get('type') == 'application/dash+xml' or ext == 'mpd': + formats.extend(self._extract_mpd_formats( + src, video_id, mpd_id='dash', fatal=False)) + else: + formats.append({ + 'url': src, + 'ext': ext or 'mp4', + 'width': int_or_none(video.get('width')), + 'height': int_or_none(video.get('height')), + 'tbr': int_or_none(video.get('bitrate')), + }) + + if not formats: + error = self._search_regex( + r'<p[^>]+\bclass=["\']lead[^>]+>(.+?)</p>', webpage, + 'error', default=None) + if not error and '>Sorry' in webpage: + error = 'Video %s is not available' % video_id + if error: + raise ExtractorError(error, expected=True) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'url': url, + 'title': title, + 'formats': formats, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/streamcloud.py youtube-dl-2019.05.20/youtube_dl/extractor/streamcloud.py --- youtube-dl-2016.02.22/youtube_dl/extractor/streamcloud.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/streamcloud.py 2019-06-07 18:49:07.000000000 +0000 @@ -4,15 +4,17 @@ import re from .common import InfoExtractor -from ..compat import compat_urllib_parse -from ..utils import sanitized_Request +from ..utils import ( + ExtractorError, + urlencode_postdata, +) class StreamcloudIE(InfoExtractor): IE_NAME = 'streamcloud.eu' _VALID_URL = r'https?://streamcloud\.eu/(?P<id>[a-zA-Z0-9_-]+)(?:/(?P<fname>[^#?]*)\.html)?' 
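decrypt_src above is a base64 variant: the alphabet is a custom reversed ordering in which index 64 ('A') plays the role of the '=' pad, and only the first byte of every three-byte group is XORed with the numeric val scraped from the page. The round-trip below makes the bit layout concrete; encode_src is my own inverse, written purely to exercise the decoder, not something the site ships:

ALPHABET = '=/+9876543210zyxwvutsrqponmlkjihgfedcbaZYXWVUTSRQPONMLKJIHGFEDCBA'
PAD = 0x40  # index of 'A', used like '=' in standard base64

def encode_src(plain, val):
    out = []
    data = [ord(c) for c in plain]
    for i in range(0, len(data), 3):
        b = data[i:i + 3]
        b0 = b[0] ^ val  # only the first byte of each group is masked
        sm = [b0 >> 2, (b0 & 0x3) << 4, PAD, PAD]
        if len(b) > 1:
            sm[1] |= b[1] >> 4
            sm[2] = (b[1] & 0xf) << 2
        if len(b) > 2:
            sm[2] |= b[2] >> 6
            sm[3] = b[2] & 0x3f
        out.extend(ALPHABET[x] for x in sm)
    return ''.join(out)

def decode_src(encoded, val):  # same bit logic as decrypt_src above
    decoded, sm, i = '', [0] * 4, 0
    while i < len(encoded):
        for j in range(4):
            sm[j] = ALPHABET.index(encoded[i])
            i += 1
        decoded += chr(((sm[0] << 2) | (sm[1] >> 4)) ^ val)
        if sm[2] != PAD:
            decoded += chr(((sm[1] & 0xf) << 4) | (sm[2] >> 2))
        if sm[3] != PAD:
            decoded += chr(((sm[2] & 0x3) << 6) | sm[3])
    return decoded

url = 'https://example.com/video.mp4'
assert decode_src(encode_src(url, 42), 42) == url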
- _TEST = { + _TESTS = [{ 'url': 'http://streamcloud.eu/skp9j99s4bpz/youtube-dl_test_video_____________-BaW_jenozKc.mp4.html', 'md5': '6bea4c7fa5daaacc2a946b7146286686', 'info_dict': { @@ -21,7 +23,10 @@ 'title': 'youtube-dl test video \'/\\ ä ↭', }, 'skip': 'Only available from the EU' - } + }, { + 'url': 'http://streamcloud.eu/ua8cmfh1nbe6/NSHIP-148--KUC-NG--H264-.mp4.html', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -29,26 +34,36 @@ orig_webpage = self._download_webpage(url, video_id) + if '>File Not Found<' in orig_webpage: + raise ExtractorError( + 'Video %s does not exist' % video_id, expected=True) + fields = re.findall(r'''(?x)<input\s+ type="(?:hidden|submit)"\s+ name="([^"]+)"\s+ (?:id="[^"]+"\s+)? value="([^"]*)" ''', orig_webpage) - post = compat_urllib_parse.urlencode(fields) - self._sleep(12, video_id) - headers = { - b'Content-Type': b'application/x-www-form-urlencoded', - } - req = sanitized_Request(url, post, headers) + self._sleep(6, video_id) webpage = self._download_webpage( - req, video_id, note='Downloading video page ...') - title = self._html_search_regex( - r'<h1[^>]*>([^<]+)<', webpage, 'title') - video_url = self._search_regex( - r'file:\s*"([^"]+)"', webpage, 'video URL') + url, video_id, data=urlencode_postdata(fields), headers={ + b'Content-Type': b'application/x-www-form-urlencoded', + }) + + try: + title = self._html_search_regex( + r'<h1[^>]*>([^<]+)<', webpage, 'title') + video_url = self._search_regex( + r'file:\s*"([^"]+)"', webpage, 'video URL') + except ExtractorError: + message = self._html_search_regex( + r'(?s)<div[^>]+class=(["\']).*?msgboxinfo.*?\1[^>]*>(?P<message>.+?)</div>', + webpage, 'message', default=None, group='message') + if message: + raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True) + raise thumbnail = self._search_regex( r'image:\s*"([^"]+)"', webpage, 'thumbnail URL', fatal=False) @@ -57,4 +72,7 @@ 'title': title, 'url': video_url, 'thumbnail': thumbnail, + 'http_headers': { + 'Referer': url, + }, } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/streamcz.py youtube-dl-2019.05.20/youtube_dl/extractor/streamcz.py --- youtube-dl-2016.02.22/youtube_dl/extractor/streamcz.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/streamcz.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals import hashlib @@ -26,7 +26,7 @@ _TESTS = [{ 'url': 'http://www.stream.cz/peklonataliri/765767-ecka-pro-deti', - 'md5': '6d3ca61a8d0633c9c542b92fcb936b0c', + 'md5': '934bb6a6d220d99c010783c9719960d5', 'info_dict': { 'id': '765767', 'ext': 'mp4', @@ -37,7 +37,7 @@ }, }, { 'url': 'http://www.stream.cz/blanik/10002447-tri-roky-pro-mazanka', - 'md5': 'e54a254fb8b871968fd8403255f28589', + 'md5': '849a88c1e1ca47d41403c2ba5e59e261', 'info_dict': { 'id': '10002447', 'ext': 'mp4', @@ -85,6 +85,14 @@ else: title = data['name'] + subtitles = {} + srt_url = data.get('subtitles_srt') + if srt_url: + subtitles['cs'] = [{ + 'ext': 'srt', + 'url': srt_url, + }] + return { 'id': video_id, 'title': title, @@ -93,4 +101,5 @@ 'description': data.get('web_site_text'), 'duration': int_or_none(data.get('duration')), 'view_count': int_or_none(data.get('views')), + 'subtitles': subtitles, } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/streetvoice.py youtube-dl-2019.05.20/youtube_dl/extractor/streetvoice.py --- youtube-dl-2016.02.22/youtube_dl/extractor/streetvoice.py 2015-12-30 
19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/streetvoice.py 2019-06-07 18:49:07.000000000 +0000 @@ -14,10 +14,9 @@ 'info_dict': { 'id': '94440', 'ext': 'mp3', - 'filesize': 4167053, 'title': '輸', 'description': 'Crispy脆樂團 - 輸', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 260, 'upload_date': '20091018', 'uploader': 'Crispy脆樂團', @@ -32,20 +31,19 @@ song_id = self._match_id(url) song = self._download_json( - 'http://streetvoice.com/music/api/song/%s' % song_id, song_id) + 'https://streetvoice.com/api/v1/public/song/%s/' % song_id, song_id, data=b'') title = song['name'] - author = song['musician']['name'] + author = song['user']['nickname'] return { 'id': song_id, 'url': song['file'], - 'filesize': song.get('size'), 'title': title, 'description': '%s - %s' % (author, title), 'thumbnail': self._proto_relative_url(song.get('image'), 'http:'), 'duration': song.get('length'), 'upload_date': unified_strdate(song.get('created_at')), 'uploader': author, - 'uploader_id': compat_str(song['musician']['id']), + 'uploader_id': compat_str(song['user']['id']), } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/stretchinternet.py youtube-dl-2019.05.20/youtube_dl/extractor/stretchinternet.py --- youtube-dl-2016.02.22/youtube_dl/extractor/stretchinternet.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/stretchinternet.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,48 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import int_or_none + + +class StretchInternetIE(InfoExtractor): + _VALID_URL = r'https?://portal\.stretchinternet\.com/[^/]+/portal\.htm\?.*?\beventId=(?P<id>\d+)' + _TEST = { + 'url': 'https://portal.stretchinternet.com/umary/portal.htm?eventId=313900&streamType=video', + 'info_dict': { + 'id': '313900', + 'ext': 'mp4', + 'title': 'Augustana (S.D.) 
Baseball vs University of Mary', + 'description': 'md5:7578478614aae3bdd4a90f578f787438', + 'timestamp': 1490468400, + 'upload_date': '20170325', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + stream = self._download_json( + 'https://neo-client.stretchinternet.com/streamservice/v1/media/stream/v%s' + % video_id, video_id) + + video_url = 'https://%s' % stream['source'] + + event = self._download_json( + 'https://neo-client.stretchinternet.com/portal-ws/getEvent.json', + video_id, query={ + 'clientID': 99997, + 'eventID': video_id, + 'token': 'asdf', + })['event'] + + title = event.get('title') or event['mobileTitle'] + description = event.get('customText') + timestamp = int_or_none(event.get('longtime')) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'url': video_url, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/stv.py youtube-dl-2019.05.20/youtube_dl/extractor/stv.py --- youtube-dl-2016.02.22/youtube_dl/extractor/stv.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/stv.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,94 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse +) +from ..utils import ( + extract_attributes, + float_or_none, + int_or_none, + str_or_none, +) + + +class STVPlayerIE(InfoExtractor): + IE_NAME = 'stv:player' + _VALID_URL = r'https?://player\.stv\.tv/(?P<type>episode|video)/(?P<id>[a-z0-9]{4})' + _TEST = { + 'url': 'https://player.stv.tv/video/7srz/victoria/interview-with-the-cast-ahead-of-new-victoria/', + 'md5': '2ad867d4afd641fa14187596e0fbc91b', + 'info_dict': { + 'id': '6016487034001', + 'ext': 'mp4', + 'upload_date': '20190321', + 'title': 'Interview with the cast ahead of new Victoria', + 'description': 'Nell Hudson and Lily Travers tell us what to expect in the new season of Victoria.', + 'timestamp': 1553179628, + 'uploader_id': '1486976045', + }, + 'skip': 'this resource is unavailable outside of the UK', + } + _PUBLISHER_ID = '1486976045' + _PTYPE_MAP = { + 'episode': 'episodes', + 'video': 'shortform', + } + + def _real_extract(self, url): + ptype, video_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, video_id) + + qs = compat_parse_qs(compat_urllib_parse_urlparse(self._search_regex( + r'itemprop="embedURL"[^>]+href="([^"]+)', + webpage, 'embed URL', default=None)).query) + publisher_id = qs.get('publisherID', [None])[0] or self._PUBLISHER_ID + + player_attr = extract_attributes(self._search_regex( + r'(<[^>]+class="bcplayer"[^>]+>)', webpage, 'player', default=None)) or {} + + info = {} + duration = ref_id = series = video_id = None + api_ref_id = player_attr.get('data-player-api-refid') + if api_ref_id: + resp = self._download_json( + 'https://player.api.stv.tv/v1/%s/%s' % (self._PTYPE_MAP[ptype], api_ref_id), + api_ref_id, fatal=False) + if resp: + result = resp.get('results') or {} + video = result.get('video') or {} + video_id = str_or_none(video.get('id')) + ref_id = video.get('guid') + duration = video.get('length') + programme = result.get('programme') or {} + series = programme.get('name') or programme.get('shortName') + subtitles = {} + _subtitles = result.get('_subtitles') or {} + for ext, sub_url in _subtitles.items(): + subtitles.setdefault('en', []).append({ + 'ext': 'vtt' if ext == 'webvtt' else ext, + 'url': sub_url, + }) + 
info.update({ + 'description': result.get('summary'), + 'subtitles': subtitles, + 'view_count': int_or_none(result.get('views')), + }) + if not video_id: + video_id = qs.get('videoId', [None])[0] or self._search_regex( + r'<link\s+itemprop="url"\s+href="(\d+)"', + webpage, 'video id', default=None) or 'ref:' + (ref_id or player_attr['data-refid']) + + info.update({ + '_type': 'url_transparent', + 'duration': float_or_none(duration or player_attr.get('data-duration'), 1000), + 'id': video_id, + 'ie_key': 'BrightcoveNew', + 'series': series or player_attr.get('data-programme-name'), + 'url': 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' % (publisher_id, video_id), + }) + return info diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/sunporno.py youtube-dl-2019.05.20/youtube_dl/extractor/sunporno.py --- youtube-dl-2016.02.22/youtube_dl/extractor/sunporno.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/sunporno.py 2019-06-07 18:49:07.000000000 +0000 @@ -12,25 +12,29 @@ class SunPornoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?sunporno\.com/videos/(?P<id>\d+)' - _TEST = { + _VALID_URL = r'https?://(?:(?:www\.)?sunporno\.com/videos|embeds\.sunporno\.com/embed)/(?P<id>\d+)' + _TESTS = [{ 'url': 'http://www.sunporno.com/videos/807778/', - 'md5': '6457d3c165fd6de062b99ef6c2ff4c86', + 'md5': '507887e29033502f29dba69affeebfc9', 'info_dict': { 'id': '807778', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'md5:0a400058e8105d39e35c35e7c5184164', 'description': 'md5:a31241990e1bd3a64e72ae99afb325fb', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 302, 'age_limit': 18, } - } + }, { + 'url': 'http://embeds.sunporno.com/embed/807778', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage( + 'http://www.sunporno.com/videos/%s' % video_id, video_id) title = self._html_search_regex( r'<title>([^<]+)</title>', webpage, 'title') @@ -40,7 +44,8 @@ r'poster="([^"]+)"', webpage, 'thumbnail', fatal=False) duration = parse_duration(self._search_regex( - r'itemprop="duration">\s*(\d+:\d+)\s*<', + (r'itemprop="duration"[^>]*>\s*(\d+:\d+)\s*<', + r'>Duration:\s*<span[^>]+>\s*(\d+:\d+)\s*<'), webpage, 'duration', fatal=False)) view_count = int_or_none(self._html_search_regex( @@ -48,7 +53,7 @@ webpage, 'view count', fatal=False)) comment_count = int_or_none(self._html_search_regex( r'(\d+)</b> Comments?', - webpage, 'comment count', fatal=False)) + webpage, 'comment count', fatal=False, default=None)) formats = [] quality = qualities(['mp4', 'flv']) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/sverigesradio.py youtube-dl-2019.05.20/youtube_dl/extractor/sverigesradio.py --- youtube-dl-2016.02.22/youtube_dl/extractor/sverigesradio.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/sverigesradio.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,115 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, + str_or_none, +) + + +class SverigesRadioBaseIE(InfoExtractor): + _BASE_URL = 'https://sverigesradio.se/sida/playerajax/' + _QUALITIES = ['low', 'medium', 'high'] + _EXT_TO_CODEC_MAP = { + 'mp3': 'mp3', + 'm4a': 'aac', + } + _CODING_FORMAT_TO_ABR_MAP = { + 5: 128, + 11: 192, + 12: 32, + 13: 96, + } + + def _real_extract(self, url): + audio_id = self._match_id(url) +
query = { 'id': audio_id, 'type': self._AUDIO_TYPE, } + + item = self._download_json( + self._BASE_URL + 'audiometadata', audio_id, + 'Downloading audio JSON metadata', query=query)['items'][0] + title = item['subtitle'] + + query['format'] = 'iis' + urls = [] + formats = [] + for quality in self._QUALITIES: + query['quality'] = quality + audio_url_data = self._download_json( + self._BASE_URL + 'getaudiourl', audio_id, + 'Downloading %s format JSON metadata' % quality, + fatal=False, query=query) or {} + audio_url = audio_url_data.get('audioUrl') + if not audio_url or audio_url in urls: + continue + urls.append(audio_url) + ext = determine_ext(audio_url) + coding_format = audio_url_data.get('codingFormat') + abr = int_or_none(self._search_regex( + r'_a(\d+)\.m4a', audio_url, 'audio bitrate', + default=None)) or self._CODING_FORMAT_TO_ABR_MAP.get(coding_format) + formats.append({ + 'abr': abr, + 'acodec': self._EXT_TO_CODEC_MAP.get(ext), + 'ext': ext, + 'format_id': str_or_none(coding_format), + 'vcodec': 'none', + 'url': audio_url, + }) + self._sort_formats(formats) + + return { + 'id': audio_id, + 'title': title, + 'formats': formats, + 'series': item.get('title'), + 'duration': int_or_none(item.get('duration')), + 'thumbnail': item.get('displayimageurl'), + 'description': item.get('description'), + } + + +class SverigesRadioPublicationIE(SverigesRadioBaseIE): + IE_NAME = 'sverigesradio:publication' + _VALID_URL = r'https?://(?:www\.)?sverigesradio\.se/sida/(?:artikel|gruppsida)\.aspx\?.*?\bartikel=(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'https://sverigesradio.se/sida/artikel.aspx?programid=83&artikel=7038546', + 'md5': '6a4917e1923fccb080e5a206a5afa542', + 'info_dict': { + 'id': '7038546', + 'ext': 'm4a', + 'duration': 132, + 'series': 'Nyheter (Ekot)', + 'title': 'Esa Teittinen: Sanningen har inte kommit fram', + 'description': 'md5:daf7ce66a8f0a53d5465a5984d3839df', + 'thumbnail': r're:^https?://.*\.jpg', + }, + }, { + 'url': 'https://sverigesradio.se/sida/gruppsida.aspx?programid=3304&grupp=6247&artikel=7146887', + 'only_matching': True, + }] + _AUDIO_TYPE = 'publication' + + +class SverigesRadioEpisodeIE(SverigesRadioBaseIE): + IE_NAME = 'sverigesradio:episode' + _VALID_URL = r'https?://(?:www\.)?sverigesradio\.se/(?:sida/)?avsnitt/(?P<id>[0-9]+)' + _TEST = { + 'url': 'https://sverigesradio.se/avsnitt/1140922?programid=1300', + 'md5': '20dc4d8db24228f846be390b0c59a07c', + 'info_dict': { + 'id': '1140922', + 'ext': 'mp3', + 'duration': 3307, + 'series': 'Konflikt', + 'title': 'Metoo och valen', + 'description': 'md5:fcb5c1f667f00badcc702b196f10a27e', + 'thumbnail': r're:^https?://.*\.jpg', + } + } + _AUDIO_TYPE = 'episode' diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/svt.py youtube-dl-2019.05.20/youtube_dl/extractor/svt.py --- youtube-dl-2016.02.22/youtube_dl/extractor/svt.py 2016-01-23 10:08:46.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/svt.py 2019-06-07 18:49:07.000000000 +0000 @@ -4,58 +4,97 @@ import re from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) from ..utils import ( determine_ext, + dict_get, + int_or_none, + orderedSet, + strip_or_none, + try_get, + urljoin, + compat_str, ) class SVTBaseIE(InfoExtractor): - def _extract_video(self, url, video_id): - info = self._download_json(url, video_id) - - title = info['context']['title'] - thumbnail = info['context'].get('thumbnailImage') + _GEO_COUNTRIES = ['SE'] - video_info = info['video'] + def _extract_video(self, video_info, video_id): +
is_live = dict_get(video_info, ('live', 'simulcast'), default=False) + m3u8_protocol = 'm3u8' if is_live else 'm3u8_native' formats = [] for vr in video_info['videoReferences']: + player_type = vr.get('playerType') or vr.get('format') vurl = vr['url'] ext = determine_ext(vurl) if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( vurl, video_id, - ext='mp4', entry_protocol='m3u8_native', - m3u8_id=vr.get('playerType'))) + ext='mp4', entry_protocol=m3u8_protocol, + m3u8_id=player_type, fatal=False)) elif ext == 'f4m': formats.extend(self._extract_f4m_formats( vurl + '?hdcore=3.3.0', video_id, - f4m_id=vr.get('playerType'))) + f4m_id=player_type, fatal=False)) + elif ext == 'mpd': + if player_type == 'dashhbbtv': + formats.extend(self._extract_mpd_formats( + vurl, video_id, mpd_id=player_type, fatal=False)) else: formats.append({ - 'format_id': vr.get('playerType'), + 'format_id': player_type, 'url': vurl, }) + if not formats and video_info.get('rights', {}).get('geoBlockedSweden'): + self.raise_geo_restricted( + 'This video is only available in Sweden', + countries=self._GEO_COUNTRIES) self._sort_formats(formats) subtitles = {} - subtitle_references = video_info.get('subtitleReferences') + subtitle_references = dict_get(video_info, ('subtitles', 'subtitleReferences')) if isinstance(subtitle_references, list): for sr in subtitle_references: subtitle_url = sr.get('url') + subtitle_lang = sr.get('language', 'sv') if subtitle_url: - subtitles.setdefault('sv', []).append({'url': subtitle_url}) - - duration = video_info.get('materialLength') - age_limit = 18 if video_info.get('inappropriateForChildren') else 0 + if determine_ext(subtitle_url) == 'm3u8': + # TODO(yan12125): handle WebVTT in m3u8 manifests + continue + + subtitles.setdefault(subtitle_lang, []).append({'url': subtitle_url}) + + title = video_info.get('title') + + series = video_info.get('programTitle') + season_number = int_or_none(video_info.get('season')) + episode = video_info.get('episodeTitle') + episode_number = int_or_none(video_info.get('episodeNumber')) + + duration = int_or_none(dict_get(video_info, ('materialLength', 'contentDuration'))) + age_limit = None + adult = dict_get( + video_info, ('inappropriateForChildren', 'blockedForChildren'), + skip_false_values=False) + if adult is not None: + age_limit = 18 if adult else 0 return { 'id': video_id, 'title': title, 'formats': formats, 'subtitles': subtitles, - 'thumbnail': thumbnail, 'duration': duration, 'age_limit': age_limit, + 'series': series, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, + 'is_live': is_live, } @@ -63,11 +102,11 @@ _VALID_URL = r'https?://(?:www\.)?svt\.se/wd\?(?:.*?&)?widgetId=(?P<widget_id>\d+)&.*?\barticleId=(?P<id>\d+)' _TEST = { 'url': 'http://www.svt.se/wd?widgetId=23991&sectionId=541&articleId=2900353&type=embed&contextSectionId=123&autostart=false', - 'md5': '9648197555fc1b49e3dc22db4af51d46', + 'md5': '33e9a5d8f646523ce0868ecfb0eed77d', 'info_dict': { 'id': '2900353', - 'ext': 'flv', - 'title': 'Här trycker Jagr till Giroux (under SVT-intervjun)', + 'ext': 'mp4', + 'title': 'Stjärnorna skojar till det - under SVT-intervjun', 'duration': 27, 'age_limit': 0, }, @@ -84,15 +123,29 @@ mobj = re.match(self._VALID_URL, url) widget_id = mobj.group('widget_id') article_id = mobj.group('id') - return self._extract_video( + + info = self._download_json( 'http://www.svt.se/wd?widgetId=%s&articleId=%s&format=json&type=embed&output=json' % (widget_id, article_id), article_id) + info_dict = 
self._extract_video(info['video'], article_id) + info_dict['title'] = info['context']['title'] + return info_dict + -class SVTPlayIE(SVTBaseIE): +class SVTPlayBaseIE(SVTBaseIE): + _SVTPLAY_RE = r'root\s*\[\s*(["\'])_*svtplay\1\s*\]\s*=\s*(?P<json>{.+?})\s*;\s*\n' + + +class SVTPlayIE(SVTPlayBaseIE): IE_DESC = 'SVT Play and Öppet arkiv' - _VALID_URL = r'https?://(?:www\.)?(?P<host>svtplay|oppetarkiv)\.se/video/(?P<id>[0-9]+)' - _TEST = { + _VALID_URL = r'''(?x) + (?: + svt:(?P<svt_id>[^/?#&]+)| + https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P<id>[^/?#&]+) + ) + ''' + _TESTS = [{ 'url': 'http://www.svtplay.se/video/5996901/flygplan-till-haile-selassie/flygplan-till-haile-selassie-2', 'md5': '2b6704fe4a28801e1a098bbf3c5ac611', 'info_dict': { @@ -100,7 +153,7 @@ 'ext': 'mp4', 'title': 'Flygplan till Haile Selassie', 'duration': 3527, - 'thumbnail': 're:^https?://.*[\.-]jpg$', + 'thumbnail': r're:^https?://.*[\.-]jpg$', 'age_limit': 0, 'subtitles': { 'sv': [{ @@ -108,12 +161,211 @@ }] }, }, - } + }, { + # geo restricted to Sweden + 'url': 'http://www.oppetarkiv.se/video/5219710/trollflojten', + 'only_matching': True, + }, { + 'url': 'http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg', + 'only_matching': True, + }, { + 'url': 'https://www.svtplay.se/kanaler/svt1', + 'only_matching': True, + }, { + 'url': 'svt:1376446-003A', + 'only_matching': True, + }, { + 'url': 'svt:14278044', + 'only_matching': True, + }] + + def _adjust_title(self, info): + if info['is_live']: + info['title'] = self._live_title(info['title']) + + def _extract_by_video_id(self, video_id, webpage=None): + data = self._download_json( + 'https://api.svt.se/videoplayer-api/video/%s' % video_id, + video_id, headers=self.geo_verification_headers()) + info_dict = self._extract_video(data, video_id) + if not info_dict.get('title'): + title = dict_get(info_dict, ('episode', 'series')) + if not title and webpage: + title = re.sub( + r'\s*\|\s*.+?$', '', self._og_search_title(webpage)) + if not title: + title = video_id + info_dict['title'] = title + self._adjust_title(info_dict) + return info_dict def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - host = mobj.group('host') - return self._extract_video( - 'http://www.%s.se/video/%s?output=json' % (host, video_id), - video_id) + video_id, svt_id = mobj.group('id', 'svt_id') + + if svt_id: + return self._extract_by_video_id(svt_id) + + webpage = self._download_webpage(url, video_id) + + data = self._parse_json( + self._search_regex( + self._SVTPLAY_RE, webpage, 'embedded data', default='{}', + group='json'), + video_id, fatal=False) + + thumbnail = self._og_search_thumbnail(webpage) + + if data: + video_info = try_get( + data, lambda x: x['context']['dispatcher']['stores']['VideoTitlePageStore']['data']['video'], + dict) + if video_info: + info_dict = self._extract_video(video_info, video_id) + info_dict.update({ + 'title': data['context']['dispatcher']['stores']['MetaStore']['title'], + 'thumbnail': thumbnail, + }) + self._adjust_title(info_dict) + return info_dict + + svt_id = self._search_regex( + r'<video[^>]+data-video-id=["\']([\da-zA-Z-]+)', + webpage, 'video id') + + return self._extract_by_video_id(svt_id, webpage) + + +class SVTSeriesIE(SVTPlayBaseIE): + _VALID_URL = r'https?://(?:www\.)?svtplay\.se/(?P<id>[^/?&#]+)' + _TESTS = [{ + 'url': 'https://www.svtplay.se/rederiet', + 'info_dict': { + 'id': 'rederiet', + 'title': 'Rederiet', + 'description': 'md5:505d491a58f4fcf6eb418ecab947e69e', + }, + 'playlist_mincount': 318, + }, {
'url': 'https://www.svtplay.se/rederiet?tab=sasong2', + 'info_dict': { + 'id': 'rederiet-sasong2', + 'title': 'Rederiet - Säsong 2', + 'description': 'md5:505d491a58f4fcf6eb418ecab947e69e', + }, + 'playlist_count': 12, + }] + + @classmethod + def suitable(cls, url): + return False if SVTIE.suitable(url) or SVTPlayIE.suitable(url) else super(SVTSeriesIE, cls).suitable(url) + + def _real_extract(self, url): + series_id = self._match_id(url) + + qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + season_slug = qs.get('tab', [None])[0] + + if season_slug: + series_id += '-%s' % season_slug + + webpage = self._download_webpage( + url, series_id, 'Downloading series page') + + root = self._parse_json( + self._search_regex( + self._SVTPLAY_RE, webpage, 'content', group='json'), + series_id) + + season_name = None + + entries = [] + for season in root['relatedVideoContent']['relatedVideosAccordion']: + if not isinstance(season, dict): + continue + if season_slug: + if season.get('slug') != season_slug: + continue + season_name = season.get('name') + videos = season.get('videos') + if not isinstance(videos, list): + continue + for video in videos: + content_url = video.get('contentUrl') + if not content_url or not isinstance(content_url, compat_str): + continue + entries.append( + self.url_result( + urljoin(url, content_url), + ie=SVTPlayIE.ie_key(), + video_title=video.get('title') + )) + + metadata = root.get('metaData') + if not isinstance(metadata, dict): + metadata = {} + + title = metadata.get('title') + season_name = season_name or season_slug + + if title and season_name: + title = '%s - %s' % (title, season_name) + elif season_slug: + title = season_slug + + return self.playlist_result( + entries, series_id, title, metadata.get('description')) + + +class SVTPageIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?svt\.se/(?:[^/]+/)*(?P<id>[^/?&#]+)' + _TESTS = [{ + 'url': 'https://www.svt.se/sport/oseedat/guide-sommartraningen-du-kan-gora-var-och-nar-du-vill', + 'info_dict': { + 'id': 'guide-sommartraningen-du-kan-gora-var-och-nar-du-vill', + 'title': 'GUIDE: Sommarträning du kan göra var och när du vill', + }, + 'playlist_count': 7, + }, { + 'url': 'https://www.svt.se/nyheter/inrikes/ebba-busch-thor-kd-har-delvis-ratt-om-no-go-zoner', + 'info_dict': { + 'id': 'ebba-busch-thor-kd-har-delvis-ratt-om-no-go-zoner', + 'title': 'Ebba Busch Thor har bara delvis rätt om ”no-go-zoner”', + }, + 'playlist_count': 1, + }, { + # only programTitle + 'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun', + 'info_dict': { + 'id': '2900353', + 'ext': 'mp4', + 'title': 'Stjärnorna skojar till det - under SVT-intervjun', + 'duration': 27, + 'age_limit': 0, + }, + }, { + 'url': 'https://www.svt.se/nyheter/lokalt/vast/svt-testar-tar-nagon-upp-skrapet-1', + 'only_matching': True, + }, { + 'url': 'https://www.svt.se/vader/manadskronikor/maj2018', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if SVTIE.suitable(url) else super(SVTPageIE, cls).suitable(url) + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + entries = [ + self.url_result( + 'svt:%s' % video_id, ie=SVTPlayIE.ie_key(), video_id=video_id) + for video_id in orderedSet(re.findall( + r'data-video-id=["\'](\d+)', webpage))] + + title = strip_or_none(self._og_search_title(webpage, default=None)) + + return self.playlist_result(entries, playlist_id, title) diff -Nru 
youtube-dl-2016.02.22/youtube_dl/extractor/swrmediathek.py youtube-dl-2019.05.20/youtube_dl/extractor/swrmediathek.py --- youtube-dl-2016.02.22/youtube_dl/extractor/swrmediathek.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/swrmediathek.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,10 +1,12 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import parse_duration +from ..utils import ( + parse_duration, + int_or_none, + determine_protocol, +) class SWRMediathekIE(InfoExtractor): @@ -18,7 +20,7 @@ 'ext': 'mp4', 'title': 'SWR odysso', 'description': 'md5:2012e31baad36162e97ce9eb3f157b8a', - 'thumbnail': 're:^http:.*\.jpg$', + 'thumbnail': r're:^http:.*\.jpg$', 'duration': 2602, 'upload_date': '20140515', 'uploader': 'SWR Fernsehen', @@ -32,12 +34,13 @@ 'ext': 'mp4', 'title': 'Nachtcafé - Alltagsdroge Alkohol - zwischen Sektempfang und Komasaufen', 'description': 'md5:e0a3adc17e47db2c23aab9ebc36dbee2', - 'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', 'duration': 5305, 'upload_date': '20140516', 'uploader': 'SWR Fernsehen', 'uploader_id': '990030', }, + 'skip': 'redirect to http://swrmediathek.de/index.htm?hinweis=swrlink', }, { 'url': 'http://swrmediathek.de/player.htm?show=bba23e10-cb93-11e3-bf7f-0026b975f2e6', 'md5': '4382e4ef2c9d7ce6852535fa867a0dd3', @@ -46,59 +49,67 @@ 'ext': 'mp3', 'title': 'Saša Stanišic: Vor dem Fest', 'description': 'md5:5b792387dc3fbb171eb709060654e8c9', - 'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', 'duration': 3366, 'upload_date': '20140520', 'uploader': 'SWR 2', 'uploader_id': '284670', - } + }, + 'skip': 'redirect to http://swrmediathek.de/index.htm?hinweis=swrlink', }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) video = self._download_json( - 'http://swrmediathek.de/AjaxEntry?ekey=%s' % video_id, video_id, 'Downloading video JSON') + 'http://swrmediathek.de/AjaxEntry?ekey=%s' % video_id, + video_id, 'Downloading video JSON') attr = video['attr'] - media_type = attr['entry_etype'] + title = attr['entry_title'] + media_type = attr.get('entry_etype') formats = [] - for entry in video['sub']: - if entry['name'] != 'entry_media': + for entry in video.get('sub', []): + if entry.get('name') != 'entry_media': continue - entry_attr = entry['attr'] - codec = entry_attr['val0'] - quality = int(entry_attr['val1']) - - fmt = { - 'url': entry_attr['val2'], - 'quality': quality, - } - - if media_type == 'Video': - fmt.update({ - 'format_note': ['144p', '288p', '544p', '720p'][quality - 1], - 'vcodec': codec, - }) - elif media_type == 'Audio': - fmt.update({ - 'acodec': codec, + entry_attr = entry.get('attr', {}) + f_url = entry_attr.get('val2') + if not f_url: + continue + codec = entry_attr.get('val0') + if codec == 'm3u8': + formats.extend(self._extract_m3u8_formats( + f_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif codec == 'f4m': + formats.extend(self._extract_f4m_formats( + f_url + '?hdcore=3.7.0', video_id, + f4m_id='hds', fatal=False)) + else: + formats.append({ + 'format_id': determine_protocol({'url': f_url}), + 'url': f_url, + 'quality': int_or_none(entry_attr.get('val1')), + 'vcodec': codec if media_type == 'Video' else 'none', + 'acodec': codec if media_type == 'Audio' else None, }) - formats.append(fmt) - self._sort_formats(formats) + upload_date = None + entry_pdatet 
= attr.get('entry_pdatet') + if entry_pdatet: + upload_date = entry_pdatet[:-4] + return { 'id': video_id, - 'title': attr['entry_title'], - 'description': attr['entry_descl'], - 'thumbnail': attr['entry_image_16_9'], - 'duration': parse_duration(attr['entry_durat']), - 'upload_date': attr['entry_pdatet'][:-4], - 'uploader': attr['channel_title'], - 'uploader_id': attr['channel_idkey'], + 'title': title, + 'description': attr.get('entry_descl'), + 'thumbnail': attr.get('entry_image_16_9'), + 'duration': parse_duration(attr.get('entry_durat')), + 'upload_date': upload_date, + 'uploader': attr.get('channel_title'), + 'uploader_id': attr.get('channel_idkey'), 'formats': formats, } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/syfy.py youtube-dl-2019.05.20/youtube_dl/extractor/syfy.py --- youtube-dl-2016.02.22/youtube_dl/extractor/syfy.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/syfy.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,46 +1,58 @@ from __future__ import unicode_literals -import re +from .adobepass import AdobePassIE +from ..utils import ( + update_url_query, + smuggle_url, +) -from .common import InfoExtractor - - -class SyfyIE(InfoExtractor): - _VALID_URL = r'https?://www\.syfy\.com/(?:videos/.+?vid:(?P<id>[0-9]+)|(?!videos)(?P<video_name>[^/]+)(?:$|[?#]))' +class SyfyIE(AdobePassIE): + _VALID_URL = r'https?://(?:www\.)?syfy\.com/(?:[^/]+/)?videos/(?P<id>[^/?#]+)' _TESTS = [{ - 'url': 'http://www.syfy.com/videos/Robot%20Combat%20League/Behind%20the%20Scenes/vid:2631458', + 'url': 'http://www.syfy.com/theinternetruinedmylife/videos/the-internet-ruined-my-life-season-1-trailer', 'info_dict': { - 'id': 'NmqMrGnXvmO1', - 'ext': 'flv', - 'title': 'George Lucas has Advice for his Daughter', - 'description': 'Listen to what insights George Lucas give his daughter Amanda.', + 'id': '2968097', + 'ext': 'mp4', + 'title': 'The Internet Ruined My Life: Season 1 Trailer', + 'description': 'One tweet, one post, one click, can destroy everything.', + 'uploader': 'NBCU-MPAT', + 'upload_date': '20170113', + 'timestamp': 1484345640, }, - 'add_ie': ['ThePlatform'], - }, { - 'url': 'http://www.syfy.com/wilwheaton', - 'md5': '94dfa54ee3ccb63295b276da08c415f6', - 'info_dict': { - 'id': '4yoffOOXC767', - 'ext': 'flv', - 'title': 'The Wil Wheaton Project - Premiering May 27th at 10/9c.', - 'description': 'The Wil Wheaton Project premieres May 27th at 10/9c. 
Don\'t miss it.', + 'params': { + # m3u8 download + 'skip_download': True, }, 'add_ie': ['ThePlatform'], - 'skip': 'Blocked outside the US', }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_name = mobj.group('video_name') - if video_name: - generic_webpage = self._download_webpage(url, video_name) - video_id = self._search_regex( - r'', - generic_webpage, 'video ID') - url = 'http://www.syfy.com/videos/%s/%s/vid:%s' % ( - video_name, video_name, video_id) - else: - video_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) - return self.url_result(self._og_search_video_url(webpage)) + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + syfy_mpx = list(self._parse_json(self._search_regex( + r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', webpage, 'drupal settings'), + display_id)['syfy']['syfy_mpx'].values())[0] + video_id = syfy_mpx['mpxGUID'] + title = syfy_mpx['episodeTitle'] + query = { + 'mbr': 'true', + 'manifest': 'm3u', + } + if syfy_mpx.get('entitlement') == 'auth': + resource = self._get_mvpd_resource( + 'syfy', title, video_id, + syfy_mpx.get('mpxRating', 'TV-14')) + query['auth'] = self._extract_mvpd_auth( + url, video_id, 'syfy', resource) + + return { + '_type': 'url_transparent', + 'ie_key': 'ThePlatform', + 'url': smuggle_url(update_url_query( + self._proto_relative_url(syfy_mpx['releaseURL']), query), + {'force_smil_url': True}), + 'title': title, + 'id': video_id, + 'display_id': display_id, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/sztvhu.py youtube-dl-2019.05.20/youtube_dl/extractor/sztvhu.py --- youtube-dl-2016.02.22/youtube_dl/extractor/sztvhu.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/sztvhu.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,11 +1,11 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor class SztvHuIE(InfoExtractor): - _VALID_URL = r'http://(?:(?:www\.)?sztv\.hu|www\.tvszombathely\.hu)/(?:[^/]+)/.+-(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:(?:www\.)?sztv\.hu|www\.tvszombathely\.hu)/(?:[^/]+)/.+-(?P<id>[0-9]+)' _TEST = { 'url': 'http://sztv.hu/hirek/cserkeszek-nepszerusitettek-a-kornyezettudatos-eletmodot-a-savaria-teren-20130909', 'md5': 'a6df607b11fb07d0e9f2ad94613375cb', diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/tagesschau.py youtube-dl-2019.05.20/youtube_dl/extractor/tagesschau.py --- youtube-dl-2016.02.22/youtube_dl/extractor/tagesschau.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/tagesschau.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,46 +1,182 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import parse_filesize +from ..utils import ( + determine_ext, + js_to_json, + parse_iso8601, + parse_filesize, +) + + +class TagesschauPlayerIE(InfoExtractor): + IE_NAME = 'tagesschau:player' + _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?P<kind>audio|video)/(?P=kind)-(?P<id>\d+)~player(?:_[^/?#&]+)?\.html' + + _TESTS = [{ + 'url': 'http://www.tagesschau.de/multimedia/video/video-179517~player.html', + 'md5': '8d09548d5c15debad38bee3a4d15ca21', + 'info_dict': { + 'id': '179517', + 'ext': 'mp4', + 'title': 'Marie Kristin Boese, ARD Berlin, über den zukünftigen Kurs der AfD', + 'thumbnail': r're:^https?:.*\.jpg$', + 'formats': 'mincount:6', + }, + }, { + 'url': 
'https://www.tagesschau.de/multimedia/audio/audio-29417~player.html', + 'md5': '76e6eec6ebd40740671cf0a2c88617e5', + 'info_dict': { + 'id': '29417', + 'ext': 'mp3', + 'title': 'Trabi - Bye, bye Rennpappe', + 'thumbnail': r're:^https?:.*\.jpg$', + 'formats': 'mincount:2', + }, + }, { + 'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417~player_autoplay-true.html', + 'only_matching': True, + }] + + _FORMATS = { + 'xs': {'quality': 0}, + 's': {'width': 320, 'height': 180, 'quality': 1}, + 'm': {'width': 512, 'height': 288, 'quality': 2}, + 'l': {'width': 960, 'height': 540, 'quality': 3}, + 'xl': {'width': 1280, 'height': 720, 'quality': 4}, + 'xxl': {'quality': 5}, + } + + def _extract_via_api(self, kind, video_id): + info = self._download_json( + 'https://www.tagesschau.de/api/multimedia/{0}/{0}-{1}.json'.format(kind, video_id), + video_id) + title = info['headline'] + formats = [] + for media in info['mediadata']: + for format_id, format_url in media.items(): + if determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls')) + else: + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'vcodec': 'none' if kind == 'audio' else None, + }) + self._sort_formats(formats) + timestamp = parse_iso8601(info.get('date')) + return { + 'id': video_id, + 'title': title, + 'timestamp': timestamp, + 'formats': formats, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + # kind = mobj.group('kind').lower() + # if kind == 'video': + # return self._extract_via_api(kind, video_id) + + # JSON api does not provide some audio formats (e.g. ogg) thus + # extracting audio via webpage + + webpage = self._download_webpage(url, video_id) + + title = self._og_search_title(webpage).strip() + formats = [] + + for media_json in re.findall(r'({src\s*:\s*["\']http[^}]+type\s*:[^}]+})', webpage): + media = self._parse_json(js_to_json(media_json), video_id, fatal=False) + if not media: + continue + src = media.get('src') + if not src: + return + quality = media.get('quality') + kind = media.get('type', '').split('/')[0] + ext = determine_ext(src) + f = { + 'url': src, + 'format_id': '%s_%s' % (quality, ext) if quality else ext, + 'ext': ext, + 'vcodec': 'none' if kind == 'audio' else None, + } + f.update(self._FORMATS.get(quality, {})) + formats.append(f) + + self._sort_formats(formats) + + thumbnail = self._og_search_thumbnail(webpage) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'formats': formats, + } class TagesschauIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?:[^/]+/)*?[^/#?]+?(?P<id>-?[0-9]+)(?:~_[^/#?]+?)?\.html' + _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/(?P<path>[^/]+/(?:[^/]+/)*?(?P<id>[^/#?]+?(?:-?[0-9]+)?))(?:~_?[^/#?]+?)?\.html' _TESTS = [{ 'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html', - 'md5': '917a228bc7df7850783bc47979673a09', + 'md5': 'f7c27a0eff3bfe8c7727e65f8fe1b1e6', 'info_dict': { - 'id': '102143', + 'id': 'video-102143', 'ext': 'mp4', 'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt', - 'description': 'md5:171feccd9d9b3dd54d05d501568f6359', - 'thumbnail': 're:^https?:.*\.jpg$', + 'description': '18.07.2015 20:10 Uhr', + 'thumbnail': r're:^https?:.*\.jpg$', }, }, { 'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html', 'md5': '3c54c1f6243d279b706bde660ceec633', 'info_dict': { - 'id': '5727', + 
'id': 'ts-5727', 'ext': 'mp4', - 'description': 'md5:695c01bfd98b7e313c501386327aea59', 'title': 'Sendung: tagesschau \t04.12.2014 20:00 Uhr', - 'thumbnail': 're:^https?:.*\.jpg$', + 'description': 'md5:695c01bfd98b7e313c501386327aea59', + 'thumbnail': r're:^https?:.*\.jpg$', }, }, { - 'url': 'http://www.tagesschau.de/multimedia/politikimradio/audio-18407.html', - 'md5': 'aef45de271c4bf0a5db834aa40bf774c', + # exclusive audio + 'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417.html', + 'md5': '76e6eec6ebd40740671cf0a2c88617e5', 'info_dict': { - 'id': '18407', + 'id': 'audio-29417', 'ext': 'mp3', - 'title': 'Flüchtlingsdebatte: Hitzig, aber wenig hilfreich', - 'description': 'Flüchtlingsdebatte: Hitzig, aber wenig hilfreich', - 'thumbnail': 're:^https?:.*\.jpg$', + 'title': 'Trabi - Bye, bye Rennpappe', + 'description': 'md5:8687dda862cbbe2cfb2df09b56341317', + 'thumbnail': r're:^https?:.*\.jpg$', }, }, { + # audio in article + 'url': 'http://www.tagesschau.de/inland/bnd-303.html', + 'md5': 'e0916c623e85fc1d2b26b78f299d3958', + 'info_dict': { + 'id': 'bnd-303', + 'ext': 'mp3', + 'title': 'Viele Baustellen für neuen BND-Chef', + 'description': 'md5:1e69a54be3e1255b2b07cdbce5bcd8b4', + 'thumbnail': r're:^https?:.*\.jpg$', + }, + }, { + 'url': 'http://www.tagesschau.de/inland/afd-parteitag-135.html', + 'info_dict': { + 'id': 'afd-parteitag-135', + 'title': 'Möchtegern-Underdog mit Machtanspruch', + }, + 'playlist_count': 2, + }, { 'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html', 'only_matching': True, }, { @@ -61,88 +197,108 @@ }, { 'url': 'http://www.tagesschau.de/multimedia/video/video-102303~_bab-sendung-211.html', 'only_matching': True, + }, { + 'url': 'http://www.tagesschau.de/100sekunden/index.html', + 'only_matching': True, + }, { + # playlist article with collapsing sections + 'url': 'http://www.tagesschau.de/wirtschaft/faq-freihandelszone-eu-usa-101.html', + 'only_matching': True, }] - _FORMATS = { - 's': {'width': 256, 'height': 144, 'quality': 1}, - 'm': {'width': 512, 'height': 288, 'quality': 2}, - 'l': {'width': 960, 'height': 544, 'quality': 3}, - } + @classmethod + def suitable(cls, url): + return False if TagesschauPlayerIE.suitable(url) else super(TagesschauIE, cls).suitable(url) + + def _extract_formats(self, download_text, media_kind): + links = re.finditer( + r'<div class="button" title="(?P<title>[^"]*)"><a href="(?P<url>[^"]+)">(?P<name>.+?)</a></div>', + download_text) + formats = [] + for l in links: + link_url = l.group('url') + if not link_url: + continue + format_id = self._search_regex( + r'.*/[^/.]+\.([^/]+)\.[^/.]+$', link_url, 'format ID', + default=determine_ext(link_url)) + format = { + 'format_id': format_id, + 'url': l.group('url'), + 'format_name': l.group('name'), + } + title = l.group('title') + if title: + if media_kind.lower() == 'video': + m = re.match( + r'''(?x) + Video:\s*(?P<vcodec>[a-zA-Z0-9/._-]+)\s*&\#10; + (?P<width>[0-9]+)x(?P<height>[0-9]+)px&\#10; + (?P<vbr>[0-9]+)kbps&\#10; + Audio:\s*(?P<abr>[0-9]+)kbps,\s*(?P<audio_desc>[A-Za-z\.0-9]+)&\#10; + Größe:\s*(?P<filesize_approx>[0-9.,]+\s+[a-zA-Z]*B)''', + title) + if m: + format.update({ + 'format_note': m.group('audio_desc'), + 'vcodec': m.group('vcodec'), + 'width': int(m.group('width')), + 'height': int(m.group('height')), + 'abr': int(m.group('abr')), + 'vbr': int(m.group('vbr')), + 'filesize_approx': parse_filesize(m.group('filesize_approx')), + }) + else: + m = re.match( + r'(?P<format>.+?)-Format\s*:\s*(?P<abr>\d+)kbps\s*,\s*(?P<note>.+)', + title) + if m: + format.update({ + 'format_note': '%s, %s' % (m.group('format'), m.group('note')), + 'vcodec': 'none', + 'abr': int(m.group('abr')), + }) + formats.append(format) + 
self._sort_formats(formats) + return formats def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') or mobj.group('path') display_id = video_id.lstrip('-') + webpage = self._download_webpage(url, display_id)
- player_url = self._html_search_meta( - 'twitter:player', webpage, 'player URL', default=None) - if player_url: - playerpage = self._download_webpage( - player_url, display_id, 'Downloading player page') - - formats = [] - for media in re.finditer( - r'''(?x) - (?P<q_url>["\'])(?P<url>http://media.+?)(?P=q_url) - ,\s*type:(?P<q_type>["\'])(?P<type>video|audio)/(?P<ext>.+?)(?P=q_type) - (?:,\s*quality:(?P<q_quality>["\'])(?P<quality>.+?)(?P=q_quality))? - ''', playerpage): - url = media.group('url') - type_ = media.group('type') - ext = media.group('ext') - res = media.group('quality') - f = { - 'format_id': '%s_%s' % (res, ext) if res else ext, - 'url': url, - 'ext': ext, - 'vcodec': 'none' if type_ == 'audio' else None, - } - f.update(self._FORMATS.get(res, {})) - formats.append(f) - thumbnail = self._og_search_thumbnail(playerpage) - title = self._og_search_title(webpage).strip() - description = self._og_search_description(webpage).strip() - else: + title = self._html_search_regex( + r'<span[^>]*class="headline"[^>]*>(.+?)</span>', + webpage, 'title', default=None) or self._og_search_title(webpage)
+ + DOWNLOAD_REGEX = r'(?s)<p>Wir bieten dieses (?P<kind>Video|Audio) in folgenden Formaten zum Download an:</p>\s*<div class="controls">(?P<links>.*?)</div>\s*<p>' + + webpage_type = self._og_search_property('type', webpage, default=None) + if webpage_type == 'website': # Article + entries = [] + for num, (entry_title, media_kind, download_text) in enumerate(re.findall( + r'(?s)<p[^>]+class="infotext"[^>]*>\s*(?:<a[^>]+>)?\s*<strong>(.+?)</strong>.*?</p>.*?%s' % DOWNLOAD_REGEX, + webpage), 1): + entries.append({ + 'id': '%s-%d' % (display_id, num), + 'title': '%s' % entry_title, + 'formats': self._extract_formats(download_text, media_kind), + }) + if len(entries) > 1: + return self.playlist_result(entries, display_id, title) + formats = entries[0]['formats'] + else: # Assume single video download_text = self._search_regex( - r'(?s)<p>Wir bieten dieses Video in folgenden Formaten zum Download an:</p>\s*<div class="controls">(.*?)</div>\s*<p>', - webpage, 'download links') - links = re.finditer( - r'<div class="button" title="(?P<title>[^"]*)"><a href="(?P<url>[^"]+)">(?P<name>.+?)</a></div>', - download_text) - formats = [] - for l in links: - format_id = self._search_regex( - r'.*/[^/.]+\.([^/]+)\.[^/.]+', l.group('url'), 'format ID') - format = { - 'format_id': format_id, - 'url': l.group('url'), - 'format_name': l.group('name'), - } - m = re.match( - r'''(?x) - Video:\s*(?P<vcodec>[a-zA-Z0-9/._-]+)\s*&\#10; - (?P<width>[0-9]+)x(?P<height>[0-9]+)px&\#10; - (?P<vbr>[0-9]+)kbps&\#10; - Audio:\s*(?P<abr>[0-9]+)kbps,\s*(?P<audio_desc>[A-Za-z\.0-9]+)&\#10; - Größe:\s*(?P<filesize_approx>[0-9.,]+\s+[a-zA-Z]*B)''', - l.group('title')) - if m: - format.update({ - 'format_note': m.group('audio_desc'), - 'vcodec': m.group('vcodec'), - 'width': int(m.group('width')), - 'height': int(m.group('height')), - 'abr': int(m.group('abr')), - 'vbr': int(m.group('vbr')), - 'filesize_approx': parse_filesize(m.group('filesize_approx')), - }) - formats.append(format) - thumbnail = self._og_search_thumbnail(webpage) - description = self._html_search_regex( - r'(?s)<p class="teasertext">(.*?)</p>', - webpage, 'description', default=None) - title = self._html_search_regex( - r'<span class="headline".*?>(.*?)</span>', webpage, 'title') + DOWNLOAD_REGEX, webpage, 'download links', group='links') + media_kind = self._search_regex( + DOWNLOAD_REGEX, webpage, 'media kind', default='Video', group='kind') + formats = self._extract_formats(download_text, media_kind) + thumbnail = self._og_search_thumbnail(webpage) + description = self._html_search_regex( + r'(?s)<p class="teasertext">(.*?)</p>',
webpage, 'description', default=None) self._sort_formats(formats) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/tapely.py youtube-dl-2019.05.20/youtube_dl/extractor/tapely.py --- youtube-dl-2016.02.22/youtube_dl/extractor/tapely.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/tapely.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,109 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - clean_html, - ExtractorError, - float_or_none, - parse_iso8601, - sanitized_Request, -) - - -class TapelyIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:tape\.ly|tapely\.com)/(?P<id>[A-Za-z0-9\-_]+)(?:/(?P<songnr>\d+))?' - _API_URL = 'http://tape.ly/showtape?id={0:}' - _S3_SONG_URL = 'http://mytape.s3.amazonaws.com/{0:}' - _SOUNDCLOUD_SONG_URL = 'http://api.soundcloud.com{0:}' - _TESTS = [ - { - 'url': 'http://tape.ly/my-grief-as-told-by-water', - 'info_dict': { - 'id': 23952, - 'title': 'my grief as told by water', - 'thumbnail': 're:^https?://.*\.png$', - 'uploader_id': 16484, - 'timestamp': 1411848286, - 'description': 'For Robin and Ponkers, whom the tides of life have taken out to sea.', - }, - 'playlist_count': 13, - }, - { - 'url': 'http://tape.ly/my-grief-as-told-by-water/1', - 'md5': '79031f459fdec6530663b854cbc5715c', - 'info_dict': { - 'id': 258464, - 'title': 'Dreaming Awake (My Brightest Diamond)', - 'ext': 'm4a', - }, - }, - { - 'url': 'https://tapely.com/my-grief-as-told-by-water', - 'only_matching': True, - }, - ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('id') - - playlist_url = self._API_URL.format(display_id) - request = sanitized_Request(playlist_url) - request.add_header('X-Requested-With', 'XMLHttpRequest') - request.add_header('Accept', 'application/json') - request.add_header('Referer', url) - - playlist = self._download_json(request, display_id) - - tape = playlist['tape'] - - entries = [] - for s in tape['songs']: - song = s['song'] - entry = { - 'id': song['id'], - 'duration': float_or_none(song.get('songduration'), 1000), - 'title': song['title'], - } - if song['source'] == 'S3': - entry.update({ - 'url': self._S3_SONG_URL.format(song['filename']), - }) - entries.append(entry) - elif song['source'] == 'YT': - self.to_screen('YouTube video detected') - yt_id = song['filename'].replace('/youtube/', '') - entry.update(self.url_result(yt_id, 'Youtube', video_id=yt_id)) - entries.append(entry) - elif song['source'] == 'SC': - self.to_screen('SoundCloud song detected') - sc_url = self._SOUNDCLOUD_SONG_URL.format(song['filename']) - entry.update(self.url_result(sc_url, 'Soundcloud')) - entries.append(entry) - else: - self.report_warning('Unknown song source: %s' % song['source']) - - if mobj.group('songnr'): - songnr = int(mobj.group('songnr')) - 1 - try: - return entries[songnr] - except IndexError: - raise ExtractorError( - 'No song with index: %s' % mobj.group('songnr'), - expected=True) - - return { - '_type': 'playlist', - 'id': tape['id'], - 'display_id': display_id, - 'title': tape['name'], - 'entries': entries, - 'thumbnail': tape.get('image_url'), - 'description': clean_html(tape.get('subtext')), - 'like_count': tape.get('likescount'), - 'uploader_id': tape.get('user_id'), - 'timestamp': parse_iso8601(tape.get('published_at')), - } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/tass.py youtube-dl-2019.05.20/youtube_dl/extractor/tass.py --- 
youtube-dl-2016.02.22/youtube_dl/extractor/tass.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/tass.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import json @@ -21,7 +21,7 @@ 'ext': 'mp4', 'title': 'Посетителям московского зоопарка показали красную панду', 'description': 'Приехавшую из Дублина Зейну можно увидеть в павильоне "Кошки тропиков"', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, }, { diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/tastytrade.py youtube-dl-2019.05.20/youtube_dl/extractor/tastytrade.py --- youtube-dl-2016.02.22/youtube_dl/extractor/tastytrade.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/tastytrade.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,43 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from .ooyala import OoyalaIE + + +class TastyTradeIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tastytrade\.com/tt/shows/[^/]+/episodes/(?P<id>[^/?#&]+)' + + _TESTS = [{ + 'url': 'https://www.tastytrade.com/tt/shows/market-measures/episodes/correlation-in-short-volatility-06-28-2017', + 'info_dict': { + 'id': 'F3bnlzbToeI6pLEfRyrlfooIILUjz4nM', + 'ext': 'mp4', + 'title': 'A History of Teaming', + 'description': 'md5:2a9033db8da81f2edffa4c99888140b3', + 'duration': 422.255, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['Ooyala'], + }, { + 'url': 'https://www.tastytrade.com/tt/shows/daily-dose/episodes/daily-dose-06-30-2017', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + ooyala_code = self._search_regex( + r'data-media-id=(["\'])(?P<code>(?:(?!\1).)+)\1', + webpage, 'ooyala code', group='code') + + info = self._search_json_ld(webpage, display_id, fatal=False) + info.update({ + '_type': 'url_transparent', + 'ie_key': OoyalaIE.ie_key(), + 'url': 'ooyala:%s' % ooyala_code, + 'display_id': display_id, + }) + return info diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/tbs.py youtube-dl-2019.05.20/youtube_dl/extractor/tbs.py --- youtube-dl-2016.02.22/youtube_dl/extractor/tbs.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/tbs.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,89 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .turner import TurnerBaseIE +from ..compat import ( + compat_urllib_parse_urlparse, + compat_parse_qs, +) +from ..utils import ( + float_or_none, + int_or_none, + strip_or_none, +) + + +class TBSIE(TurnerBaseIE): + _VALID_URL = r'https?://(?:www\.)?(?P<site>tbs|tntdrama)\.com(?P<path>/(?:movies|shows/[^/]+/(?:clips|season-\d+/episode-\d+))/(?P<id>[^/?#]+))' + _TESTS = [{ + 'url': 'http://www.tntdrama.com/shows/the-alienist/clips/monster', + 'info_dict': { + 'id': '8d384cde33b89f3a43ce5329de42903ed5099887', + 'ext': 'mp4', + 'title': 'Monster', + 'description': 'Get a first look at the theatrical trailer for TNT’s highly anticipated new psychological thriller The Alienist, which premieres January 22 on TNT.', + 'timestamp': 1508175329, + 'upload_date': '20171016', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + }, { + 'url': 'http://www.tbs.com/shows/search-party/season-1/episode-1/explicit-the-mysterious-disappearance-of-the-girl-no-one-knew', + 'only_matching': True, + }, { + 'url': 
'http://www.tntdrama.com/movies/star-wars-a-new-hope', + 'only_matching': True, + }] + + def _real_extract(self, url): + site, path, display_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, display_id) + drupal_settings = self._parse_json(self._search_regex( + r'<script[^>]+?data-drupal-selector="drupal-settings-json"[^>]*?>({.+?})</script>', + webpage, 'drupal setting'), display_id) + video_data = next(v for v in drupal_settings['turner_playlist'] if v.get('url') == path) + + media_id = video_data['mediaID'] + title = video_data['title'] + tokenizer_query = compat_parse_qs(compat_urllib_parse_urlparse( + drupal_settings['ngtv_token_url']).query) + + info = self._extract_ngtv_info( + media_id, tokenizer_query, { + 'url': url, + 'site_name': site[:3].upper(), + 'auth_required': video_data.get('authRequired') == '1', + }) + + thumbnails = [] + for image_id, image in video_data.get('images', {}).items(): + image_url = image.get('url') + if not image_url or image.get('type') != 'video': + continue + i = { + 'id': image_id, + 'url': image_url, + } + mobj = re.search(r'(\d+)x(\d+)', image_url) + if mobj: + i.update({ + 'width': int(mobj.group(1)), + 'height': int(mobj.group(2)), + }) + thumbnails.append(i) + + info.update({ + 'id': media_id, + 'title': title, + 'description': strip_or_none(video_data.get('descriptionNoTags') or video_data.get('shortDescriptionNoTags')), + 'duration': float_or_none(video_data.get('duration')) or info.get('duration'), + 'timestamp': int_or_none(video_data.get('created')), + 'season_number': int_or_none(video_data.get('season')), + 'episode_number': int_or_none(video_data.get('episode')), + 'thumbnails': thumbnails, + }) + return info diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/tdslifeway.py youtube-dl-2019.05.20/youtube_dl/extractor/tdslifeway.py --- youtube-dl-2016.02.22/youtube_dl/extractor/tdslifeway.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/tdslifeway.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,33 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class TDSLifewayIE(InfoExtractor): + _VALID_URL = r'https?://tds\.lifeway\.com/v1/trainingdeliverysystem/courses/(?P<id>\d+)/index\.html' + + _TEST = { + # From http://www.ministrygrid.com/training-viewer/-/training/t4g-2014-conference/the-gospel-by-numbers-4/the-gospel-by-numbers + 'url': 
'http://tds.lifeway.com/v1/trainingdeliverysystem/courses/3453494717001/index.html?externalRegistration=AssetId%7C34F466F1-78F3-4619-B2AB-A8EFFA55E9E9%21InstanceId%7C0%21UserId%7Caaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa&grouping=http%3A%2F%2Flifeway.com%2Fvideo%2F3453494717001&activity_id=http%3A%2F%2Flifeway.com%2Fvideo%2F3453494717001&content_endpoint=http%3A%2F%2Ftds.lifeway.com%2Fv1%2Ftrainingdeliverysystem%2FScormEngineInterface%2FTCAPI%2Fcontent%2F&actor=%7B%22name%22%3A%5B%22Guest%20Guest%22%5D%2C%22account%22%3A%5B%7B%22accountServiceHomePage%22%3A%22http%3A%2F%2Fscorm.lifeway.com%2F%22%2C%22accountName%22%3A%22aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa%22%7D%5D%2C%22objectType%22%3A%22Agent%22%7D&content_token=462a50b2-b6f9-4970-99b1-930882c499fb&registration=93d6ec8e-7f7b-4ed3-bbc8-a857913c0b2a&externalConfiguration=access%7CFREE%21adLength%7C-1%21assignOrgId%7C4AE36F78-299A-425D-91EF-E14A899B725F%21assignOrgParentId%7C%21courseId%7C%21isAnonymous%7Cfalse%21previewAsset%7Cfalse%21previewLength%7C-1%21previewMode%7Cfalse%21royalty%7CFREE%21sessionId%7C671422F9-8E79-48D4-9C2C-4EE6111EA1CD%21trackId%7C&auth=Basic%20OjhmZjk5MDBmLTBlYTMtNDJhYS04YjFlLWE4MWQ3NGNkOGRjYw%3D%3D&endpoint=http%3A%2F%2Ftds.lifeway.com%2Fv1%2Ftrainingdeliverysystem%2FScormEngineInterface%2FTCAPI%2F', + 'info_dict': { + 'id': '3453494717001', + 'ext': 'mp4', + 'title': 'The Gospel by Numbers', + 'thumbnail': r're:^https?://.*\.jpg', + 'upload_date': '20140410', + 'description': 'Coming soon from T4G 2014!', + 'uploader_id': '2034960640001', + 'timestamp': 1397145591, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': ['BrightcoveNew'], + } + + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/2034960640001/default_default/index.html?videoId=%s' + + def _real_extract(self, url): + brightcove_id = self._match_id(url) + return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/teachable.py youtube-dl-2019.05.20/youtube_dl/extractor/teachable.py --- youtube-dl-2016.02.22/youtube_dl/extractor/teachable.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/teachable.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,259 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .wistia import WistiaIE +from ..compat import compat_str +from ..utils import ( + clean_html, + ExtractorError, + get_element_by_class, + urlencode_postdata, + urljoin, +) + + +class TeachableBaseIE(InfoExtractor): + _NETRC_MACHINE = 'teachable' + _URL_PREFIX = 'teachable:' + + _SITES = { + # Only notable ones here + 'upskillcourses.com': 'upskill', + 'academy.gns3.com': 'gns3', + 'academyhacker.com': 'academyhacker', + 'stackskills.com': 'stackskills', + 'market.saleshacker.com': 'saleshacker', + 'learnability.org': 'learnability', + 'edurila.com': 'edurila', + 'courses.workitdaily.com': 'workitdaily', + } + + _VALID_URL_SUB_TUPLE = (_URL_PREFIX, '|'.join(re.escape(site) for site in _SITES.keys())) + + def _real_initialize(self): + self._logged_in = False + + def _login(self, site): + if self._logged_in: + return + + username, password = self._get_login_info( + netrc_machine=self._SITES.get(site, site)) + if username is None: + return + + login_page, urlh = self._download_webpage_handle( + 'https://%s/sign_in' % site, None, + 'Downloading %s login page' % site) + + login_url = compat_str(urlh.geturl()) + + login_form = self._hidden_inputs(login_page) + 
login_form.update({ + 'user[email]': username, + 'user[password]': password, + }) + + post_url = self._search_regex( + r'<form[^>]+action=(["\'])(?P<url>(?:(?!\1).)+)\1', login_page, + 'post url', default=login_url, group='url') + + if not post_url.startswith('http'): + post_url = urljoin(login_url, post_url) + + response = self._download_webpage( + post_url, None, 'Logging in to %s' % site, + data=urlencode_postdata(login_form), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': login_url, + }) + + if '>I accept the new Privacy Policy<' in response: + raise ExtractorError( + 'Unable to login: %s asks you to accept new Privacy Policy. ' + 'Go to https://%s/ and accept.' % (site, site), expected=True) + + # Successful login + if any(re.search(p, response) for p in ( + r'class=["\']user-signout', + r'<a[^>]+\bhref=["\']/sign_out', + r'>\s*Log out\s*<')): + self._logged_in = True + return + + message = get_element_by_class('alert', response) + if message is not None: + raise ExtractorError( + 'Unable to login: %s' % clean_html(message), expected=True) + + raise ExtractorError('Unable to log in') + + +class TeachableIE(TeachableBaseIE): + _VALID_URL = r'''(?x) + (?: + %shttps?://(?P<site_t>[^/]+)| + https?://(?:www\.)?(?P<site>%s) + ) + /courses/[^/]+/lectures/(?P<id>\d+) + ''' % TeachableBaseIE._VALID_URL_SUB_TUPLE + + _TESTS = [{ + 'url': 'http://upskillcourses.com/courses/essential-web-developer-course/lectures/1747100', + 'info_dict': { + 'id': 'uzw6zw58or', + 'ext': 'mp4', + 'title': 'Welcome to the Course!', + 'description': 'md5:65edb0affa582974de4625b9cdea1107', + 'duration': 138.763, + 'timestamp': 1479846621, + 'upload_date': '20161122', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://upskillcourses.com/courses/119763/lectures/1747100', + 'only_matching': True, + }, { + 'url': 'https://academy.gns3.com/courses/423415/lectures/6885939', + 'only_matching': True, + }, { + 'url': 'teachable:https://upskillcourses.com/courses/essential-web-developer-course/lectures/1747100', + 'only_matching': True, + }] + + @staticmethod + def _is_teachable(webpage): + return 'teachableTracker.linker:autoLink' in webpage and re.search( + r'<link[^>]+href=["\']https?://process\.fs\.teachablecdn\.com', + webpage) + + @staticmethod + def _extract_url(webpage, source_url): + if not TeachableIE._is_teachable(webpage): + return + if re.match(r'https?://[^/]+/(?:courses|p)', source_url): + return '%s%s' % (TeachableBaseIE._URL_PREFIX, source_url) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + site = mobj.group('site') or mobj.group('site_t') + video_id = mobj.group('id') + + self._login(site) + + prefixed = url.startswith(self._URL_PREFIX) + if prefixed: + url = url[len(self._URL_PREFIX):] + + webpage = self._download_webpage(url, video_id) + + wistia_url = WistiaIE._extract_url(webpage) + if not wistia_url: + if any(re.search(p, webpage) for p in ( + r'class=["\']lecture-contents-locked', + r'>\s*Lecture contents locked', + r'id=["\']lecture-locked')): + self.raise_login_required('Lecture contents locked') + + title = self._og_search_title(webpage, default=None) + + return { + '_type': 'url_transparent', + 'url': wistia_url, + 'ie_key': WistiaIE.ie_key(), + 'title': title, + } + + +class TeachableCourseIE(TeachableBaseIE): + _VALID_URL = r'''(?x) + (?: + %shttps?://(?P<site_t>[^/]+)| + https?://(?:www\.)?(?P<site>%s) + ) + /(?:courses|p)/(?:enrolled/)?(?P<id>[^/?#&]+) + ''' % TeachableBaseIE._VALID_URL_SUB_TUPLE + _TESTS = [{ + 'url': 
'http://upskillcourses.com/courses/essential-web-developer-course/', + 'info_dict': { + 'id': 'essential-web-developer-course', + 'title': 'The Essential Web Developer Course (Free)', + }, + 'playlist_count': 192, + }, { + 'url': 'http://upskillcourses.com/courses/119763/', + 'only_matching': True, + }, { + 'url': 'http://upskillcourses.com/courses/enrolled/119763', + 'only_matching': True, + }, { + 'url': 'https://academy.gns3.com/courses/enrolled/423415', + 'only_matching': True, + }, { + 'url': 'teachable:https://learn.vrdev.school/p/gear-vr-developer-mini', + 'only_matching': True, + }, { + 'url': 'teachable:https://filmsimplified.com/p/davinci-resolve-15-crash-course', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if TeachableIE.suitable(url) else super( + TeachableCourseIE, cls).suitable(url) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + site = mobj.group('site') or mobj.group('site_t') + course_id = mobj.group('id') + + self._login(site) + + prefixed = url.startswith(self._URL_PREFIX) + if prefixed: + prefix = self._URL_PREFIX + url = url[len(prefix):] + + webpage = self._download_webpage(url, course_id) + + url_base = 'https://%s/' % site + + entries = [] + + for mobj in re.finditer( + r'(?s)(?P
<li><li[^>]+class=(["\'])(?:(?!\2).)*?section-item[^>]+>.+?</li>)', + webpage):
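The (["\'])(?:(?!\2).)*? idiom completed above captures whichever quote character opened the attribute, then uses a negative lookahead so the lazy dot only consumes characters that are not that same quote. A quick standalone check of the section-item pattern, run against an invented markup fragment:

    import re

    # \2 is the captured opening quote; (?:(?!\2).)*? lazily matches
    # anything up to (but not including) that same quote again.
    LI_RE = re.compile(
        r'(?s)(?P<li><li[^>]+class=(["\'])(?:(?!\2).)*?section-item[^>]+>.+?</li>)')

    sample = '<li id="l1" class="row section-item"><a href="/lectures/1747100">Intro</a></li>'
    print(LI_RE.search(sample).group('li'))  # prints the whole <li> element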
+ li = mobj.group('li') + if 'fa-youtube-play' not in li: + continue + lecture_url = self._search_regex( + r'<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1', li, + 'lecture url', default=None, group='url') + if not lecture_url: + continue + lecture_id = self._search_regex( + r'/lectures/(\d+)', lecture_url, 'lecture id', default=None) + title = self._html_search_regex( + r'<span[^>]+class=["\']lecture-name[^>]+>([^<]+)', li, + 'title', default=None) + entry_url = urljoin(url_base, lecture_url) + if prefixed: + entry_url = self._URL_PREFIX + entry_url + entries.append( + self.url_result( + entry_url, + ie=TeachableIE.ie_key(), video_id=lecture_id, + video_title=clean_html(title))) + + course_title = self._html_search_regex( + (r'(?s)<img[^>]+class=["\']course-image[^>]+>\s*<h\d>(.+?)</h', + r'(?s)<h\d[^>]+class=["\']course-title[^>]+>(.+?)</h'), + webpage, 'course title', fatal=False) + + return self.playlist_result(entries, course_id, course_title) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/teachertube.py youtube-dl-2019.05.20/youtube_dl/extractor/teachertube.py --- youtube-dl-2016.02.22/youtube_dl/extractor/teachertube.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/teachertube.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,22 +1,24 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( determine_ext, + ExtractorError, qualities, ) class TeacherTubeIE(InfoExtractor): IE_NAME = 'teachertube' IE_DESC = 'teachertube.com videos' _VALID_URL = r'https?://(?:www\.)?teachertube\.com/(viewVideo\.php\?video_id=|music\.php\?music_id=|video/(?:[\da-z-]+-)?|audio/)(?P<id>\d+)' _TESTS = [{ + # flowplayer 'url': 'http://www.teachertube.com/viewVideo.php?video_id=339997', 'md5': 'f9434ef992fd65936d72999951ee254c', 'info_dict': { @@ -24,19 +26,10 @@ 'ext': 'mp4', 'title': 'Measures of dispersion from a frequency table', 'description': 'Measures of dispersion from a frequency table', - 'thumbnail': 're:http://.*\.jpg', - }, - }, { - 'url': 'http://www.teachertube.com/viewVideo.php?video_id=340064', - 'md5': '0d625ec6bc9bf50f70170942ad580676', - 'info_dict': { - 'id': '340064', - 'ext': 'mp4', - 'title': 'How to Make Paper Dolls _ Paper Art Projects', - 'description': 'Learn how to make paper dolls in this simple', - 'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:https?://.*\.(?:jpg|png)', }, }, { + # jwplayer 'url': 'http://www.teachertube.com/music.php?music_id=8805', 'md5': '01e8352006c65757caf7b961f6050e21', 'info_dict': { @@ -46,20 +39,21 @@ 'description': 'RADIJSKA EMISIJA ZRAKOPLOVNE TEHNI?KE ?KOLE P', }, }, { + # unavailable video 'url': 'http://www.teachertube.com/video/intro-video-schleicher-297790', - 'md5': '9c79fbb2dd7154823996fc28d4a26998', - 'info_dict': { - 'id': '297790', - 'ext': 'mp4', - 'title': 'Intro Video - Schleicher', - 'description': 'Intro Video - Why to flip, how flipping will', - }, + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + error = self._search_regex( + r'<div[^>]+\bclass=["\']msgBox error[^>]+>([^<]+)', webpage, + 'error', default=None) + if error: + raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) + title = self._html_search_meta('title', webpage, 'title', fatal=True) TITLE_SUFFIX = ' - TeacherTube' if title.endswith(TITLE_SUFFIX): @@ -84,12 +78,16 @@ self._sort_formats(formats) + thumbnail = self._og_search_thumbnail( + webpage, default=None) or self._html_search_meta( + 'thumbnail', webpage) + return { 'id': video_id, 'title': title, - 'thumbnail': self._html_search_regex(r'\'image\'\s*:\s*["\']([^"\']+)["\']', webpage, 'thumbnail'), - 'formats': formats, 'description': description, + 'thumbnail': thumbnail, + 'formats': formats, } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/teachingchannel.py youtube-dl-2019.05.20/youtube_dl/extractor/teachingchannel.py --- youtube-dl-2016.02.22/youtube_dl/extractor/teachingchannel.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/teachingchannel.py 2019-06-07 18:49:07.000000000 +0000 @@ -7,10 +7,11 @@ class TeachingChannelIE(InfoExtractor): - _VALID_URL = r'https?://www\.teachingchannel\.org/videos/(?P<title>.+)' + _VALID_URL = r'https?://(?:www\.)?teachingchannel\.org/videos/(?P<title>.+)' _TEST = { 'url': 
'https://www.teachingchannel.org/videos/teacher-teaming-evolution', + 'md5': '3d6361864d7cac20b57c8784da17166f', 'info_dict': { 'id': 'F3bnlzbToeI6pLEfRyrlfooIILUjz4nM', 'ext': 'mp4', @@ -19,9 +20,9 @@ 'duration': 422.255, }, 'params': { - # m3u8 download 'skip_download': True, }, + 'add_ie': ['Ooyala'], } def _real_extract(self, url): diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/teamcoco.py youtube-dl-2019.05.20/youtube_dl/extractor/teamcoco.py --- youtube-dl-2016.02.22/youtube_dl/extractor/teamcoco.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/teamcoco.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,33 +1,34 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals -import base64 -import binascii -import re import json -from .common import InfoExtractor +from .turner import TurnerBaseIE from ..utils import ( + determine_ext, ExtractorError, + int_or_none, + mimetype2ext, + parse_duration, + parse_iso8601, qualities, - determine_ext, ) -from ..compat import compat_ord -class TeamcocoIE(InfoExtractor): - _VALID_URL = r'http://teamcoco\.com/video/(?P<video_id>[0-9]+)?/?(?P<display_id>.*)' +class TeamcocoIE(TurnerBaseIE): + _VALID_URL = r'https?://(?:\w+\.)?teamcoco\.com/(?P<id>([^/]+/)*[^/?#]+)' _TESTS = [ { - 'url': 'http://teamcoco.com/video/80187/conan-becomes-a-mary-kay-beauty-consultant', - 'md5': '3f7746aa0dc86de18df7539903d399ea', + 'url': 'http://teamcoco.com/video/mary-kay-remote', + 'md5': '55d532f81992f5c92046ad02fec34d7d', 'info_dict': { 'id': '80187', 'ext': 'mp4', 'title': 'Conan Becomes A Mary Kay Beauty Consultant', 'description': 'Mary Kay is perhaps the most trusted name in female beauty, so of course Conan is a natural choice to sell their products.', - 'duration': 504, - 'age_limit': 0, + 'duration': 495.0, + 'upload_date': '20140402', + 'timestamp': 1396407600, } }, { 'url': 'http://teamcoco.com/video/louis-ck-interview-george-w-bush', @@ -38,7 +39,8 @@ 'description': 'Louis C.K. got starstruck by George W. Bush, so what? Part one.', 'title': 'Louis C.K. Interview Pt. 
1 11/3/11', 'duration': 288, - 'age_limit': 0, + 'upload_date': '20111104', + 'timestamp': 1320405840, } }, { 'url': 'http://teamcoco.com/video/timothy-olyphant-drinking-whiskey', @@ -47,6 +49,8 @@ 'ext': 'mp4', 'title': 'Timothy Olyphant Raises A Toast To “Justified”', 'description': 'md5:15501f23f020e793aeca761205e42c24', + 'upload_date': '20150415', + 'timestamp': 1429088400, }, 'params': { 'skip_download': True, # m3u8 downloads @@ -61,110 +65,135 @@ }, 'params': { 'skip_download': True, # m3u8 downloads - } + }, + 'skip': 'This video is no longer available.', + }, { + 'url': 'http://teamcoco.com/video/the-conan-audiencey-awards-for-04/25/18', + 'only_matching': True, + }, { + 'url': 'http://teamcoco.com/italy/conan-jordan-schlansky-hit-the-streets-of-florence', + 'only_matching': True, + }, { + 'url': 'http://teamcoco.com/haiti/conan-s-haitian-history-lesson', + 'only_matching': True, + }, { + 'url': 'http://teamcoco.com/israel/conan-hits-the-streets-beaches-of-tel-aviv', + 'only_matching': True, + }, { + 'url': 'https://conan25.teamcoco.com/video/ice-cube-kevin-hart-conan-share-lyft', + 'only_matching': True, } ] - _VIDEO_ID_REGEXES = ( - r'"eVar42"\s*:\s*(\d+)', - r'Ginger\.TeamCoco\.openInApp\("video",\s*"([^"]+)"', - r'"id_not"\s*:\s*(\d+)' - ) - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) + def _graphql_call(self, query_template, object_type, object_id): + find_object = 'find' + object_type + return self._download_json( + 'https://teamcoco.com/graphql', object_id, data=json.dumps({ + 'query': query_template % (find_object, object_id) + }).encode(), headers={ + 'Content-Type': 'application/json', + })['data'][find_object] - display_id = mobj.group('display_id') - webpage, urlh = self._download_webpage_handle(url, display_id) - if 'src=expired' in urlh.geturl(): - raise ExtractorError('This video is expired.', expected=True) - - video_id = mobj.group('video_id') - if not video_id: - video_id = self._html_search_regex( - self._VIDEO_ID_REGEXES, webpage, 'video id') - - data = None - - preload_codes = self._html_search_regex( - r'(function.+)setTimeout\(function\(\)\{playlist', - webpage, 'preload codes') - base64_fragments = re.findall(r'"([a-zA-z0-9+/=]+)"', preload_codes) - base64_fragments.remove('init') - - def _check_sequence(cur_fragments): - if not cur_fragments: - return - for i in range(len(cur_fragments)): - cur_sequence = (''.join(cur_fragments[i:] + cur_fragments[:i])).encode('ascii') - try: - raw_data = base64.b64decode(cur_sequence) - if compat_ord(raw_data[0]) == compat_ord('{'): - return json.loads(raw_data.decode('utf-8')) - except (TypeError, binascii.Error, UnicodeDecodeError, ValueError): - continue + def _real_extract(self, url): + display_id = self._match_id(url) - def _check_data(): - for i in range(len(base64_fragments) + 1): - for j in range(i, len(base64_fragments) + 1): - data = _check_sequence(base64_fragments[:i] + base64_fragments[j:]) - if data: - return data - - self.to_screen('Try to compute possible data sequence. 
This may take some time.') - data = _check_data() - - if not data: - raise ExtractorError( - 'Preload information could not be extracted', expected=True) - - formats = [] - get_quality = qualities(['500k', '480p', '1000k', '720p', '1080p']) - for filed in data['files']: - if determine_ext(filed['url']) == 'm3u8': - # compat_urllib_parse.urljoin does not work here - if filed['url'].startswith('/'): - m3u8_url = 'http://ht.cdn.turner.com/tbs/big/teamcoco' + filed['url'] - else: - m3u8_url = filed['url'] - m3u8_formats = self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4') - for m3u8_format in m3u8_formats: - if m3u8_format not in formats: - formats.append(m3u8_format) - elif determine_ext(filed['url']) == 'f4m': - # TODO Correct f4m extraction - continue - else: - if filed['url'].startswith('/mp4:protected/'): - # TODO Correct extraction for these files - continue - m_format = re.search(r'(\d+(k|p))\.mp4', filed['url']) - if m_format is not None: - format_id = m_format.group(1) - else: - format_id = filed['bitrate'] - tbr = ( - int(filed['bitrate']) - if filed['bitrate'].isdigit() - else None) - - formats.append({ - 'url': filed['url'], - 'ext': 'mp4', - 'tbr': tbr, - 'format_id': format_id, - 'quality': get_quality(format_id), - }) + response = self._graphql_call('''{ + %s(slug: "%s") { + ... on RecordSlug { + record { + id + title + teaser + publishOn + thumb { + preview + } + file { + url + } + tags { + name + } + duration + turnerMediaId + turnerMediaAuthToken + } + } + ... on NotFoundSlug { + status + } + } +}''', 'Slug', display_id) + if response.get('status'): + raise ExtractorError('This video is no longer available.', expected=True) - self._sort_formats(formats) + record = response['record'] + video_id = record['id'] - return { + info = { 'id': video_id, 'display_id': display_id, - 'formats': formats, - 'title': data['title'], - 'thumbnail': data.get('thumb', {}).get('href'), - 'description': data.get('teaser'), - 'duration': data.get('duration'), - 'age_limit': self._family_friendly_search(webpage), + 'title': record['title'], + 'thumbnail': record.get('thumb', {}).get('preview'), + 'description': record.get('teaser'), + 'duration': parse_duration(record.get('duration')), + 'timestamp': parse_iso8601(record.get('publishOn')), } + + media_id = record.get('turnerMediaId') + if media_id: + self._initialize_geo_bypass({ + 'countries': ['US'], + }) + info.update(self._extract_ngtv_info(media_id, { + 'accessToken': record['turnerMediaAuthToken'], + 'accessTokenType': 'jws', + })) + else: + d = self._download_json( + 'https://teamcoco.com/_truman/d/' + video_id, + video_id, fatal=False) or {} + video_sources = d.get('meta') or {} + if not video_sources: + video_sources = self._graphql_call('''{ + %s(id: "%s") { + src + } +}''', 'RecordVideoSource', video_id) or {} + + formats = [] + get_quality = qualities(['low', 'sd', 'hd', 'uhd']) + for format_id, src in video_sources.get('src', {}).items(): + if not isinstance(src, dict): + continue + src_url = src.get('src') + if not src_url: + continue + ext = determine_ext(src_url, mimetype2ext(src.get('type'))) + if format_id == 'hls' or ext == 'm3u8': + # compat_urllib_parse.urljoin does not work here + if src_url.startswith('/'): + src_url = 'http://ht.cdn.turner.com/tbs/big/teamcoco' + src_url + formats.extend(self._extract_m3u8_formats( + src_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)) + else: + if src_url.startswith('/mp4:protected/'): + # TODO Correct extraction for these files + continue + tbr = 
int_or_none(self._search_regex( + r'(\d+)k\.mp4', src_url, 'tbr', default=None)) + + formats.append({ + 'url': src_url, + 'ext': ext, + 'tbr': tbr, + 'format_id': format_id, + 'quality': get_quality(format_id), + }) + if not formats: + formats = self._extract_m3u8_formats( + record['file']['url'], video_id, 'mp4', fatal=False) + self._sort_formats(formats) + info['formats'] = formats + + return info diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/teamtreehouse.py youtube-dl-2019.05.20/youtube_dl/extractor/teamtreehouse.py --- youtube-dl-2016.02.22/youtube_dl/extractor/teamtreehouse.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/teamtreehouse.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,140 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + determine_ext, + ExtractorError, + float_or_none, + get_element_by_class, + get_element_by_id, + parse_duration, + remove_end, + urlencode_postdata, + urljoin, +) + + +class TeamTreeHouseIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?teamtreehouse\.com/library/(?P<id>[^/]+)' + _TESTS = [{ + # Course + 'url': 'https://teamtreehouse.com/library/introduction-to-user-authentication-in-php', + 'info_dict': { + 'id': 'introduction-to-user-authentication-in-php', + 'title': 'Introduction to User Authentication in PHP', + 'description': 'md5:405d7b4287a159b27ddf30ca72b5b053', + }, + 'playlist_mincount': 24, + }, { + # WorkShop + 'url': 'https://teamtreehouse.com/library/deploying-a-react-app', + 'info_dict': { + 'id': 'deploying-a-react-app', + 'title': 'Deploying a React App', + 'description': 'md5:10a82e3ddff18c14ac13581c9b8e5921', + }, + 'playlist_mincount': 4, + }, { + # Video + 'url': 'https://teamtreehouse.com/library/application-overview-2', + 'info_dict': { + 'id': 'application-overview-2', + 'ext': 'mp4', + 'title': 'Application Overview', + 'description': 'md5:4b0a234385c27140a4378de5f1e15127', + }, + 'expected_warnings': ['This is just a preview'], + }] + _NETRC_MACHINE = 'teamtreehouse' + + def _real_initialize(self): + email, password = self._get_login_info() + if email is None: + return + + signin_page = self._download_webpage( + 'https://teamtreehouse.com/signin', + None, 'Downloading signin page') + data = self._form_hidden_inputs('new_user_session', signin_page) + data.update({ + 'user_session[email]': email, + 'user_session[password]': password, + }) + error_message = get_element_by_class('error-message', self._download_webpage( + 'https://teamtreehouse.com/person_session', + None, 'Logging in', data=urlencode_postdata(data))) + if error_message: + raise ExtractorError(clean_html(error_message), expected=True) + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + title = self._html_search_meta(['og:title', 'twitter:title'], webpage) + description = self._html_search_meta( + ['description', 'og:description', 'twitter:description'], webpage) + entries = self._parse_html5_media_entries(url, webpage, display_id) + if entries: + info = entries[0] + + for subtitles in info.get('subtitles', {}).values(): + for subtitle in subtitles: + subtitle['ext'] = determine_ext(subtitle['url'], 'srt') + + is_preview = 'data-preview="true"' in webpage + if is_preview: + self.report_warning( + 'This is just a preview. 
You need to be signed in with a Basic account to download the entire video.', display_id) + duration = 30 + else: + duration = float_or_none(self._search_regex( + r'data-duration="(\d+)"', webpage, 'duration'), 1000) + if not duration: + duration = parse_duration(get_element_by_id( + 'video-duration', webpage)) + + info.update({ + 'id': display_id, + 'title': title, + 'description': description, + 'duration': duration, + }) + return info + else: + def extract_urls(html, extract_info=None): + for path in re.findall(r'<a[^>]+href="([^"]+)"', html): + page_url = urljoin(url, path) + entry = { + '_type': 'url_transparent', + 'id': self._match_id(page_url), + 'url': page_url, + 'id_key': self.ie_key(), + } + if extract_info: + entry.update(extract_info) + entries.append(entry) + + workshop_videos = self._search_regex( + r'(?s)<ul[^>]+id="workshop-videos"[^>]*>(.+?)</ul>', + webpage, 'workshop videos', default=None) + if workshop_videos: + extract_urls(workshop_videos) + else: + stages_path = self._search_regex( + r'(?s)<div[^>]+id="syllabus-stages"[^>]+data-url="([^"]+)"', + webpage, 'stages path') + if stages_path: + stages_page = self._download_webpage( + urljoin(url, stages_path), display_id, 'Downloading stages page') + for chapter_number, (chapter, steps_list) in enumerate(re.findall(r'(?s)<h2[^>]*>\s*(.+?)\s*</h2>.+?<ul[^>]*>(.+?)</ul>', stages_page), 1): + extract_urls(steps_list, { + 'chapter': chapter, + 'chapter_number': chapter_number, + }) + title = remove_end(title, ' Course') + + return self.playlist_result( + entries, display_id, title, description) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/techtalks.py youtube-dl-2019.05.20/youtube_dl/extractor/techtalks.py --- youtube-dl-2016.02.22/youtube_dl/extractor/techtalks.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/techtalks.py 2019-06-07 18:49:07.000000000 +0000 @@ -10,9 +10,9 @@ class TechTalksIE(InfoExtractor): - _VALID_URL = r'https?://techtalks\.tv/talks/[^/]*/(?P<id>\d+)/' + _VALID_URL = r'https?://techtalks\.tv/talks/(?:[^/]+/)?(?P<id>\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://techtalks.tv/talks/learning-topic-models-going-beyond-svd/57758/', 'info_dict': { 'id': '57758', @@ -38,7 +38,10 @@ # rtmp download 'skip_download': True, }, - } + }, { + 'url': 'http://techtalks.tv/talks/57758', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/ted.py youtube-dl-2019.05.20/youtube_dl/extractor/ted.py --- youtube-dl-2016.02.22/youtube_dl/extractor/ted.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/ted.py 2019-06-07 18:49:07.000000000 +0000 @@ -6,7 +6,12 @@ from .common import InfoExtractor from ..compat import compat_str -from ..utils import int_or_none +from ..utils import ( + float_or_none, + int_or_none, + try_get, + url_or_none, +) class TEDIE(InfoExtractor): @@ -27,7 +32,7 @@ ''' _TESTS = [{ 'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html', - 'md5': 'fc94ac279feebbce69f21c0c6ee82810', + 'md5': 'b0ce2b05ca215042124fbc9e3886493a', 'info_dict': { 'id': '102', 'ext': 'mp4', @@ -37,21 +42,32 @@ 'consciousness, but that half the time our brains are ' 'actively fooling us.'), 'uploader': 'Dan Dennett', - 'width': 854, + 'width': 853, 'duration': 1308, - } + 'view_count': int, + 'comment_count': int, + 'tags': list, + }, + 'params': { + 'skip_download': True, + }, }, { - 'url': 
'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms', - 'md5': '226f4fb9c62380d11b7995efa4c87994', + # missing HTTP bitrates + 'url': 'https://www.ted.com/talks/vishal_sikka_the_beauty_and_power_of_algorithms', 'info_dict': { - 'id': 'vishal-sikka-the-beauty-and-power-of-algorithms', + 'id': '6069', 'ext': 'mp4', - 'title': 'Vishal Sikka: The beauty and power of algorithms', - 'thumbnail': 're:^https?://.+\.jpg', - 'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.', - } + 'title': 'The beauty and power of algorithms', + 'thumbnail': r're:^https?://.+\.jpg', + 'description': 'md5:734e352710fb00d840ab87ae31aaf688', + 'uploader': 'Vishal Sikka', + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best', + 'md5': 'e6b9617c01a7970ceac8bb2c92c346c0', 'info_dict': { 'id': '1972', 'ext': 'mp4', @@ -60,6 +76,9 @@ 'description': 'md5:5174aed4d0f16021b704120360f72b92', 'duration': 1128, }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://www.ted.com/playlists/who_are_the_hackers', 'info_dict': { @@ -73,7 +92,7 @@ 'add_ie': ['Youtube'], 'info_dict': { 'id': '_ZG8HBuDjgc', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'Douglas Adams: Parrots the Universe and Everything', 'description': 'md5:01ad1e199c49ac640cb1196c0e9016af', 'uploader': 'University of California Television (UCTV)', @@ -84,17 +103,17 @@ 'skip_download': True, }, }, { - # YouTube video - 'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond', - 'add_ie': ['Youtube'], + # no nativeDownloads + 'url': 'https://www.ted.com/talks/tom_thum_the_orchestra_in_my_mouth', 'info_dict': { - 'id': 'aFBIPO-P7LM', + 'id': '1792', 'ext': 'mp4', - 'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville', - 'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1', - 'uploader': 'TEDx Talks', - 'uploader_id': 'TEDxTalks', - 'upload_date': '20111216', + 'title': 'The orchestra in my mouth', + 'description': 'md5:5d1d78650e2f8dfcbb8ebee2951ac29a', + 'uploader': 'Tom Thum', + 'view_count': int, + 'comment_count': int, + 'tags': list, }, 'params': { 'skip_download': True, @@ -102,14 +121,15 @@ }] _NATIVE_FORMATS = { - 'low': {'preference': 1, 'width': 320, 'height': 180}, - 'medium': {'preference': 2, 'width': 512, 'height': 288}, - 'high': {'preference': 3, 'width': 854, 'height': 480}, + 'low': {'width': 320, 'height': 180}, + 'medium': {'width': 512, 'height': 288}, + 'high': {'width': 854, 'height': 480}, } def _extract_info(self, webpage): - info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>', - webpage, 'info json') + info_json = self._search_regex( + r'(?s)q\(\s*"\w+.init"\s*,\s*({.+})\)\s*</script>', + webpage, 'info json') return json.loads(info_json) def _real_extract(self, url): @@ -131,11 +151,16 @@ webpage = self._download_webpage(url, name, 'Downloading playlist webpage') info = self._extract_info(webpage) - playlist_info = info['playlist'] + + playlist_info = try_get( + info, lambda x: x['__INITIAL_DATA__']['playlist'], + dict) or info['playlist'] playlist_entries = [ self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key()) - for talk in info['talks'] + for talk in try_get( + info, lambda x: 
x['__INITIAL_DATA__']['talks'], + dict) or info['talks'] ] return self.playlist_result( playlist_entries, @@ -144,42 +169,60 @@ def _talk_info(self, url, video_name): webpage = self._download_webpage(url, video_name) - self.report_extraction(video_name) - talk_info = self._extract_info(webpage)['talks'][0] + info = self._extract_info(webpage) + + data = try_get(info, lambda x: x['__INITIAL_DATA__'], dict) or info + talk_info = data['talks'][0] + + title = talk_info['title'].strip() - external = talk_info.get('external') - if external: - service = external['service'] - self.to_screen('Found video from %s' % service) - ext_url = None - if service.lower() == 'youtube': - ext_url = external.get('code') - return { - '_type': 'url', - 'url': ext_url or external['uri'], - } + native_downloads = try_get( + talk_info, + (lambda x: x['downloads']['nativeDownloads'], + lambda x: x['nativeDownloads']), + dict) or {} formats = [{ 'url': format_url, 'format_id': format_id, 'format': format_id, - } for (format_id, format_url) in talk_info['nativeDownloads'].items() if format_url is not None] + } for (format_id, format_url) in native_downloads.items() if format_url is not None] if formats: for f in formats: finfo = self._NATIVE_FORMATS.get(f['format_id']) if finfo: f.update(finfo) - for format_id, resources in talk_info['resources'].items(): + player_talk = talk_info['player_talks'][0] + + external = player_talk.get('external') + if isinstance(external, dict): + service = external.get('service') + if isinstance(service, compat_str): + ext_url = None + if service.lower() == 'youtube': + ext_url = external.get('code') + + return self.url_result(ext_url or external['uri']) + + resources_ = player_talk.get('resources') or talk_info.get('resources') + + http_url = None + for format_id, resources in resources_.items(): if format_id == 'h264': for resource in resources: + h264_url = resource.get('file') + if not h264_url: + continue bitrate = int_or_none(resource.get('bitrate')) formats.append({ - 'url': resource['file'], + 'url': h264_url, 'format_id': '%s-%sk' % (format_id, bitrate), 'tbr': bitrate, }) + if re.search(r'\d+k', h264_url): + http_url = h264_url elif format_id == 'rtmp': streamer = talk_info.get('streamer') if not streamer: @@ -195,16 +238,36 @@ 'tbr': int_or_none(resource.get('bitrate')), }) elif format_id == 'hls': - hls_formats = self._extract_m3u8_formats( - resources.get('stream'), video_name, 'mp4', m3u8_id=format_id) - for f in hls_formats: - if f.get('format_id') == 'hls-meta': - continue - if not f.get('height'): - f['vcodec'] = 'none' - else: - f['acodec'] = 'none' - formats.extend(hls_formats) + if not isinstance(resources, dict): + continue + stream_url = url_or_none(resources.get('stream')) + if not stream_url: + continue + formats.extend(self._extract_m3u8_formats( + stream_url, video_name, 'mp4', m3u8_id=format_id, + fatal=False)) + + m3u8_formats = list(filter( + lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none', + formats)) + if http_url: + for m3u8_format in m3u8_formats: + bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None) + if not bitrate: + continue + bitrate_url = re.sub(r'\d+k', bitrate, http_url) + if not self._is_valid_url( + bitrate_url, video_name, '%s bitrate' % bitrate): + continue + f = m3u8_format.copy() + f.update({ + 'url': bitrate_url, + 'format_id': m3u8_format['format_id'].replace('hls', 'http'), + 'protocol': 'http', + }) + if f.get('acodec') == 'none': + del f['acodec'] + formats.append(f) audio_download = 
talk_info.get('audioDownload') if audio_download: @@ -212,49 +275,55 @@ 'url': audio_download, 'format_id': 'audio', 'vcodec': 'none', - 'preference': -0.5, }) self._sort_formats(formats) video_id = compat_str(talk_info['id']) - thumbnail = talk_info['thumb'] - if not thumbnail.startswith('http'): - thumbnail = 'http://' + thumbnail return { 'id': video_id, - 'title': talk_info['title'].strip(), - 'uploader': talk_info['speaker'], - 'thumbnail': thumbnail, + 'title': title, + 'uploader': player_talk.get('speaker') or talk_info.get('speaker'), + 'thumbnail': player_talk.get('thumb') or talk_info.get('thumb'), 'description': self._og_search_description(webpage), 'subtitles': self._get_subtitles(video_id, talk_info), 'formats': formats, - 'duration': talk_info.get('duration'), + 'duration': float_or_none(talk_info.get('duration')), + 'view_count': int_or_none(data.get('viewed_count')), + 'comment_count': int_or_none( + try_get(data, lambda x: x['comments']['count'])), + 'tags': try_get(talk_info, lambda x: x['tags'], list), } def _get_subtitles(self, video_id, talk_info): - languages = [lang['languageCode'] for lang in talk_info.get('languages', [])] - if languages: - sub_lang_list = {} - for l in languages: - sub_lang_list[l] = [ - { - 'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, l, ext), - 'ext': ext, - } - for ext in ['ted', 'srt'] - ] - return sub_lang_list - else: - return {} + sub_lang_list = {} + for language in try_get( + talk_info, + (lambda x: x['downloads']['languages'], + lambda x: x['languages']), list): + lang_code = language.get('languageCode') or language.get('ianaCode') + if not lang_code: + continue + sub_lang_list[lang_code] = [ + { + 'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, lang_code, ext), + 'ext': ext, + } + for ext in ['ted', 'srt'] + ] + return sub_lang_list def _watch_info(self, url, name): webpage = self._download_webpage(url, name) config_json = self._html_search_regex( r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>', - webpage, 'config') + webpage, 'config', default=None) + if not config_json: + embed_url = self._search_regex( + r"<iframe[^>]+class='pages-video-embed__video__object'[^>]+src='([^']+)'", webpage, 'embed url') + return self.url_result(self._proto_relative_url(embed_url)) config = json.loads(config_json)['config'] video_url = config['video']['url'] thumbnail = config.get('image', {}).get('url') diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/tele13.py youtube-dl-2019.05.20/youtube_dl/extractor/tele13.py --- youtube-dl-2016.02.22/youtube_dl/extractor/tele13.py 2015-12-31 15:50:46.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/tele13.py 2019-06-07 18:49:07.000000000 +0000 @@ -11,7 +11,7 @@ class Tele13IE(InfoExtractor): - _VALID_URL = r'^http://(?:www\.)?t13\.cl/videos(?:/[^/]+)+/(?P<id>[\w-]+)' + _VALID_URL = r'^https?://(?:www\.)?t13\.cl/videos(?:/[^/]+)+/(?P<id>[\w-]+)' _TESTS = [ { 'url': 'http://www.t13.cl/videos/actualidad/el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/tele5.py youtube-dl-2019.05.20/youtube_dl/extractor/tele5.py --- youtube-dl-2016.02.22/youtube_dl/extractor/tele5.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/tele5.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,57 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .nexx import NexxIE +from ..compat import 
compat_urlparse + + +class Tele5IE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tele5\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.tele5.de/mediathek/filme-online/videos?vid=1549416', + 'info_dict': { + 'id': '1549416', + 'ext': 'mp4', + 'upload_date': '20180814', + 'timestamp': 1534290623, + 'title': 'Pandorum', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.tele5.de/kalkofes-mattscheibe/video-clips/politik-und-gesellschaft?ve_id=1551191', + 'only_matching': True, + }, { + 'url': 'https://www.tele5.de/video-clip/?ve_id=1609440', + 'only_matching': True, + }, { + 'url': 'https://www.tele5.de/filme/schlefaz-dragon-crusaders/', + 'only_matching': True, + }, { + 'url': 'https://www.tele5.de/filme/making-of/avengers-endgame/', + 'only_matching': True, + }, { + 'url': 'https://www.tele5.de/star-trek/raumschiff-voyager/ganze-folge/das-vinculum/', + 'only_matching': True, + }, { + 'url': 'https://www.tele5.de/anders-ist-sevda/', + 'only_matching': True, + }] + + def _real_extract(self, url): + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + video_id = (qs.get('vid') or qs.get('ve_id') or [None])[0] + + if not video_id: + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._html_search_regex( + (r'id\s*=\s*["\']video-player["\'][^>]+data-id\s*=\s*["\'](\d+)', + r'\s+id\s*=\s*["\']player_(\d{6,})', + r'\bdata-id\s*=\s*["\'](\d{6,})'), webpage, 'video id') + + return self.url_result( + 'https://api.nexx.cloud/v3/759/videos/byid/%s' % video_id, + ie=NexxIE.ie_key(), video_id=video_id) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/telebruxelles.py youtube-dl-2019.05.20/youtube_dl/extractor/telebruxelles.py --- youtube-dl-2016.02.22/youtube_dl/extractor/telebruxelles.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/telebruxelles.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,37 +1,46 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor class TeleBruxellesIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?telebruxelles\.be/(news|sport|dernier-jt)/?(?P<id>[^/#?]+)' + _VALID_URL = r'https?://(?:www\.)?(?:telebruxelles|bx1)\.be/(?:[^/]+/)*(?P<id>[^/#?]+)' _TESTS = [{ - 'url': 'http://www.telebruxelles.be/news/auditions-devant-parlement-francken-galant-tres-attendus/', - 'md5': '59439e568c9ee42fb77588b2096b214f', + 'url': 'http://bx1.be/news/que-risque-lauteur-dune-fausse-alerte-a-la-bombe/', + 'md5': 'a2a67a5b1c3e8c9d33109b902f474fd9', 'info_dict': { - 'id': '11942', - 'display_id': 'auditions-devant-parlement-francken-galant-tres-attendus', - 'ext': 'flv', - 'title': 'Parlement : Francken et Galant répondent aux interpellations de l’opposition', - 'description': 're:Les auditions des ministres se poursuivent*' - }, - 'params': { - 'skip_download': 'requires rtmpdump' + 'id': '158856', + 'display_id': 'que-risque-lauteur-dune-fausse-alerte-a-la-bombe', + 'ext': 'mp4', + 'title': 'Que risque l’auteur d’une fausse alerte à la bombe ?', + 'description': 'md5:3cf8df235d44ebc5426373050840e466', }, }, { - 'url': 'http://www.telebruxelles.be/sport/basket-brussels-bat-mons-80-74/', - 'md5': '181d3fbdcf20b909309e5aef5c6c6047', + 'url': 'http://bx1.be/sport/futsal-schaerbeek-sincline-5-3-a-thulin/', + 'md5': 'dfe07ecc9c153ceba8582ac912687675', 'info_dict': { - 'id': '10091', - 'display_id': 'basket-brussels-bat-mons-80-74', - 'ext': 'flv', - 'title': 'Basket : le Brussels bat Mons 
80-74', - 'description': 're:^Ils l\u2019on fait ! En basket, le B*', - }, - 'params': { - 'skip_download': 'requires rtmpdump' + 'id': '158433', + 'display_id': 'futsal-schaerbeek-sincline-5-3-a-thulin', + 'ext': 'mp4', + 'title': 'Futsal : Schaerbeek s’incline 5-3 à Thulin', + 'description': 'md5:fd013f1488d5e2dceb9cebe39e2d569b', }, + }, { + 'url': 'http://bx1.be/emission/bxenf1-gastronomie/', + 'only_matching': True, + }, { + 'url': 'https://bx1.be/berchem-sainte-agathe/personnel-carrefour-de-berchem-sainte-agathe-inquiet/', + 'only_matching': True, + }, { + 'url': 'https://bx1.be/dernier-jt/', + 'only_matching': True, + }, { + # live stream + 'url': 'https://bx1.be/lives/direct-tv/', + 'only_matching': True, }] def _real_extract(self, url): @@ -39,22 +48,29 @@ webpage = self._download_webpage(url, display_id) article_id = self._html_search_regex( - r"<article id=\"post-(\d+)\"", webpage, 'article ID') + r'<article[^>]+\bid=["\']post-(\d+)', webpage, 'article ID', default=None) title = self._html_search_regex( - r'<h1 class=\"entry-title\">(.*?)</h1>', webpage, 'title') - description = self._og_search_description(webpage) + r'<h1[^>]*>(.+?)</h1>', webpage, 'title', + default=None) or self._og_search_title(webpage) + description = self._og_search_description(webpage, default=None) rtmp_url = self._html_search_regex( - r"file: \"(rtmp://\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}/vod/mp4:\" \+ \"\w+\" \+ \".mp4)\"", + r'file["\']?\s*:\s*"(r(?:tm|mt)ps?://[^/]+/(?:vod/mp4:"\s*\+\s*"[^"]+"\s*\+\s*"\.mp4|stream/live))"', webpage, 'RTMP url') - rtmp_url = rtmp_url.replace("\" + \"", "") + # Yes, they have a typo in scheme name for live stream URLs (e.g. + # https://bx1.be/lives/direct-tv/) + rtmp_url = re.sub(r'^rmtp', 'rtmp', rtmp_url) + rtmp_url = re.sub(r'"\s*\+\s*"', '', rtmp_url) + formats = self._extract_wowza_formats(rtmp_url, article_id or display_id) + self._sort_formats(formats) + + is_live = 'stream/live' in rtmp_url return { - 'id': article_id, + 'id': article_id or display_id, 'display_id': display_id, - 'title': title, + 'title': self._live_title(title) if is_live else title, 'description': description, - 'url': rtmp_url, - 'ext': 'flv', - 'rtmp_live': True # if rtmpdump is not called with "--live" argument, the download is blocked and can be completed + 'formats': formats, + 'is_live': is_live, } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/telecinco.py youtube-dl-2019.05.20/youtube_dl/extractor/telecinco.py --- youtube-dl-2016.02.22/youtube_dl/extractor/telecinco.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/telecinco.py 2019-06-07 18:49:07.000000000 +0000 @@ -2,49 +2,57 @@ from __future__ import unicode_literals import json +import re from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse, - compat_urllib_parse_unquote, - compat_urlparse, -) +from .ooyala import OoyalaIE from ..utils import ( - get_element_by_attribute, - parse_duration, - strip_jsonp, + clean_html, + determine_ext, + int_or_none, + str_or_none, + urljoin, ) class TelecincoIE(InfoExtractor): IE_DESC = 'telecinco.es, cuatro.com and mediaset.es' - _VALID_URL = r'https?://www\.(?:telecinco\.es|cuatro\.com|mediaset\.es)/(?:[^/]+/)+(?P<id>.+?)\.html' + _VALID_URL = r'https?://(?:www\.)?(?:telecinco\.es|cuatro\.com|mediaset\.es)/(?:[^/]+/)+(?P<id>.+?)\.html' _TESTS = [{ 'url': 'http://www.telecinco.es/robinfood/temporada-01/t01xp14/Bacalao-cocochas-pil-pil_0_1876350223.html', - 'md5': '5cbef3ad5ef17bf0d21570332d140729', 'info_dict': { - 'id': 
'MDSVID20141015_0058', - 'ext': 'mp4', - 'title': 'Con Martín Berasategui, hacer un bacalao al ...', - 'duration': 662, + 'id': '1876350223', + 'title': 'Bacalao con kokotxas al pil-pil', + 'description': 'md5:1382dacd32dd4592d478cbdca458e5bb', }, + 'playlist': [{ + 'md5': 'adb28c37238b675dad0f042292f209a7', + 'info_dict': { + 'id': 'JEA5ijCnF6p5W08A1rNKn7', + 'ext': 'mp4', + 'title': 'Con Martín Berasategui, hacer un bacalao al pil-pil es fácil y divertido', + 'duration': 662, + }, + }] }, { 'url': 'http://www.cuatro.com/deportes/futbol/barcelona/Leo_Messi-Champions-Roma_2_2052780128.html', - 'md5': '0a5b9f3cc8b074f50a0578f823a12694', + 'md5': '9468140ebc300fbb8b9d65dc6e5c4b43', 'info_dict': { - 'id': 'MDSVID20150916_0128', + 'id': 'jn24Od1zGLG4XUZcnUnZB6', 'ext': 'mp4', - 'title': '¿Quién es este ex futbolista con el que hablan ...', + 'title': '¿Quién es este ex futbolista con el que hablan Leo Messi y Luis Suárez?', + 'description': 'md5:a62ecb5f1934fc787107d7b9a2262805', 'duration': 79, }, }, { 'url': 'http://www.mediaset.es/12meses/campanas/doylacara/conlatratanohaytrato/Ayudame-dar-cara-trata-trato_2_1986630220.html', - 'md5': 'ad1bfaaba922dd4a295724b05b68f86a', + 'md5': 'ae2dc6b7b50b2392076a51c0f70e01f6', 'info_dict': { - 'id': 'MDSVID20150513_0220', + 'id': 'aywerkD2Sv1vGNqq9b85Q2', 'ext': 'mp4', 'title': '#DOYLACARA. Con la trata no hay trato', + 'description': 'md5:2771356ff7bfad9179c5f5cd954f1477', 'duration': 50, }, }, { @@ -53,42 +61,96 @@ }, { 'url': 'http://www.telecinco.es/espanasinirmaslejos/Espana-gran-destino-turistico_2_1240605043.html', 'only_matching': True, + }, { + # ooyala video + 'url': 'http://www.cuatro.com/chesterinlove/a-carta/chester-chester_in_love-chester_edu_2_2331030022.html', + 'only_matching': True, }] - def _real_extract(self, url): - episode = self._match_id(url) - webpage = self._download_webpage(url, episode) - embed_data_json = self._search_regex( - r'(?s)MSV\.embedData\[.*?\]\s*=\s*({.*?});', webpage, 'embed data', - ).replace('\'', '"') - embed_data = json.loads(embed_data_json) - - domain = embed_data['mediaUrl'] - if not domain.startswith('http'): - # only happens in telecinco.es videos - domain = 'http://' + domain - info_url = compat_urlparse.urljoin( - domain, - compat_urllib_parse_unquote(embed_data['flashvars']['host']) - ) - info_el = self._download_xml(info_url, episode).find('./video/info') - - video_link = info_el.find('videoUrl/link').text - token_query = compat_urllib_parse.urlencode({'id': video_link}) - token_info = self._download_json( - embed_data['flashvars']['ov_tk'] + '?' 
+ token_query, - episode, - transform_source=strip_jsonp - ) - formats = self._extract_m3u8_formats( - token_info['tokenizedUrl'], episode, ext='mp4', entry_protocol='m3u8_native') + def _parse_content(self, content, url): + video_id = content['dataMediaId'] + if content.get('dataCmsId') == 'ooyala': + return self.url_result( + 'ooyala:%s' % video_id, OoyalaIE.ie_key(), video_id) + config_url = urljoin(url, content['dataConfig']) + config = self._download_json( + config_url, video_id, 'Downloading config JSON') + title = config['info']['title'] + + def mmc_url(mmc_type): + return re.sub( + r'/(?:flash|html5)\.json', '/%s.json' % mmc_type, + config['services']['mmc']) + + duration = None + formats = [] + for mmc_type in ('flash', 'html5'): + mmc = self._download_json( + mmc_url(mmc_type), video_id, + 'Downloading %s mmc JSON' % mmc_type, fatal=False) + if not mmc: + continue + if not duration: + duration = int_or_none(mmc.get('duration')) + for location in mmc['locations']: + gat = self._proto_relative_url(location.get('gat'), 'http:') + gcp = location.get('gcp') + ogn = location.get('ogn') + if None in (gat, gcp, ogn): + continue + token_data = { + 'gcp': gcp, + 'ogn': ogn, + 'sta': 0, + } + media = self._download_json( + gat, video_id, data=json.dumps(token_data).encode('utf-8'), + headers={ + 'Content-Type': 'application/json;charset=utf-8', + 'Referer': url, + }, fatal=False) or {} + stream = media.get('stream') or media.get('file') + if not stream: + continue + ext = determine_ext(stream) + if ext == 'f4m': + formats.extend(self._extract_f4m_formats( + stream + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18', + video_id, f4m_id='hds', fatal=False)) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + stream, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + self._sort_formats(formats) return { - 'id': embed_data['videoId'], - 'display_id': episode, - 'title': info_el.find('title').text, + 'id': video_id, + 'title': title, 'formats': formats, - 'description': get_element_by_attribute('class', 'text', webpage), - 'thumbnail': info_el.find('thumb').text, - 'duration': parse_duration(info_el.find('duration').text), + 'thumbnail': content.get('dataPoster') or config.get('poster', {}).get('imageUrl'), + 'duration': duration, } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + article = self._parse_json(self._search_regex( + r'window\.\$REACTBASE_STATE\.article\s*=\s*({.+})', + webpage, 'article'), display_id)['article'] + title = article.get('title') + description = clean_html(article.get('leadParagraph')) + if article.get('editorialType') != 'VID': + entries = [] + for p in article.get('body', []): + content = p.get('content') + if p.get('type') != 'video' or not content: + continue + entries.append(self._parse_content(content, url)) + return self.playlist_result( + entries, str_or_none(article.get('id')), title, description) + content = article['opening']['content'] + info = self._parse_content(content, url) + info.update({ + 'description': description, + }) + return info diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/telegraaf.py youtube-dl-2019.05.20/youtube_dl/extractor/telegraaf.py --- youtube-dl-2016.02.22/youtube_dl/extractor/telegraaf.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/telegraaf.py 2019-06-07 18:49:07.000000000 +0000 @@ -2,34 +2,77 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import 
remove_end +from ..utils import ( + determine_ext, + remove_end, +) class TelegraafIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?telegraaf\.nl/tv/(?:[^/]+/)+(?P<id>\d+)/[^/]+\.html' _TEST = { 'url': 'http://www.telegraaf.nl/tv/nieuws/binnenland/24353229/__Tikibad_ontruimd_wegens_brand__.html', - 'md5': '83245a9779bcc4a24454bfd53c65b6dc', 'info_dict': { 'id': '24353229', 'ext': 'mp4', 'title': 'Tikibad ontruimd wegens brand', 'description': 'md5:05ca046ff47b931f9b04855015e163a4', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 33, }, + 'params': { + # m3u8 download + 'skip_download': True, + }, } def _real_extract(self, url): - playlist_id = self._match_id(url) + video_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) + webpage = self._download_webpage(url, video_id) + player_url = self._html_search_regex( + r'<iframe[^>]+src="([^"]+")', webpage, 'player URL') + player_page = self._download_webpage( + player_url, video_id, note='Download player webpage') playlist_url = self._search_regex( - r"iframe\.loadPlayer\('([^']+)'", webpage, 'player') + r'playlist\s*:\s*"([^"]+)"', player_page, 'playlist URL') + playlist_data = self._download_json(playlist_url, video_id) + + item = playlist_data['items'][0] + formats = [] + locations = item['locations'] + for location in locations.get('adaptive', []): + manifest_url = location['src'] + ext = determine_ext(manifest_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + manifest_url, video_id, ext='mp4', m3u8_id='hls', fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + manifest_url, video_id, mpd_id='dash', fatal=False)) + else: + self.report_warning('Unknown adaptive format %s' % ext) + for location in locations.get('progressive', []): + formats.append({ + 'url': location['sources'][0]['src'], + 'width': location.get('width'), + 'height': location.get('height'), + 'format_id': 'http-%s' % location['label'], + }) + + self._sort_formats(formats) - entries = self._extract_xspf_playlist(playlist_url, playlist_id) title = remove_end(self._og_search_title(webpage), ' - VIDEO') description = self._og_search_description(webpage) + duration = item.get('duration') + thumbnail = item.get('poster') - return self.playlist_result(entries, playlist_id, title, description) + return { + 'id': video_id, + 'title': title, + 'description': description, + 'formats': formats, + 'duration': duration, + 'thumbnail': thumbnail, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/telemb.py youtube-dl-2019.05.20/youtube_dl/extractor/telemb.py --- youtube-dl-2016.02.22/youtube_dl/extractor/telemb.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/telemb.py 2019-06-07 18:49:07.000000000 +0000 @@ -19,7 +19,7 @@ 'ext': 'mp4', 'title': 'Mons - Cook with Danielle : des cours de cuisine en anglais ! 
- Les reportages', 'description': 'md5:bc5225f47b17c309761c856ad4776265', - 'thumbnail': 're:^http://.*\.(?:jpg|png)$', + 'thumbnail': r're:^http://.*\.(?:jpg|png)$', } }, { @@ -32,7 +32,7 @@ 'ext': 'mp4', 'title': 'Havré - Incendie mortel - Les reportages', 'description': 'md5:5e54cb449acb029c2b7734e2d946bd4a', - 'thumbnail': 're:^http://.*\.(?:jpg|png)$', + 'thumbnail': r're:^http://.*\.(?:jpg|png)$', } }, ] diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/telequebec.py youtube-dl-2019.05.20/youtube_dl/extractor/telequebec.py --- youtube-dl-2016.02.22/youtube_dl/extractor/telequebec.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/telequebec.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,151 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + smuggle_url, + try_get, +) + + +class TeleQuebecBaseIE(InfoExtractor): + @staticmethod + def _limelight_result(media_id): + return { + '_type': 'url_transparent', + 'url': smuggle_url( + 'limelight:media:' + media_id, {'geo_countries': ['CA']}), + 'ie_key': 'LimelightMedia', + } + + +class TeleQuebecIE(TeleQuebecBaseIE): + _VALID_URL = r'https?://zonevideo\.telequebec\.tv/media/(?P<id>\d+)' + _TESTS = [{ + # available till 01.01.2023 + 'url': 'http://zonevideo.telequebec.tv/media/37578/un-petit-choc-et-puis-repart/un-chef-a-la-cabane', + 'info_dict': { + 'id': '577116881b4b439084e6b1cf4ef8b1b3', + 'ext': 'mp4', + 'title': 'Un petit choc et puis repart!', + 'description': 'md5:b04a7e6b3f74e32d7b294cffe8658374', + 'upload_date': '20180222', + 'timestamp': 1519326631, + }, + 'params': { + 'skip_download': True, + }, + }, { + # no description + 'url': 'http://zonevideo.telequebec.tv/media/30261', + 'only_matching': True, + }] + + def _real_extract(self, url): + media_id = self._match_id(url) + + media_data = self._download_json( + 'https://mnmedias.api.telequebec.tv/api/v2/media/' + media_id, + media_id)['media'] + + info = self._limelight_result(media_data['streamInfo']['sourceId']) + info.update({ + 'title': media_data.get('title'), + 'description': try_get( + media_data, lambda x: x['descriptions'][0]['text'], compat_str), + 'duration': int_or_none( + media_data.get('durationInMilliseconds'), 1000), + }) + return info + + +class TeleQuebecEmissionIE(TeleQuebecBaseIE): + _VALID_URL = r'''(?x) + https?:// + (?: + [^/]+\.telequebec\.tv/emissions/| + (?:www\.)?telequebec\.tv/ + ) + (?P<id>[^?#&]+) + ''' + _TESTS = [{ + 'url': 'http://lindicemcsween.telequebec.tv/emissions/100430013/des-soins-esthetiques-a-377-d-interets-annuels-ca-vous-tente', + 'info_dict': { + 'id': '66648a6aef914fe3badda25e81a4d50a', + 'ext': 'mp4', + 'title': "Des soins esthétiques à 377 % d'intérêts annuels, ça vous tente?", + 'description': 'md5:369e0d55d0083f1fc9b71ffb640ea014', + 'upload_date': '20171024', + 'timestamp': 1508862118, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://bancpublic.telequebec.tv/emissions/emission-49/31986/jeunes-meres-sous-pression', + 'only_matching': True, + }, { + 'url': 'http://www.telequebec.tv/masha-et-michka/epi059masha-et-michka-3-053-078', + 'only_matching': True, + }, { + 'url': 'http://www.telequebec.tv/documentaire/bebes-sur-mesure/', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + media_id = self._search_regex( + 
r'mediaUID\s*:\s*["\'][Ll]imelight_(?P<id>[a-z0-9]{32})', webpage, + 'limelight id') + + info = self._limelight_result(media_id) + info.update({ + 'title': self._og_search_title(webpage, default=None), + 'description': self._og_search_description(webpage, default=None), + }) + return info + + +class TeleQuebecLiveIE(InfoExtractor): + _VALID_URL = r'https?://zonevideo\.telequebec\.tv/(?P<id>endirect)' + _TEST = { + 'url': 'http://zonevideo.telequebec.tv/endirect/', + 'info_dict': { + 'id': 'endirect', + 'ext': 'mp4', + 'title': 're:^Télé-Québec - En direct [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + m3u8_url = None + webpage = self._download_webpage( + 'https://player.telequebec.tv/Tq_VideoPlayer.js', video_id, + fatal=False) + if webpage: + m3u8_url = self._search_regex( + r'm3U8Url\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'm3u8 url', default=None, group='url') + if not m3u8_url: + m3u8_url = 'https://teleqmmd.mmdlive.lldns.net/teleqmmd/f386e3b206814e1f8c8c1c71c0f8e748/manifest.m3u8' + formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', m3u8_id='hls') + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._live_title('Télé-Québec - En direct'), + 'is_live': True, + 'formats': formats, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/telewebion.py youtube-dl-2019.05.20/youtube_dl/extractor/telewebion.py --- youtube-dl-2016.02.22/youtube_dl/extractor/telewebion.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/telewebion.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,55 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class TelewebionIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?telewebion\.com/#!/episode/(?P<id>\d+)' + + _TEST = { + 'url': 'http://www.telewebion.com/#!/episode/1263668/', + 'info_dict': { + 'id': '1263668', + 'ext': 'mp4', + 'title': 'قرعه\u200cکشی لیگ قهرمانان اروپا', + 'thumbnail': r're:^https?://.*\.jpg', + 'view_count': int, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + secure_token = self._download_webpage( + 'http://m.s2.telewebion.com/op/op?action=getSecurityToken', video_id) + episode_details = self._download_json( + 'http://m.s2.telewebion.com/op/op', video_id, + query={'action': 'getEpisodeDetails', 'episode_id': video_id}) + + m3u8_url = 'http://m.s1.telewebion.com/smil/%s.m3u8?filepath=%s&m3u8=1&secure_token=%s' % ( + video_id, episode_details['file_path'], secure_token) + formats = self._extract_m3u8_formats( + m3u8_url, video_id, ext='mp4', m3u8_id='hls') + + picture_paths = [ + episode_details.get('picture_path'), + episode_details.get('large_picture_path'), + ] + + thumbnails = [{ + 'url': picture_path, + 'preference': idx, + } for idx, picture_path in enumerate(picture_paths) if picture_path is not None] + + return { + 'id': video_id, + 'title': episode_details['title'], + 'formats': formats, + 'thumbnails': thumbnails, + 'view_count': episode_details.get('view_count'), + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/tennistv.py youtube-dl-2019.05.20/youtube_dl/extractor/tennistv.py --- youtube-dl-2016.02.22/youtube_dl/extractor/tennistv.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/tennistv.py 2019-06-07 
18:49:07.000000000 +0000 @@ -0,0 +1,112 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor + +from ..utils import ( + ExtractorError, + unified_timestamp, +) + + +class TennisTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tennistv\.com/videos/(?P<id>[-a-z0-9]+)' + _TEST = { + 'url': 'https://www.tennistv.com/videos/indian-wells-2018-verdasco-fritz', + 'info_dict': { + 'id': 'indian-wells-2018-verdasco-fritz', + 'ext': 'mp4', + 'title': 'Fernando Verdasco v Taylor Fritz', + 'description': 're:^After his stunning victory.{174}$', + 'thumbnail': 'https://atp-prod.akamaized.net/api/images/v1/images/112831/landscape/1242/0', + 'timestamp': 1521017381, + 'upload_date': '20180314', + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Requires email and password of a subscribed account', + } + _NETRC_MACHINE = 'tennistv' + + def _login(self): + username, password = self._get_login_info() + if not username or not password: + raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True) + + login_form = { + 'Email': username, + 'Password': password, + } + login_json = json.dumps(login_form).encode('utf-8') + headers = { + 'content-type': 'application/json', + 'Referer': 'https://www.tennistv.com/login', + 'Origin': 'https://www.tennistv.com', + } + + login_result = self._download_json( + 'https://www.tennistv.com/api/users/v1/login', None, + note='Logging in', + errnote='Login failed (wrong password?)', + headers=headers, + data=login_json) + + if login_result['error']['errorCode']: + raise ExtractorError('Login failed, %s said: %r' % (self.IE_NAME, login_result['error']['errorMessage'])) + + if login_result['entitlement'] != 'SUBSCRIBED': + self.report_warning('%s may not be subscribed to %s.' 
% (username, self.IE_NAME)) + + self._session_token = login_result['sessionToken'] + + def _real_initialize(self): + self._login() + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + internal_id = self._search_regex(r'video=([0-9]+)', webpage, 'internal video id') + + headers = { + 'Origin': 'https://www.tennistv.com', + 'authorization': 'ATP %s' % self._session_token, + 'content-type': 'application/json', + 'Referer': url, + } + check_data = { + 'videoID': internal_id, + 'VideoUrlType': 'HLSV3', + } + check_json = json.dumps(check_data).encode('utf-8') + check_result = self._download_json( + 'https://www.tennistv.com/api/users/v1/entitlementchecknondiva', + video_id, note='Checking video authorization', headers=headers, data=check_json) + formats = self._extract_m3u8_formats(check_result['contentUrl'], video_id, ext='mp4') + + vdata_url = 'https://www.tennistv.com/api/channels/v1/de/none/video/%s' % video_id + vdata = self._download_json(vdata_url, video_id) + + timestamp = unified_timestamp(vdata['timestamp']) + thumbnail = vdata['video']['thumbnailUrl'] + description = vdata['displayText']['description'] + title = vdata['video']['title'] + + series = vdata['tour'] + venue = vdata['displayText']['venue'] + round_str = vdata['seo']['round'] + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'formats': formats, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'series': series, + 'season': venue, + 'episode': round_str, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/tenplay.py youtube-dl-2019.05.20/youtube_dl/extractor/tenplay.py --- youtube-dl-2016.02.22/youtube_dl/extractor/tenplay.py 2016-02-14 22:26:17.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/tenplay.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,90 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - float_or_none, -) - - -class TenPlayIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ten(play)?\.com\.au/.+' - _TEST = { - 'url': 'http://tenplay.com.au/ten-insider/extra/season-2013/tenplay-tv-your-way', - 'info_dict': { - 'id': '2695695426001', - 'ext': 'flv', - 'title': 'TENplay: TV your way', - 'description': 'Welcome to a new TV experience. 
Enjoy a taste of the TENplay benefits.', - 'timestamp': 1380150606.889, - 'upload_date': '20130925', - 'uploader': 'TENplay', - }, - 'params': { - 'skip_download': True, # Requires rtmpdump - } - } - - _video_fields = [ - 'id', 'name', 'shortDescription', 'longDescription', 'creationDate', - 'publishedDate', 'lastModifiedDate', 'customFields', 'videoStillURL', - 'thumbnailURL', 'referenceId', 'length', 'playsTotal', - 'playsTrailingWeek', 'renditions', 'captioning', 'startDate', 'endDate'] - - def _real_extract(self, url): - webpage = self._download_webpage(url, url) - video_id = self._html_search_regex( - r'videoID: "(\d+?)"', webpage, 'video_id') - api_token = self._html_search_regex( - r'apiToken: "([a-zA-Z0-9-_\.]+?)"', webpage, 'api_token') - title = self._html_search_regex( - r'<meta property="og:title" content="\s*(.*?)\s*"\s*/?\s*>', - webpage, 'title') - - json = self._download_json('https://api.brightcove.com/services/library?command=find_video_by_id&video_id=%s&token=%s&video_fields=%s' % (video_id, api_token, ','.join(self._video_fields)), title) - - formats = [] - for rendition in json['renditions']: - url = rendition['remoteUrl'] or rendition['url'] - protocol = 'rtmp' if url.startswith('rtmp') else 'http' - ext = 'flv' if protocol == 'rtmp' else rendition['videoContainer'].lower() - - if protocol == 'rtmp': - url = url.replace('&mp4:', '') - - tbr = int_or_none(rendition.get('encodingRate'), 1000) - - formats.append({ - 'format_id': '_'.join( - ['rtmp', rendition['videoContainer'].lower(), - rendition['videoCodec'].lower(), '%sk' % tbr]), - 'width': int_or_none(rendition['frameWidth']), - 'height': int_or_none(rendition['frameHeight']), - 'tbr': tbr, - 'filesize': int_or_none(rendition['size']), - 'protocol': protocol, - 'ext': ext, - 'vcodec': rendition['videoCodec'].lower(), - 'container': rendition['videoContainer'].lower(), - 'url': url, - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'display_id': json['referenceId'], - 'title': json['name'], - 'description': json['shortDescription'] or json['longDescription'], - 'formats': formats, - 'thumbnails': [{ - 'url': json['videoStillURL'] - }, { - 'url': json['thumbnailURL'] - }], - 'thumbnail': json['videoStillURL'], - 'duration': float_or_none(json.get('length'), 1000), - 'timestamp': float_or_none(json.get('creationDate'), 1000), - 'uploader': json.get('customFields', {}).get('production_company_distributor') or 'TENplay', - 'view_count': int_or_none(json.get('playsTotal')), - } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/testurl.py youtube-dl-2019.05.20/youtube_dl/extractor/testurl.py --- youtube-dl-2016.02.22/youtube_dl/extractor/testurl.py 2016-01-14 09:35:12.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/testurl.py 2019-06-07 18:49:07.000000000 +0000 @@ -61,8 +61,4 @@ self.to_screen('Test URL: %s' % tc['url']) - return { - '_type': 'url', - 'url': tc['url'], - 'id': video_id, - } + return self.url_result(tc['url'], video_id=video_id) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/tf1.py youtube-dl-2019.05.20/youtube_dl/extractor/tf1.py --- youtube-dl-2016.02.22/youtube_dl/extractor/tf1.py 2016-02-22 10:57:19.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/tf1.py 2019-06-07 18:49:07.000000000 +0000 @@ -6,7 +6,7 @@ class TF1IE(InfoExtractor): """TF1 uses the wat.tv player.""" - _VALID_URL = r'http://(?:(?:videos|www|lci)\.tf1|www\.tfou)\.fr/(?:[^/]+/)*(?P<id>.+?)\.html' + _VALID_URL = 
r'https?://(?:(?:videos|www|lci)\.tf1|(?:www\.)?(?:tfou|ushuaiatv|histoire|tvbreizh))\.fr/(?:[^/]+/)*(?P<id>[^/?#.]+)' _TESTS = [{ 'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html', 'info_dict': { @@ -19,6 +19,7 @@ # Sometimes wat serves the whole file with the --test option 'skip_download': True, }, + 'expected_warnings': ['HTTP Error 404'], }, { 'url': 'http://www.tfou.fr/chuggington/videos/le-grand-mysterioso-chuggington-7085291-739.html', 'info_dict': { diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/tfo.py youtube-dl-2019.05.20/youtube_dl/extractor/tfo.py --- youtube-dl-2016.02.22/youtube_dl/extractor/tfo.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/tfo.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,57 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..utils import ( + HEADRequest, + ExtractorError, + int_or_none, + clean_html, +) + + +class TFOIE(InfoExtractor): + _GEO_COUNTRIES = ['CA'] + _VALID_URL = r'https?://(?:www\.)?tfo\.org/(?:en|fr)/(?:[^/]+/){2}(?P<id>\d+)' + _TEST = { + 'url': 'http://www.tfo.org/en/universe/tfo-247/100463871/video-game-hackathon', + 'md5': '47c987d0515561114cf03d1226a9d4c7', + 'info_dict': { + 'id': '100463871', + 'ext': 'mp4', + 'title': 'Video Game Hackathon', + 'description': 'md5:558afeba217c6c8d96c60e5421795c07', + 'upload_date': '20160212', + 'timestamp': 1455310233, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + self._request_webpage(HEADRequest('http://www.tfo.org/'), video_id) + infos = self._download_json( + 'http://www.tfo.org/api/web/video/get_infos', video_id, data=json.dumps({ + 'product_id': video_id, + }).encode(), headers={ + 'X-tfo-session': self._get_cookies('http://www.tfo.org/')['tfo-session'].value, + }) + if infos.get('success') == 0: + if infos.get('code') == 'ErrGeoBlocked': + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + raise ExtractorError('%s said: %s' % (self.IE_NAME, clean_html(infos['msg'])), expected=True) + video_data = infos['data'] + + return { + '_type': 'url_transparent', + 'id': video_id, + 'url': 'limelight:media:' + video_data['llid'], + 'title': video_data['title'], + 'description': video_data.get('description'), + 'series': video_data.get('collection'), + 'season_number': int_or_none(video_data.get('season')), + 'episode_number': int_or_none(video_data.get('episode')), + 'duration': int_or_none(video_data.get('duration')), + 'ie_key': 'LimelightMedia', + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/theintercept.py youtube-dl-2019.05.20/youtube_dl/extractor/theintercept.py --- youtube-dl-2016.02.22/youtube_dl/extractor/theintercept.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/theintercept.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor @@ -11,7 +11,7 @@ class TheInterceptIE(InfoExtractor): - _VALID_URL = r'https://theintercept.com/fieldofvision/(?P<id>[^/?#]+)' + _VALID_URL = r'https?://theintercept\.com/fieldofvision/(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'https://theintercept.com/fieldofvision/thisisacoup-episode-four-surrender-or-die/', 'md5': '145f28b41d44aab2f87c0a4ac8ec95bd', diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/theonion.py youtube-dl-2019.05.20/youtube_dl/extractor/theonion.py --- 
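Note on the TFO extractor added above: the get_infos endpoint only answers once the client holds a tfo-session cookie and echoes it back in the X-tfo-session header. A minimal standalone sketch of that handshake, using only the Python standard library; endpoint names and fields are taken from the extractor and may have changed server-side since:

import json
from http.cookiejar import CookieJar
from urllib.request import HTTPCookieProcessor, Request, build_opener


def tfo_get_infos(video_id):
    jar = CookieJar()
    opener = build_opener(HTTPCookieProcessor(jar))
    # a bare HEAD request is enough to receive the tfo-session Set-Cookie
    opener.open(Request('http://www.tfo.org/', method='HEAD'))
    session = next(c.value for c in jar if c.name == 'tfo-session')
    infos = json.loads(opener.open(Request(
        'http://www.tfo.org/api/web/video/get_infos',
        data=json.dumps({'product_id': video_id}).encode(),
        headers={'X-tfo-session': session})).read().decode())
    if infos.get('success') == 0:
        # the extractor maps code == 'ErrGeoBlocked' to a geo-restriction error
        raise RuntimeError(infos.get('code') or infos.get('msg'))
    return infos['data']  # carries 'llid', 'title', 'description', ...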
youtube-dl-2016.02.22/youtube_dl/extractor/theonion.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/theonion.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,63 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - - -class TheOnionIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?theonion\.com/video/[^,]+,(?P<id>[0-9]+)/?' - _TEST = { - 'url': 'http://www.theonion.com/video/man-wearing-mm-jacket-gods-image,36918/', - 'md5': '19eaa9a39cf9b9804d982e654dc791ee', - 'info_dict': { - 'id': '2133', - 'ext': 'mp4', - 'title': 'Man Wearing M&M Jacket Apparently Made In God\'s Image', - 'description': 'md5:cc12448686b5600baae9261d3e180910', - 'thumbnail': 're:^https?://.*\.jpg\?\d+$', - } - } - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - video_id = self._search_regex( - r'"videoId":\s(\d+),', webpage, 'video ID') - title = self._og_search_title(webpage) - description = self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) - - sources = re.findall(r'<source src="([^"]+)" type="([^"]+)"', webpage) - formats = [] - for src, type_ in sources: - if type_ == 'video/mp4': - formats.append({ - 'format_id': 'mp4_sd', - 'preference': 1, - 'url': src, - }) - elif type_ == 'video/webm': - formats.append({ - 'format_id': 'webm_sd', - 'preference': 0, - 'url': src, - }) - elif type_ == 'application/x-mpegURL': - formats.extend( - self._extract_m3u8_formats(src, display_id, preference=-1)) - else: - self.report_warning( - 'Encountered unexpected format: %s' % type_) - self._sort_formats(formats) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'formats': formats, - 'thumbnail': thumbnail, - 'description': description, - } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/theplatform.py youtube-dl-2019.05.20/youtube_dl/extractor/theplatform.py --- youtube-dl-2016.02.22/youtube_dl/extractor/theplatform.py 2016-02-22 10:57:19.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/theplatform.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals import re @@ -8,7 +8,8 @@ import hashlib -from .common import InfoExtractor +from .once import OnceIE +from .adobepass import AdobePassIE from ..compat import ( compat_parse_qs, compat_urllib_parse_urlparse, @@ -20,72 +21,113 @@ int_or_none, sanitized_Request, unsmuggle_url, + update_url_query, xpath_with_ns, mimetype2ext, + find_xpath_attr, ) default_ns = 'http://www.w3.org/2005/SMIL21/Language' _x = lambda p: xpath_with_ns(p, {'smil': default_ns}) -class ThePlatformBaseIE(InfoExtractor): +class ThePlatformBaseIE(OnceIE): + _TP_TLD = 'com' + def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'): - meta = self._download_xml(smil_url, video_id, note=note) - try: - error_msg = next( - n.attrib['abstract'] - for n in meta.findall(_x('.//smil:ref')) - if n.attrib.get('title') == 'Geographic Restriction' or n.attrib.get('title') == 'Expired') - except StopIteration: - pass - else: - raise ExtractorError(error_msg, expected=True) + meta = self._download_xml( + smil_url, video_id, note=note, query={'format': 'SMIL'}, + headers=self.geo_verification_headers()) + error_element = find_xpath_attr(meta, _x('.//smil:ref'), 'src') + if error_element is not None: + exception = find_xpath_attr( + error_element, 
_x('.//smil:param'), 'name', 'exception') + if exception is not None: + if exception.get('value') == 'GeoLocationBlocked': + self.raise_geo_restricted(error_element.attrib['abstract']) + elif error_element.attrib['src'].startswith( + 'http://link.theplatform.%s/s/errorFiles/Unavailable.' + % self._TP_TLD): + raise ExtractorError( + error_element.attrib['abstract'], expected=True) - formats = self._parse_smil_formats( + smil_formats = self._parse_smil_formats( meta, smil_url, video_id, namespace=default_ns, # the parameters are from syfy.com, other sites may use others, # they also work for nbc.com f4m_params={'g': 'UXWGVKRWHFSP', 'hdcore': '3.0.3'}, transform_rtmp_url=lambda streamer, src: (streamer, 'mp4:' + src)) - for _format in formats: - ext = determine_ext(_format['url']) - if ext == 'once': - _format['ext'] = 'mp4' + formats = [] + for _format in smil_formats: + if OnceIE.suitable(_format['url']): + formats.extend(self._extract_once_formats(_format['url'])) + else: + media_url = _format['url'] + if determine_ext(media_url) == 'm3u8': + hdnea2 = self._get_cookies(media_url).get('hdnea2') + if hdnea2: + _format['url'] = update_url_query(media_url, {'hdnea3': hdnea2.value}) - self._sort_formats(formats) + formats.append(_format) subtitles = self._parse_smil_subtitles(meta, default_ns) return formats, subtitles - def get_metadata(self, path, video_id): - info_url = 'http://link.theplatform.com/s/%s?format=preview' % path - info = self._download_json(info_url, video_id) + def _download_theplatform_metadata(self, path, video_id): + info_url = 'http://link.theplatform.%s/s/%s?format=preview' % (self._TP_TLD, path) + return self._download_json(info_url, video_id) + def _parse_theplatform_metadata(self, info): subtitles = {} captions = info.get('captions') if isinstance(captions, list): for caption in captions: lang, src, mime = caption.get('lang', 'en'), caption.get('src'), caption.get('type') - subtitles[lang] = [{ + subtitles.setdefault(lang, []).append({ 'ext': mimetype2ext(mime), 'url': src, - }] + }) + + duration = info.get('duration') + tp_chapters = info.get('chapters', []) + chapters = [] + if tp_chapters: + def _add_chapter(start_time, end_time): + start_time = float_or_none(start_time, 1000) + end_time = float_or_none(end_time, 1000) + if start_time is None or end_time is None: + return + chapters.append({ + 'start_time': start_time, + 'end_time': end_time, + }) + + for chapter in tp_chapters[:-1]: + _add_chapter(chapter.get('startTime'), chapter.get('endTime')) + _add_chapter(tp_chapters[-1].get('startTime'), tp_chapters[-1].get('endTime') or duration) return { 'title': info['title'], 'subtitles': subtitles, 'description': info['description'], 'thumbnail': info['defaultThumbnailUrl'], - 'duration': int_or_none(info.get('duration'), 1000), + 'duration': float_or_none(duration, 1000), + 'timestamp': int_or_none(info.get('pubDate'), 1000) or None, + 'uploader': info.get('billingCode'), + 'chapters': chapters, } + def _extract_theplatform_metadata(self, path, video_id): + info = self._download_theplatform_metadata(path, video_id) + return self._parse_theplatform_metadata(info) + -class ThePlatformIE(ThePlatformBaseIE): +class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): _VALID_URL = r'''(?x) (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P<provider_id>[^/]+)/ - (?:(?P<media>(?:(?:[^/]+/)+select/)?media/)|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))? + (?:(?:(?:[^/]+/)+select/)?(?P<media>media/(?:guid/\d+/)?)?|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))? 
|theplatform:)(?P<id>[^/\?&]+)''' _TESTS = [{ @@ -97,11 +139,15 @@ 'title': 'Blackberry\'s big, bold Z30', 'description': 'The Z30 is Blackberry\'s biggest, baddest mobile messaging device yet.', 'duration': 247, + 'timestamp': 1383239700, + 'upload_date': '20131031', + 'uploader': 'CBSI-NEW', }, 'params': { # rtmp download 'skip_download': True, }, + 'skip': '404 Not Found', }, { # from http://www.cnet.com/videos/tesla-model-s-a-second-step-towards-a-cleaner-motoring-future/ 'url': 'http://link.theplatform.com/s/kYEXFC/22d_qsQ6MIRT', @@ -110,6 +156,9 @@ 'ext': 'flv', 'description': 'md5:ac330c9258c04f9d7512cf26b9595409', 'title': 'Tesla Model S: A second step towards a cleaner motoring future', + 'timestamp': 1426176191, + 'upload_date': '20150312', + 'uploader': 'CBSI-NEW', }, 'params': { # rtmp download @@ -122,23 +171,24 @@ 'ext': 'mp4', 'description': 'md5:644ad9188d655b742f942bf2e06b002d', 'title': 'HIGHLIGHTS: USA bag first ever series Cup win', + 'uploader': 'EGSM', } }, { 'url': 'http://player.theplatform.com/p/NnzsPC/widget/select/media/4Y0TlYUr_ZT7', 'only_matching': True, }, { 'url': 'http://player.theplatform.com/p/2E2eJC/nbcNewsOffsite?guid=tdy_or_siri_150701', - 'md5': '734f3790fb5fc4903da391beeebc4836', + 'md5': 'fb96bb3d85118930a5b055783a3bd992', 'info_dict': { 'id': 'tdy_or_siri_150701', 'ext': 'mp4', 'title': 'iPhone Siri’s sassy response to a math question has people talking', 'description': 'md5:a565d1deadd5086f3331d57298ec6333', 'duration': 83.0, - 'thumbnail': 're:^https?://.*\.jpg$', 'timestamp': 1435752600, 'upload_date': '20150701', - 'categories': ['Today/Shows/Orange Room', 'Today/Sections/Money', 'Today/Topics/Tech', "Today/Topics/Editor's picks"], + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'NBCU-NEWS', }, }, { # From http://www.nbc.com/the-blacklist/video/sir-crispin-crandall/2928790?onid=137781#vc137781=1 @@ -147,6 +197,24 @@ 'only_matching': True, }] + @classmethod + def _extract_urls(cls, webpage): + m = re.search( + r'''(?x) + <meta\s+ + property=(["'])(?:og:video(?::(?:secure_)?url)?|twitter:player)\1\s+ + content=(["'])(?P<url>https?://player\.theplatform\.com/p/.+?)\2 + ''', webpage) + if m: + return [m.group('url')] + + # Are whitespaces ignored in URLs? 
+ # https://github.com/ytdl-org/youtube-dl/issues/12044 + matches = re.findall( + r'(?s)<(?:iframe|script)[^>]+src=(["\'])((?:https?:)?//player\.theplatform\.com/p/.+?)\1', webpage) + if matches: + return [re.sub(r'\s', '', list(zip(*matches))[1][0])] + @staticmethod def _sign_url(url, sig_key, sig_secret, life=600, include_qs=False): flags = '10' if include_qs else '00' @@ -155,11 +223,11 @@ def str_to_hex(str): return binascii.b2a_hex(str.encode('ascii')).decode('ascii') - def hex_to_str(hex): - return binascii.a2b_hex(hex) + def hex_to_bytes(hex): + return binascii.a2b_hex(hex.encode('ascii')) - relative_path = url.split('http://link.theplatform.com/s/')[1].split('?')[0] - clear_text = hex_to_str(flags + expiration_date + str_to_hex(relative_path)) + relative_path = re.match(r'https?://link\.theplatform\.com/s/([^?]+)', url).group(1) + clear_text = hex_to_bytes(flags + expiration_date + str_to_hex(relative_path)) checksum = hmac.new(sig_key.encode('ascii'), clear_text, hashlib.sha1).hexdigest() sig = flags + expiration_date + checksum + str_to_hex(sig_secret) return '%s&sig=%s' % (url, sig) @@ -174,10 +242,10 @@ if not provider_id: provider_id = 'dJ5BDC' - path = provider_id + path = provider_id + '/' if mobj.group('media'): - path += '/media' - path += '/' + video_id + path += mobj.group('media') + path += video_id qs_dict = compat_parse_qs(compat_urllib_parse_urlparse(url).query) if 'guid' in qs_dict: @@ -203,7 +271,7 @@ if smuggled_data.get('force_smil_url', False): smil_url = url - # Explicitly specified SMIL (see https://github.com/rg3/youtube-dl/issues/7385) + # Explicitly specified SMIL (see https://github.com/ytdl-org/youtube-dl/issues/7385) elif '/guid/' in url: headers = {} source_url = smuggled_data.get('source_url') @@ -216,7 +284,7 @@ webpage, 'smil url', group='url') path = self._search_regex( r'link\.theplatform\.com/s/((?:[^/?#&]+/)+[^/?#&]+)', smil_url, 'path') - smil_url += '?' if '?' not in smil_url else '&' + 'formats=m3u,mpeg4&format=SMIL' + smil_url += '?' if '?' 
not in smil_url else '&' + 'formats=m3u,mpeg4' elif mobj.group('config'): config_url = url + '&form=json' config_url = config_url.replace('swf/', 'config/') @@ -226,17 +294,18 @@ release_url = config['releaseUrl'] else: release_url = 'http://link.theplatform.com/s/%s?mbr=true' % path - smil_url = release_url + '&format=SMIL&formats=MPEG4&manifest=f4m' + smil_url = release_url + '&formats=MPEG4&manifest=f4m' else: - smil_url = 'http://link.theplatform.com/s/%s/meta.smil?format=smil&mbr=true' % path + smil_url = 'http://link.theplatform.com/s/%s?mbr=true' % path sig = smuggled_data.get('sig') if sig: smil_url = self._sign_url(smil_url, sig['key'], sig['secret']) formats, subtitles = self._extract_theplatform_smil(smil_url, video_id) + self._sort_formats(formats) - ret = self.get_metadata(path, video_id) + ret = self._extract_theplatform_metadata(path, video_id) combined_subtitles = self._merge_subtitles(ret.get('subtitles', {}), subtitles) ret.update({ 'id': video_id, @@ -248,49 +317,61 @@ class ThePlatformFeedIE(ThePlatformBaseIE): - _URL_TEMPLATE = '%s//feed.theplatform.com/f/%s/%s?form=json&byGuid=%s' - _VALID_URL = r'https?://feed\.theplatform\.com/f/(?P<provider_id>[^/]+)/(?P<feed_id>[^?/]+)\?(?:[^&]+&)*byGuid=(?P<id>[a-zA-Z0-9_]+)' - _TEST = { + _URL_TEMPLATE = '%s//feed.theplatform.com/f/%s/%s?form=json&%s' + _VALID_URL = r'https?://feed\.theplatform\.com/f/(?P<provider_id>[^/]+)/(?P<feed_id>[^?/]+)\?(?:[^&]+&)*(?P<filter>by(?:Gui|I)d=(?P<id>[^&]+))' + _TESTS = [{ # From http://player.theplatform.com/p/7wvmTC/MSNBCEmbeddedOffSite?guid=n_hardball_5biden_140207 'url': 'http://feed.theplatform.com/f/7wvmTC/msnbc_video-p-test?form=json&pretty=true&range=-40&byGuid=n_hardball_5biden_140207', - 'md5': '22d2b84f058d3586efcd99e57d59d314', + 'md5': '6e32495b5073ab414471b615c5ded394', 'info_dict': { 'id': 'n_hardball_5biden_140207', 'ext': 'mp4', 'title': 'The Biden factor: will Joe run in 2016?', 'description': 'Could Vice President Joe Biden be preparing a 2016 campaign? 
Mark Halperin and Sam Stein weigh in.', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20140208', 'timestamp': 1391824260, 'duration': 467.0, 'categories': ['MSNBC/Issues/Democrats', 'MSNBC/Issues/Elections/Election 2016'], + 'uploader': 'NBCU-NEWS', }, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - - video_id = mobj.group('id') - provider_id = mobj.group('provider_id') - feed_id = mobj.group('feed_id') + }, { + 'url': 'http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews?byGuid=nn_netcast_180306.Copy.01', + 'only_matching': True, + }] - real_url = self._URL_TEMPLATE % (self.http_scheme(), provider_id, feed_id, video_id) - feed = self._download_json(real_url, video_id) - entry = feed['entries'][0] + def _extract_feed_info(self, provider_id, feed_id, filter_query, video_id, custom_fields=None, asset_types_query={}, account_id=None): + real_url = self._URL_TEMPLATE % (self.http_scheme(), provider_id, feed_id, filter_query) + entry = self._download_json(real_url, video_id)['entries'][0] + main_smil_url = 'http://link.theplatform.com/s/%s/media/guid/%d/%s' % (provider_id, account_id, entry['guid']) if account_id else entry.get('plmedia$publicUrl') formats = [] subtitles = {} first_video_id = None duration = None + asset_types = [] for item in entry['media$content']: - smil_url = item['plfile$url'] + '&format=SMIL&mbr=true' + smil_url = item['plfile$url'] cur_video_id = ThePlatformIE._match_id(smil_url) if first_video_id is None: first_video_id = cur_video_id duration = float_or_none(item.get('plfile$duration')) - cur_formats, cur_subtitles = self._extract_theplatform_smil(smil_url, video_id, 'Downloading SMIL data for %s' % cur_video_id) - formats.extend(cur_formats) - subtitles = self._merge_subtitles(subtitles, cur_subtitles) + file_asset_types = item.get('plfile$assetTypes') or compat_parse_qs(compat_urllib_parse_urlparse(smil_url).query)['assetTypes'] + for asset_type in file_asset_types: + if asset_type in asset_types: + continue + asset_types.append(asset_type) + query = { + 'mbr': 'true', + 'formats': item['plfile$format'], + 'assetTypes': asset_type, + } + if asset_type in asset_types_query: + query.update(asset_types_query[asset_type]) + cur_formats, cur_subtitles = self._extract_theplatform_smil(update_url_query( + main_smil_url or smil_url, query), video_id, 'Downloading SMIL data for %s' % asset_type) + formats.extend(cur_formats) + subtitles = self._merge_subtitles(subtitles, cur_subtitles) self._sort_formats(formats) @@ -303,7 +384,7 @@ timestamp = int_or_none(entry.get('media$availableDate'), scale=1000) categories = [item['media$name'] for item in entry.get('media$categories', [])] - ret = self.get_metadata('%s/%s' % (provider_id, first_video_id), video_id) + ret = self._extract_theplatform_metadata('%s/%s' % (provider_id, first_video_id), video_id) subtitles = self._merge_subtitles(subtitles, ret['subtitles']) ret.update({ 'id': video_id, @@ -314,5 +395,17 @@ 'timestamp': timestamp, 'categories': categories, }) + if custom_fields: + ret.update(custom_fields(entry)) return ret + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + video_id = mobj.group('id') + provider_id = mobj.group('provider_id') + feed_id = mobj.group('feed_id') + filter_query = mobj.group('filter') + + return self._extract_feed_info(provider_id, feed_id, filter_query, video_id) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/thescene.py youtube-dl-2019.05.20/youtube_dl/extractor/thescene.py --- 
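Note: the _sign_url scheme patched in the theplatform.py hunk above is easier to read outside the diff. The signature concatenates a two-character flags field, a hexadecimal expiry timestamp and the hex-encoded media path, HMAC-SHA1s the hex-decoded bytes with the signing key, and appends flags, expiry, digest and hex-encoded secret as &sig=. A self-contained sketch; the expiry line is unchanged context in the hunk and is assumed here to be a hex timestamp of now plus life:

import binascii
import hashlib
import hmac
import re
import time


def sign_theplatform_url(url, sig_key, sig_secret, life=600, include_qs=False):
    flags = '10' if include_qs else '00'
    expiration_date = '%x' % (int(time.time()) + life)  # assumed unchanged context

    def str_to_hex(s):
        return binascii.b2a_hex(s.encode('ascii')).decode('ascii')

    relative_path = re.match(
        r'https?://link\.theplatform\.com/s/([^?]+)', url).group(1)
    clear_text = binascii.a2b_hex(
        (flags + expiration_date + str_to_hex(relative_path)).encode('ascii'))
    checksum = hmac.new(
        sig_key.encode('ascii'), clear_text, hashlib.sha1).hexdigest()
    sig = flags + expiration_date + checksum + str_to_hex(sig_secret)
    return '%s&sig=%s' % (url, sig)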
youtube-dl-2016.02.22/youtube_dl/extractor/thescene.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/thescene.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,44 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + +from ..compat import compat_urlparse + + +class TheSceneIE(InfoExtractor): + _VALID_URL = r'https?://thescene\.com/watch/[^/]+/(?P<id>[^/#?]+)' + + _TEST = { + 'url': 'https://thescene.com/watch/vogue/narciso-rodriguez-spring-2013-ready-to-wear', + 'info_dict': { + 'id': '520e8faac2b4c00e3c6e5f43', + 'ext': 'mp4', + 'title': 'Narciso Rodriguez: Spring 2013 Ready-to-Wear', + 'display_id': 'narciso-rodriguez-spring-2013-ready-to-wear', + 'duration': 127, + 'series': 'Style.com Fashion Shows', + 'season': 'Ready To Wear Spring 2013', + 'tags': list, + 'categories': list, + 'upload_date': '20120913', + 'timestamp': 1347512400, + 'uploader': 'vogue', + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + player_url = compat_urlparse.urljoin( + url, + self._html_search_regex( + r'id=\'js-player-script\'[^>]+src=\'(.+?)\'', webpage, 'player url')) + + return { + '_type': 'url_transparent', + 'display_id': display_id, + 'url': player_url, + 'ie_key': 'CondeNast', + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/thesixtyone.py youtube-dl-2019.05.20/youtube_dl/extractor/thesixtyone.py --- youtube-dl-2016.02.22/youtube_dl/extractor/thesixtyone.py 2016-02-14 22:26:17.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/thesixtyone.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,102 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import unified_strdate - - -class TheSixtyOneIE(InfoExtractor): - _VALID_URL = r'''(?x)https?://(?:www\.)?thesixtyone\.com/ - (?:.*?/)* - (?: - s| - song/comments/list| - song - )/(?P<id>[A-Za-z0-9]+)/?$''' - _SONG_URL_TEMPLATE = 'http://thesixtyone.com/s/{0:}' - _SONG_FILE_URL_TEMPLATE = 'http://{audio_server:}/thesixtyone_production/audio/{0:}_stream' - _THUMBNAIL_URL_TEMPLATE = '{photo_base_url:}_desktop' - _TESTS = [ - { - 'url': 'http://www.thesixtyone.com/s/SrE3zD7s1jt/', - 'md5': '821cc43b0530d3222e3e2b70bb4622ea', - 'info_dict': { - 'id': 'SrE3zD7s1jt', - 'ext': 'mp3', - 'title': 'CASIO - Unicorn War Mixtape', - 'thumbnail': 're:^https?://.*_desktop$', - 'upload_date': '20071217', - 'duration': 3208, - } - }, - { - 'url': 'http://www.thesixtyone.com/song/comments/list/SrE3zD7s1jt', - 'only_matching': True, - }, - { - 'url': 'http://www.thesixtyone.com/s/ULoiyjuJWli#/s/SrE3zD7s1jt/', - 'only_matching': True, - }, - { - 'url': 'http://www.thesixtyone.com/#/s/SrE3zD7s1jt/', - 'only_matching': True, - }, - { - 'url': 'http://www.thesixtyone.com/song/SrE3zD7s1jt/', - 'only_matching': True, - }, - ] - - _DECODE_MAP = { - 'x': 'a', - 'm': 'b', - 'w': 'c', - 'q': 'd', - 'n': 'e', - 'p': 'f', - 'a': '0', - 'h': '1', - 'e': '2', - 'u': '3', - 's': '4', - 'i': '5', - 'o': '6', - 'y': '7', - 'r': '8', - 'c': '9' - } - - def _real_extract(self, url): - song_id = self._match_id(url) - - webpage = self._download_webpage( - self._SONG_URL_TEMPLATE.format(song_id), song_id) - - song_data = self._parse_json(self._search_regex( - r'"%s":\s(\{.*?\})' % song_id, webpage, 'song_data'), song_id) - - if self._search_regex(r'(t61\.s3_audio_load\s*=\s*1\.0;)', webpage, 's3_audio_load marker', default=None): - song_data['audio_server'] = 
's3.amazonaws.com' - else: - song_data['audio_server'] = song_data['audio_server'] + '.thesixtyone.com' - - keys = [self._DECODE_MAP.get(s, s) for s in song_data['key']] - url = self._SONG_FILE_URL_TEMPLATE.format( - "".join(reversed(keys)), **song_data) - - formats = [{ - 'format_id': 'sd', - 'url': url, - 'ext': 'mp3', - }] - - return { - 'id': song_id, - 'title': '{artist:} - {name:}'.format(**song_data), - 'formats': formats, - 'comment_count': song_data.get('comments_count'), - 'duration': song_data.get('play_time'), - 'like_count': song_data.get('score'), - 'thumbnail': self._THUMBNAIL_URL_TEMPLATE.format(**song_data), - 'upload_date': unified_strdate(song_data.get('publish_date')), - } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/thestar.py youtube-dl-2019.05.20/youtube_dl/extractor/thestar.py --- youtube-dl-2016.02.22/youtube_dl/extractor/thestar.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/thestar.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,36 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class TheStarIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?thestar\.com/(?:[^/]+/)*(?P<id>.+)\.html' + _TEST = { + 'url': 'http://www.thestar.com/life/2016/02/01/mankind-why-this-woman-started-a-men-s-skincare-line.html', + 'md5': '2c62dd4db2027e35579fefb97a8b6554', + 'info_dict': { + 'id': '4732393888001', + 'ext': 'mp4', + 'title': 'Mankind: Why this woman started a men\'s skin care line', + 'description': 'Robert Cribb talks to Young Lee, the founder of Uncle Peter\'s MAN.', + 'uploader_id': '794267642001', + 'timestamp': 1454353482, + 'upload_date': '20160201', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/794267642001/default_default/index.html?videoId=%s' + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + brightcove_id = self._search_regex( + r'mainartBrightcoveVideoId["\']?\s*:\s*["\']?(\d+)', + webpage, 'brightcove id') + return self.url_result( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, + 'BrightcoveNew', brightcove_id) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/thesun.py youtube-dl-2019.05.20/youtube_dl/extractor/thesun.py --- youtube-dl-2016.02.22/youtube_dl/extractor/thesun.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/thesun.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,32 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .ooyala import OoyalaIE + + +class TheSunIE(InfoExtractor): + _VALID_URL = r'https://(?:www\.)?thesun\.co\.uk/[^/]+/(?P<id>\d+)' + _TEST = { + 'url': 'https://www.thesun.co.uk/tvandshowbiz/2261604/orlando-bloom-and-katy-perry-post-adorable-instagram-video-together-celebrating-thanksgiving-after-split-rumours/', + 'info_dict': { + 'id': '2261604', + 'title': 'md5:cba22f48bad9218b64d5bbe0e16afddf', + }, + 'playlist_count': 2, + } + + def _real_extract(self, url): + article_id = self._match_id(url) + + webpage = self._download_webpage(url, article_id) + + entries = [] + for ooyala_id in re.findall( + r'<[^>]+\b(?:id\s*=\s*"thesun-ooyala-player-|data-content-id\s*=\s*")([^"]+)', + webpage): + entries.append(OoyalaIE._build_url_result(ooyala_id)) + + return self.playlist_result( + entries, article_id, self._og_search_title(webpage, fatal=False)) diff -Nru 
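Note: the removed TheSixtyOne extractor above is a compact example of undoing a client-side obfuscation scheme: each character of the page's song 'key' is mapped through a fixed substitution table, and the result is reversed to obtain the audio file name. A standalone sketch; the sample key and server below are illustrative only:

DECODE_MAP = {
    'x': 'a', 'm': 'b', 'w': 'c', 'q': 'd', 'n': 'e', 'p': 'f',
    'a': '0', 'h': '1', 'e': '2', 'u': '3', 's': '4', 'i': '5',
    'o': '6', 'y': '7', 'r': '8', 'c': '9',
}


def decode_song_key(key):
    # unmapped characters pass through unchanged, as in the original
    return ''.join(reversed([DECODE_MAP.get(ch, ch) for ch in key]))


def stream_url(audio_server, key):
    return 'http://%s/thesixtyone_production/audio/%s_stream' % (
        audio_server, decode_song_key(key))


print(stream_url('s3.amazonaws.com', 'xmwq'))  # hypothetical key, decodes to 'dcba'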
youtube-dl-2016.02.22/youtube_dl/extractor/theweatherchannel.py youtube-dl-2019.05.20/youtube_dl/extractor/theweatherchannel.py --- youtube-dl-2016.02.22/youtube_dl/extractor/theweatherchannel.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/theweatherchannel.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,79 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .theplatform import ThePlatformIE +from ..utils import ( + determine_ext, + parse_duration, +) + + +class TheWeatherChannelIE(ThePlatformIE): + _VALID_URL = r'https?://(?:www\.)?weather\.com/(?:[^/]+/)*video/(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'https://weather.com/series/great-outdoors/video/ice-climber-is-in-for-a-shock', + 'md5': 'ab924ac9574e79689c24c6b95e957def', + 'info_dict': { + 'id': 'cc82397e-cc3f-4d11-9390-a785add090e8', + 'ext': 'mp4', + 'title': 'Ice Climber Is In For A Shock', + 'description': 'md5:55606ce1378d4c72e6545e160c9d9695', + 'uploader': 'TWC - Digital (No Distro)', + 'uploader_id': '6ccd5455-16bb-46f2-9c57-ff858bb9f62c', + } + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + drupal_settings = self._parse_json(self._search_regex( + r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', + webpage, 'drupal settings'), display_id) + video_id = drupal_settings['twc']['contexts']['node']['uuid'] + video_data = self._download_json( + 'https://dsx.weather.com/cms/v4/asset-collection/en_US/' + video_id, video_id) + seo_meta = video_data.get('seometa', {}) + title = video_data.get('title') or seo_meta['title'] + + urls = [] + thumbnails = [] + formats = [] + for variant_id, variant_url in video_data.get('variants', []).items(): + variant_url = variant_url.strip() + if not variant_url or variant_url in urls: + continue + urls.append(variant_url) + ext = determine_ext(variant_url) + if ext == 'jpg': + thumbnails.append({ + 'url': variant_url, + 'id': variant_id, + }) + elif ThePlatformIE.suitable(variant_url): + tp_formats, _ = self._extract_theplatform_smil(variant_url, video_id) + formats.extend(tp_formats) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + variant_url, video_id, 'mp4', 'm3u8_native', + m3u8_id=variant_id, fatal=False)) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + variant_url, video_id, f4m_id=variant_id, fatal=False)) + else: + formats.append({ + 'url': variant_url, + 'format_id': variant_id, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': video_data.get('description') or seo_meta.get('description') or seo_meta.get('og:description'), + 'duration': parse_duration(video_data.get('duration')), + 'uploader': video_data.get('providername'), + 'uploader_id': video_data.get('providerid'), + 'thumbnails': thumbnails, + 'formats': formats, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/thisamericanlife.py youtube-dl-2019.05.20/youtube_dl/extractor/thisamericanlife.py --- youtube-dl-2016.02.22/youtube_dl/extractor/thisamericanlife.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/thisamericanlife.py 2019-06-07 18:49:07.000000000 +0000 @@ -13,7 +13,7 @@ 'ext': 'm4a', 'title': '487: Harper High School, Part One', 'description': 'md5:ee40bdf3fb96174a9027f76dbecea655', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, }, { 'url': 'http://www.thisamericanlife.org/play_full.php?play=487', 
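Note: the TheWeatherChannel extractor above follows a shape that recurs across youtube-dl: iterate over variant URLs, deduplicate, and fan out on the apparent file type (thumbnails versus HLS/HDS manifests versus plain progressive files). A condensed standalone sketch of that triage; youtube-dl's determine_ext is approximated with urlparse, and manifest expansion is stubbed with plain dicts:

import posixpath
from urllib.parse import urlparse


def triage_variants(variants):
    seen, thumbnails, formats = set(), [], []
    for variant_id, variant_url in variants.items():
        variant_url = variant_url.strip()
        if not variant_url or variant_url in seen:
            continue
        seen.add(variant_url)
        ext = posixpath.splitext(urlparse(variant_url).path)[1].lstrip('.')
        if ext == 'jpg':
            thumbnails.append({'url': variant_url, 'id': variant_id})
        elif ext in ('m3u8', 'f4m'):
            # stand-in for _extract_m3u8_formats / _extract_f4m_formats
            formats.append({'manifest': ext, 'url': variant_url,
                            'format_id': variant_id})
        else:
            formats.append({'url': variant_url, 'format_id': variant_id})
    return thumbnails, formats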
diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/thisav.py youtube-dl-2019.05.20/youtube_dl/extractor/thisav.py --- youtube-dl-2016.02.22/youtube_dl/extractor/thisav.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/thisav.py 2019-06-07 18:49:07.000000000 +0000 @@ -4,12 +4,13 @@ import re from .common import InfoExtractor -from ..utils import determine_ext +from ..utils import remove_end class ThisAVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?thisav\.com/video/(?P<id>[0-9]+)/.*' - _TEST = { + _TESTS = [{ + # jwplayer 'url': 'http://www.thisav.com/video/47734/%98%26sup1%3B%83%9E%83%82---just-fit.html', 'md5': '0480f1ef3932d901f0e0e719f188f19b', 'info_dict': { @@ -19,29 +20,54 @@ 'uploader': 'dj7970', 'uploader_id': 'dj7970' } - } + }, { + # html5 media + 'url': 'http://www.thisav.com/video/242352/nerdy-18yo-big-ass-tattoos-and-glasses.html', + 'md5': 'ba90c076bd0f80203679e5b60bf523ee', + 'info_dict': { + 'id': '242352', + 'ext': 'mp4', + 'title': 'Nerdy 18yo Big Ass Tattoos and Glasses', + 'uploader': 'cybersluts', + 'uploader_id': 'cybersluts', + }, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'<h1>([^<]*)</h1>', webpage, 'title') + title = remove_end(self._html_search_regex( + r'<title>([^<]+)', webpage, 'title'), + ' - 視頻 - ThisAV.com-世界第一中文成人娛樂網站') video_url = self._html_search_regex( - r"addVariable\('file','([^']+)'\);", webpage, 'video url') + r"addVariable\('file','([^']+)'\);", webpage, 'video url', default=None) + if video_url: + info_dict = { + 'formats': [{ + 'url': video_url, + }], + } + else: + entries = self._parse_html5_media_entries(url, webpage, video_id) + if entries: + info_dict = entries[0] + else: + info_dict = self._extract_jwplayer_data( + webpage, video_id, require_title=False) uploader = self._html_search_regex( - r': ([^<]+)', + r': ([^<]+)', webpage, 'uploader name', fatal=False) uploader_id = self._html_search_regex( - r': (?:[^<]+)', + r': (?:[^<]+)', webpage, 'uploader id', fatal=False) - ext = determine_ext(video_url) - return { + info_dict.update({ 'id': video_id, - 'url': video_url, 'uploader': uploader, 'uploader_id': uploader_id, 'title': title, - 'ext': ext, - } + }) + + return info_dict diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/thisoldhouse.py youtube-dl-2019.05.20/youtube_dl/extractor/thisoldhouse.py --- youtube-dl-2016.02.22/youtube_dl/extractor/thisoldhouse.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/thisoldhouse.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,44 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import try_get + + +class ThisOldHouseIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?thisoldhouse\.com/(?:watch|how-to|tv-episode)/(?P[^/?#]+)' + _TESTS = [{ + 'url': 'https://www.thisoldhouse.com/how-to/how-to-build-storage-bench', + 'md5': '568acf9ca25a639f0c4ff905826b662f', + 'info_dict': { + 'id': '2REGtUDQ', + 'ext': 'mp4', + 'title': 'How to Build a Storage Bench', + 'description': 'In the workshop, Tom Silva and Kevin O\'Connor build a storage bench for an entryway.', + 'timestamp': 1442548800, + 'upload_date': '20150918', + } + }, { + 'url': 'https://www.thisoldhouse.com/watch/arlington-arts-crafts-arts-and-crafts-class-begins', + 'only_matching': True, + }, { + 'url': 
'https://www.thisoldhouse.com/tv-episode/ask-toh-shelf-rough-electric', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex( + (r'data-mid=(["\'])(?P(?:(?!\1).)+)\1', + r'id=(["\'])inline-video-player-(?P(?:(?!\1).)+)\1'), + webpage, 'video id', default=None, group='id') + if not video_id: + drupal_settings = self._parse_json(self._search_regex( + r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', + webpage, 'drupal settings'), display_id) + video_id = try_get( + drupal_settings, lambda x: x['jwplatform']['video_id'], + compat_str) or list(drupal_settings['comScore'])[0] + return self.url_result('jwplatform:' + video_id, 'JWPlatform', video_id) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/threeqsdn.py youtube-dl-2019.05.20/youtube_dl/extractor/threeqsdn.py --- youtube-dl-2016.02.22/youtube_dl/extractor/threeqsdn.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/threeqsdn.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,142 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + js_to_json, + mimetype2ext, +) + + +class ThreeQSDNIE(InfoExtractor): + IE_NAME = '3qsdn' + IE_DESC = '3Q SDN' + _VALID_URL = r'https?://playout\.3qsdn\.com/(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _TESTS = [{ + # ondemand from http://www.philharmonie.tv/veranstaltung/26/ + 'url': 'http://playout.3qsdn.com/0280d6b9-1215-11e6-b427-0cc47a188158?protocol=http', + 'md5': 'ab040e37bcfa2e0c079f92cb1dd7f6cd', + 'info_dict': { + 'id': '0280d6b9-1215-11e6-b427-0cc47a188158', + 'ext': 'mp4', + 'title': '0280d6b9-1215-11e6-b427-0cc47a188158', + 'is_live': False, + }, + 'expected_warnings': ['Failed to download MPD manifest', 'Failed to parse JSON'], + }, { + # live video stream + 'url': 'https://playout.3qsdn.com/d755d94b-4ab9-11e3-9162-0025907ad44f?js=true', + 'info_dict': { + 'id': 'd755d94b-4ab9-11e3-9162-0025907ad44f', + 'ext': 'mp4', + 'title': 're:^d755d94b-4ab9-11e3-9162-0025907ad44f [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'is_live': True, + }, + 'params': { + 'skip_download': True, # m3u8 downloads + }, + 'expected_warnings': ['Failed to download MPD manifest'], + }, { + # live audio stream + 'url': 'http://playout.3qsdn.com/9edf36e0-6bf2-11e2-a16a-9acf09e2db48', + 'only_matching': True, + }, { + # live audio stream with some 404 URLs + 'url': 'http://playout.3qsdn.com/ac5c3186-777a-11e2-9c30-9acf09e2db48', + 'only_matching': True, + }, { + # geo restricted with 'This content is not available in your country' + 'url': 'http://playout.3qsdn.com/d63a3ffe-75e8-11e2-9c30-9acf09e2db48', + 'only_matching': True, + }, { + # geo restricted with 'playout.3qsdn.com/forbidden' + 'url': 'http://playout.3qsdn.com/8e330f26-6ae2-11e2-a16a-9acf09e2db48', + 'only_matching': True, + }, { + # live video with rtmp link + 'url': 'https://playout.3qsdn.com/6092bb9e-8f72-11e4-a173-002590c750be', + 'only_matching': True, + }] + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r']+\b(?:data-)?src=(["\'])(?P%s.*?)\1' % ThreeQSDNIE._VALID_URL, webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + video_id = self._match_id(url) + + js = self._download_webpage( + 'http://playout.3qsdn.com/%s' % video_id, video_id, + query={'js': 'true'}) + + if any(p in js for p in ( + '>This content is not available in your 
country', + 'playout.3qsdn.com/forbidden')): + self.raise_geo_restricted() + + stream_content = self._search_regex( + r'streamContent\s*:\s*(["\'])(?P.+?)\1', js, + 'stream content', default='demand', group='content') + + live = stream_content == 'live' + + stream_type = self._search_regex( + r'streamType\s*:\s*(["\'])(?Paudio|video)\1', js, + 'stream type', default='video', group='type') + + formats = [] + urls = set() + + def extract_formats(item_url, item={}): + if not item_url or item_url in urls: + return + urls.add(item_url) + ext = mimetype2ext(item.get('type')) or determine_ext(item_url, default_ext=None) + if ext == 'mpd': + formats.extend(self._extract_mpd_formats( + item_url, video_id, mpd_id='mpd', fatal=False)) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + item_url, video_id, 'mp4', + entry_protocol='m3u8' if live else 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + item_url, video_id, f4m_id='hds', fatal=False)) + else: + if not self._is_valid_url(item_url, video_id): + return + formats.append({ + 'url': item_url, + 'format_id': item.get('quality'), + 'ext': 'mp4' if item_url.startswith('rtsp') else ext, + 'vcodec': 'none' if stream_type == 'audio' else None, + }) + + for item_js in re.findall(r'({[^{]*?\b(?:src|source)\s*:\s*["\'].+?})', js): + f = self._parse_json( + item_js, video_id, transform_source=js_to_json, fatal=False) + if not f: + continue + extract_formats(f.get('src'), f) + + # More relaxed version to collect additional URLs and acting + # as a future-proof fallback + for _, src in re.findall(r'\b(?:src|source)\s*:\s*(["\'])((?:https?|rtsp)://.+?)\1', js): + extract_formats(src) + + self._sort_formats(formats) + + title = self._live_title(video_id) if live else video_id + + return { + 'id': video_id, + 'title': title, + 'is_live': live, + 'formats': formats, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/thvideo.py youtube-dl-2019.05.20/youtube_dl/extractor/thvideo.py --- youtube-dl-2016.02.22/youtube_dl/extractor/thvideo.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/thvideo.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,84 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - unified_strdate -) - - -class THVideoIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?thvideo\.tv/(?:v/th|mobile\.php\?cid=)(?P[0-9]+)' - _TEST = { - 'url': 'http://thvideo.tv/v/th1987/', - 'md5': 'fa107b1f73817e325e9433505a70db50', - 'info_dict': { - 'id': '1987', - 'ext': 'mp4', - 'title': '【动画】秘封活动记录 ~ The Sealed Esoteric History.分镜稿预览', - 'display_id': 'th1987', - 'thumbnail': 'http://thvideo.tv/uploadfile/2014/0722/20140722013459856.jpg', - 'description': '社团京都幻想剧团的第一个东方二次同人动画作品「秘封活动记录 ~ The Sealed Esoteric History.」 本视频是该动画第一期的分镜草稿...', - 'upload_date': '20140722' - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - # extract download link from mobile player page - webpage_player = self._download_webpage( - 'http://thvideo.tv/mobile.php?cid=%s-0' % (video_id), - video_id, note='Downloading video source page') - video_url = self._html_search_regex( - r'', webpage, - 'upload date', fatal=False)) - - return { - 'id': video_id, - 'ext': 'mp4', - 'url': video_url, - 'title': title, - 'display_id': display_id, - 'thumbnail': thumbnail, - 'description': description, - 'upload_date': upload_date - } - - -class THVideoPlaylistIE(InfoExtractor): 
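Note: the threeqsdn.py extractor above collects stream URLs from raw player JavaScript in two passes, first structured {src: ...} object literals, then a deliberately loose pattern as a future-proof fallback, with a set guarding against duplicates. A minimal sketch of the same idea reusing the hunk's own regular expressions; the original parses the literals via js_to_json, for which a plain regex stands in here:

import re


def collect_sources(js):
    seen, ordered = set(), []

    def add(url):
        if url and url not in seen:
            seen.add(url)
            ordered.append(url)

    # pass 1: object literals carrying a src/source key
    for obj in re.findall(r'({[^{]*?\b(?:src|source)\s*:\s*["\'].+?})', js):
        m = re.search(r'\b(?:src|source)\s*:\s*["\'](.+?)["\']', obj)
        if m:
            add(m.group(1))
    # pass 2: relaxed fallback for anything pass 1 missed
    for _, url in re.findall(
            r'\b(?:src|source)\s*:\s*(["\'])((?:https?|rtsp)://.+?)\1', js):
        add(url)
    return ordered


print(collect_sources('var a = {src: "https://cdn.example.com/v.m3u8"};'))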
- _VALID_URL = r'http?://(?:www\.)?thvideo\.tv/mylist(?P<id>[0-9]+)' - _TEST = { - 'url': 'http://thvideo.tv/mylist2', - 'info_dict': { - 'id': '2', - 'title': '幻想万華鏡', - }, - 'playlist_mincount': 23, - } - - def _real_extract(self, url): - playlist_id = self._match_id(url) - - webpage = self._download_webpage(url, playlist_id) - list_title = self._html_search_regex( - r'
    (.*?)\d+) + ''' + _TESTS = [{ + 'url': 'https://m.tiktok.com/v/6606727368545406213.html', + 'md5': 'd584b572e92fcd48888051f238022420', + 'info_dict': { + 'id': '6606727368545406213', + 'ext': 'mp4', + 'title': 'Zureeal', + 'description': '#bowsette#mario#cosplay#uk#lgbt#gaming#asian#bowsettecosplay', + 'thumbnail': r're:^https?://.*~noop.image', + 'uploader': 'Zureeal', + 'timestamp': 1538248586, + 'upload_date': '20180929', + 'comment_count': int, + 'repost_count': int, + } + }, { + 'url': 'https://www.tiktok.com/share/video/6606727368545406213', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage( + 'https://m.tiktok.com/v/%s.html' % video_id, video_id) + data = self._parse_json(self._search_regex( + r'\bdata\s*=\s*({.+?})\s*;', webpage, 'data'), video_id) + return self._extract_aweme(data) + + +class TikTokUserIE(TikTokBaseIE): + _VALID_URL = r'''(?x) + https?:// + (?: + (?:m\.)?tiktok\.com/h5/share/usr| + (?:www\.)?tiktok\.com/share/user + ) + /(?P\d+) + ''' + _TESTS = [{ + 'url': 'https://m.tiktok.com/h5/share/usr/188294915489964032.html', + 'info_dict': { + 'id': '188294915489964032', + }, + 'playlist_mincount': 24, + }, { + 'url': 'https://www.tiktok.com/share/user/188294915489964032', + 'only_matching': True, + }] + + def _real_extract(self, url): + user_id = self._match_id(url) + data = self._download_json( + 'https://m.tiktok.com/h5/share/usr/list/%s/' % user_id, user_id, + query={'_signature': '_'}) + entries = [] + for aweme in data['aweme_list']: + try: + entry = self._extract_aweme(aweme) + except ExtractorError: + continue + entry['extractor_key'] = TikTokIE.ie_key() + entries.append(entry) + return self.playlist_result(entries, user_id) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/tinypic.py youtube-dl-2019.05.20/youtube_dl/extractor/tinypic.py --- youtube-dl-2016.02.22/youtube_dl/extractor/tinypic.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/tinypic.py 2019-06-07 18:49:07.000000000 +0000 @@ -9,7 +9,7 @@ class TinyPicIE(InfoExtractor): IE_NAME = 'tinypic' IE_DESC = 'tinypic.com videos' - _VALID_URL = r'http://(?:.+?\.)?tinypic\.com/player\.php\?v=(?P[^&]+)&s=\d+' + _VALID_URL = r'https?://(?:.+?\.)?tinypic\.com/player\.php\?v=(?P[^&]+)&s=\d+' _TESTS = [ { @@ -34,7 +34,7 @@ webpage = self._download_webpage(url, video_id, 'Downloading page') mobj = re.search(r'(?m)fo\.addVariable\("file",\s"(?P[\da-z]+)"\);\n' - '\s+fo\.addVariable\("s",\s"(?P\d+)"\);', webpage) + r'\s+fo\.addVariable\("s",\s"(?P\d+)"\);', webpage) if mobj is None: raise ExtractorError('Video %s does not exist' % video_id, expected=True) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/tlc.py youtube-dl-2019.05.20/youtube_dl/extractor/tlc.py --- youtube-dl-2016.02.22/youtube_dl/extractor/tlc.py 2016-01-01 11:16:56.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/tlc.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,48 +0,0 @@ -# encoding: utf-8 -from __future__ import unicode_literals -import re - -from .common import InfoExtractor -from .brightcove import BrightcoveLegacyIE -from ..compat import compat_urlparse - - -class TlcDeIE(InfoExtractor): - IE_NAME = 'tlc.de' - _VALID_URL = r'http://www\.tlc\.de/sendungen/[^/]+/videos/(?P[^/?]+)' - - _TEST = { - 'url': 'http://www.tlc.de/sendungen/breaking-amish/videos/#3235167922001', - 'info_dict': { - 'id': '3235167922001', - 'ext': 'mp4', - 'title': 'Breaking Amish: Die Welt da draußen', - 'uploader': 'Discovery Networks 
- Germany', - 'description': ( - 'Vier Amische und eine Mennonitin wagen in New York' - ' den Sprung in ein komplett anderes Leben. Begleitet sie auf' - ' ihrem spannenden Weg.'), - }, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - title = mobj.group('title') - webpage = self._download_webpage(url, title) - iframe_url = self._search_regex( - '<iframe src="(http://www\.tlc\.de/wp-content/.+?)"', webpage, - 'iframe url') - # Otherwise we don't get the correct 'BrightcoveExperience' element, - # example: http://www.tlc.de/sendungen/cake-boss/videos/cake-boss-cannoli-drama/ - iframe_url = iframe_url.replace('.htm?', '.php?') - url_fragment = compat_urlparse.urlparse(url).fragment - if url_fragment: - # Since the fragment is not send to the server, we always get the same iframe - iframe_url = re.sub(r'playlist=(\d+)', 'playlist=%s' % url_fragment, iframe_url) - iframe = self._download_webpage(iframe_url, title) - - return { - '_type': 'url', - 'url': BrightcoveLegacyIE._extract_brightcove_url(iframe), - 'ie': BrightcoveLegacyIE.ie_key(), - } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/tmz.py youtube-dl-2019.05.20/youtube_dl/extractor/tmz.py --- youtube-dl-2016.02.22/youtube_dl/extractor/tmz.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/tmz.py 2019-06-07 18:49:07.000000000 +0000 @@ -5,43 +5,42 @@ class TMZIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tmz\.com/videos/(?P<id>[^/]+)/?' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?tmz\.com/videos/(?P<id>[^/?#]+)' + _TESTS = [{ 'url': 'http://www.tmz.com/videos/0_okj015ty/', - 'md5': '791204e3bf790b1426cb2db0706184c0', + 'md5': '4d22a51ef205b6c06395d8394f72d560', 'info_dict': { 'id': '0_okj015ty', - 'url': 'http://tmz.vo.llnwd.net/o28/2014-03/13/0_okj015ty_0_rt8ro3si_2.mp4', 'ext': 'mp4', 'title': 'Kim Kardashian\'s Boobs Unlock a Mystery!', 'description': 'Did Kim Kardasain try to one-up Khloe by one-upping Kylie??? Or is she just showing off her amazing boobs?', - 'thumbnail': r're:http://cdnbakmi\.kaltura\.com/.*thumbnail.*', + 'timestamp': 1394747163, + 'uploader_id': 'batchUser', + 'upload_date': '20140313', } - } + }, { + 'url': 'http://www.tmz.com/videos/0-cegprt2p/', + 'only_matching': True, + }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - return { - 'id': video_id, - 'url': self._html_search_meta('VideoURL', webpage, fatal=True), - 'title': self._og_search_title(webpage), - 'description': self._og_search_description(webpage), - 'thumbnail': self._html_search_meta('ThumbURL', webpage), - } + video_id = self._match_id(url).replace('-', '_') + return self.url_result('kaltura:591531:%s' % video_id, 'Kaltura', video_id) class TMZArticleIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?tmz\.com/\d{4}/\d{2}/\d{2}/(?P<id>[^/]+)/?' _TEST = { 'url': 'http://www.tmz.com/2015/04/19/bobby-brown-bobbi-kristina-awake-video-concert', - 'md5': 'e482a414a38db73087450e3a6ce69d00', + 'md5': '3316ff838ae5bb7f642537825e1e90d2', 'info_dict': { 'id': '0_6snoelag', - 'ext': 'mp4', + 'ext': 'mov', 'title': 'Bobby Brown Tells Crowd ... Bobbi Kristina is Awake', 'description': 'Bobby Brown stunned his audience during a concert Saturday night, when he told the crowd, "Bobbi is awake. 
She\'s watching me."', + 'timestamp': 1429467813, + 'upload_date': '20150419', + 'uploader_id': 'batchUser', } } @@ -49,12 +48,9 @@ video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - embedded_video_info_str = self._html_search_regex( - r'tmzVideoEmbedV2\("([^)]+)"\);', webpage, 'embedded video info') - - embedded_video_info = self._parse_json( - embedded_video_info_str, video_id, - transform_source=lambda s: s.replace('\\', '')) + embedded_video_info = self._parse_json(self._html_search_regex( + r'tmzVideoEmbed\(({.+?})\);', webpage, 'embedded video info'), + video_id) return self.url_result( 'http://www.tmz.com/videos/%s/' % embedded_video_info['id']) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/tnaflix.py youtube-dl-2019.05.20/youtube_dl/extractor/tnaflix.py --- youtube-dl-2016.02.22/youtube_dl/extractor/tnaflix.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/tnaflix.py 2019-06-07 18:49:07.000000000 +0000 @@ -10,6 +10,7 @@ int_or_none, parse_duration, str_to_int, + unescapeHTML, xpath_text, ) @@ -17,9 +18,12 @@ class TNAFlixNetworkBaseIE(InfoExtractor): # May be overridden in descendants if necessary _CONFIG_REGEX = [ - r'flashvars\.config\s*=\s*escape\("([^"]+)"', - r'<input[^>]+name="config\d?" value="([^"]+)"', + r'flashvars\.config\s*=\s*escape\("(?P<url>[^"]+)"', + r'<input[^>]+name="config\d?" value="(?P<url>[^"]+)"', + r'config\s*=\s*(["\'])(?P<url>(?:https?:)?//(?:(?!\1).)+)\1', ] + _HOST = 'tna' + _VKEY_SUFFIX = '' _TITLE_REGEX = r'<input[^>]+name="title" value="([^"]+)"' _DESCRIPTION_REGEX = r'<input[^>]+name="description" value="([^"]+)"' _UPLOADER_REGEX = r'<input[^>]+name="username" value="([^"]+)"' @@ -71,21 +75,34 @@ def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - display_id = mobj.group('display_id') + for display_id_key in ('display_id', 'display_id_2'): + if display_id_key in mobj.groupdict(): + display_id = mobj.group(display_id_key) + if display_id: + break + else: + display_id = video_id webpage = self._download_webpage(url, display_id) cfg_url = self._proto_relative_url(self._html_search_regex( - self._CONFIG_REGEX, webpage, 'flashvars.config'), 'http:') + self._CONFIG_REGEX, webpage, 'flashvars.config', default=None, + group='url'), 'http:') + + if not cfg_url: + inputs = self._hidden_inputs(webpage) + cfg_url = ('https://cdn-fck.%sflix.com/%sflix/%s%s.fid?key=%s&VID=%s&premium=1&vip=1&alpha' + % (self._HOST, self._HOST, inputs['vkey'], self._VKEY_SUFFIX, inputs['nkey'], video_id)) cfg_xml = self._download_xml( cfg_url, display_id, 'Downloading metadata', - transform_source=fix_xml_ampersands) + transform_source=fix_xml_ampersands, headers={'Referer': url}) formats = [] def extract_video_url(vl): - return re.sub('speed=\d+', 'speed=', vl.text) + # Any URL modification now results in HTTP Error 403: Forbidden + return unescapeHTML(vl.text) video_link = cfg_xml.find('./videoLink') if video_link is not None: @@ -114,10 +131,14 @@ xpath_text(cfg_xml, './startThumb', 'thumbnail'), 'http:') thumbnails = self._extract_thumbnails(cfg_xml) - title = self._html_search_regex( - self._TITLE_REGEX, webpage, 'title') if self._TITLE_REGEX else self._og_search_title(webpage) + title = None + if self._TITLE_REGEX: + title = self._html_search_regex( + self._TITLE_REGEX, webpage, 'title', default=None) + if not title: + title = self._og_search_title(webpage) - age_limit = self._rta_search(webpage) + age_limit = self._rta_search(webpage) or 18 duration = 
parse_duration(self._html_search_meta( 'duration', webpage, 'duration', default=None)) @@ -132,7 +153,7 @@ average_rating = float_or_none(extract_field(self._AVERAGE_RATING_REGEX, 'average rating')) categories_str = extract_field(self._CATEGORIES_REGEX, 'categories') - categories = categories_str.split(', ') if categories_str is not None else [] + categories = [c.strip() for c in categories_str.split(',')] if categories_str is not None else [] return { 'id': video_id, @@ -152,27 +173,60 @@ } -class TNAFlixIE(TNAFlixNetworkBaseIE): +class TNAFlixNetworkEmbedIE(TNAFlixNetworkBaseIE): + _VALID_URL = r'https?://player\.(?:tna|emp)flix\.com/video/(?P<id>\d+)' + + _TITLE_REGEX = r'<title>([^<]+)' + + _TESTS = [{ + 'url': 'https://player.tnaflix.com/video/6538', + 'info_dict': { + 'id': '6538', + 'display_id': '6538', + 'ext': 'mp4', + 'title': 'Educational xxx video', + 'thumbnail': r're:https?://.*\.jpg$', + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://player.empflix.com/video/33051', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [url for _, url in re.findall( + r']+?src=(["\'])(?P(?:https?:)?//player\.(?:tna|emp)flix\.com/video/\d+)\1', + webpage)] + + +class TNAEMPFlixBaseIE(TNAFlixNetworkBaseIE): + _DESCRIPTION_REGEX = r'(?s)>Description:]+>(.+?)<' + _UPLOADER_REGEX = r'by\s*]+\bhref=["\']/profile/[^>]+>([^<]+)<' + _CATEGORIES_REGEX = r'(?s)]*>Categories:(.+?)

</div>'
+
+
+class TNAFlixIE(TNAEMPFlixBaseIE):
     _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P<display_id>[^/]+)/video(?P<id>\d+)'
 
-    _TITLE_REGEX = r'<title>(.+?) - TNAFlix Porn Videos</title>'
-    _DESCRIPTION_REGEX = r'<h3 itemprop="description">([^<]+)</h3>
    ' - _UPLOADER_REGEX = r'(?s)]+class="infoTitle"[^>]*>Uploaded By:(.+?)(.+?) - (?:TNAFlix Porn Videos|TNAFlix\.com)' _TESTS = [{ # anonymous uploader, no categories 'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878', - 'md5': 'ecf3498417d09216374fc5907f9c6ec0', + 'md5': '7e569419fe6d69543d01e6be22f5f7c4', 'info_dict': { 'id': '553878', 'display_id': 'Carmella-Decesare-striptease', 'ext': 'mp4', 'title': 'Carmella Decesare - striptease', - 'thumbnail': 're:https?://.*\.jpg$', + 'thumbnail': r're:https?://.*\.jpg$', 'duration': 91, 'age_limit': 18, - 'uploader': 'Anonymous', - 'categories': [], + 'categories': ['Porn Stars'], } }, { # non-anonymous uploader, categories @@ -184,11 +238,11 @@ 'ext': 'mp4', 'title': 'Educational xxx video', 'description': 'md5:b4fab8f88a8621c8fabd361a173fe5b8', - 'thumbnail': 're:https?://.*\.jpg$', + 'thumbnail': r're:https?://.*\.jpg$', 'duration': 164, 'age_limit': 18, 'uploader': 'bobwhite39', - 'categories': ['Amateur Porn', 'Squirting Videos', 'Teen Girls 18+'], + 'categories': list, } }, { 'url': 'https://www.tnaflix.com/amateur-porn/bunzHD-Ms.Donk/video358632', @@ -196,21 +250,22 @@ }] -class EMPFlixIE(TNAFlixNetworkBaseIE): - _VALID_URL = r'https?://(?:www\.)?empflix\.com/videos/(?P.+?)-(?P[0-9]+)\.html' +class EMPFlixIE(TNAEMPFlixBaseIE): + _VALID_URL = r'https?://(?:www\.)?empflix\.com/(?:videos/(?P.+?)-|[^/]+/(?P[^/]+)/video)(?P[0-9]+)' - _UPLOADER_REGEX = r']+class="infoTitle"[^>]*>Uploaded By:(.+?)' + _HOST = 'emp' + _VKEY_SUFFIX = '-1' _TESTS = [{ 'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html', - 'md5': 'b1bc15b6412d33902d6e5952035fcabc', + 'md5': 'bc30d48b91a7179448a0bda465114676', 'info_dict': { 'id': '33051', 'display_id': 'Amateur-Finger-Fuck', 'ext': 'mp4', 'title': 'Amateur Finger Fuck', 'description': 'Amateur solo finger fucking.', - 'thumbnail': 're:https?://.*\.jpg$', + 'thumbnail': r're:https?://.*\.jpg$', 'duration': 83, 'age_limit': 18, 'uploader': 'cwbike', @@ -219,6 +274,9 @@ }, { 'url': 'http://www.empflix.com/videos/[AROMA][ARMD-718]-Aoi-Yoshino-Sawa-25826.html', 'only_matching': True, + }, { + 'url': 'https://www.empflix.com/amateur-porn/Amateur-Finger-Fuck/video33051', + 'only_matching': True, }] @@ -240,7 +298,7 @@ 'ext': 'mp4', 'title': 'Experienced MILF Amazing Handjob', 'description': 'Experienced MILF giving an Amazing Handjob', - 'thumbnail': 're:https?://.*\.jpg$', + 'thumbnail': r're:https?://.*\.jpg$', 'age_limit': 18, 'uploader': 'darvinfred06', 'view_count': int, @@ -258,7 +316,7 @@ 'ext': 'flv', 'title': 'Jeune Couple Russe', 'description': 'Amateur', - 'thumbnail': 're:https?://.*\.jpg$', + 'thumbnail': r're:https?://.*\.jpg$', 'age_limit': 18, 'uploader': 'whiskeyjar', 'view_count': int, diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/toggle.py youtube-dl-2019.05.20/youtube_dl/extractor/toggle.py --- youtube-dl-2016.02.22/youtube_dl/extractor/toggle.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/toggle.py 2019-06-07 18:49:07.000000000 +0000 @@ -17,7 +17,7 @@ class ToggleIE(InfoExtractor): IE_NAME = 'toggle' - _VALID_URL = r'https?://video\.toggle\.sg/(?:en|zh)/(?:series|clips|movies)/(?:[^/]+/)+(?P[0-9]+)' + _VALID_URL = r'https?://video\.toggle\.sg/(?:en|zh)/(?:[^/]+/){2,}(?P[0-9]+)' _TESTS = [{ 'url': 'http://video.toggle.sg/en/series/lion-moms-tif/trailers/lion-moms-premier/343115', 'info_dict': { @@ -73,6 +73,12 @@ }, { 'url': 'http://video.toggle.sg/en/movies/seven-days/321936', 'only_matching': True, + }, { 
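
The toggle.py hunk above widens _VALID_URL from an allow-list of section names (series|clips|movies) to any run of two or more path segments, and pins the new behaviour with the only_matching tests that continue just below. A minimal spot-check of this kind of relaxation, assuming only the standard-library re module, with both patterns transcribed from the hunk and sample URLs taken from its tests:

    import re

    OLD = r'https?://video\.toggle\.sg/(?:en|zh)/(?:series|clips|movies)/(?:[^/]+/)+(?P<id>[0-9]+)'
    NEW = r'https?://video\.toggle\.sg/(?:en|zh)/(?:[^/]+/){2,}(?P<id>[0-9]+)'

    for u in ('http://video.toggle.sg/en/movies/seven-days/321936',
              'http://video.toggle.sg/en/channels/eleven-plus/401585'):
        # the relaxed pattern must keep matching everything the old one did,
        # while also accepting channel/tv-show style paths
        print(u, bool(re.match(OLD, u)), bool(re.match(NEW, u)))
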
+ 'url': 'https://video.toggle.sg/en/tv-show/news/may-2017-cna-singapore-tonight/fri-19-may-2017/512456', + 'only_matching': True, + }, { + 'url': 'http://video.toggle.sg/en/channels/eleven-plus/401585', + 'only_matching': True, }] _FORMAT_PREFERENCES = { @@ -126,7 +132,7 @@ formats = [] for video_file in info.get('Files', []): video_url, vid_format = video_file.get('URL'), video_file.get('Format') - if not video_url or not vid_format: + if not video_url or video_url == 'NA' or not vid_format: continue ext = determine_ext(video_url) vid_format = vid_format.replace(' ', '') @@ -137,6 +143,18 @@ note='Downloading %s m3u8 information' % vid_format, errnote='Failed to download %s m3u8 information' % vid_format, fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + video_url, video_id, mpd_id=vid_format, + note='Downloading %s MPD manifest' % vid_format, + errnote='Failed to download %s MPD manifest' % vid_format, + fatal=False)) + elif ext == 'ism': + formats.extend(self._extract_ism_formats( + video_url, video_id, ism_id=vid_format, + note='Downloading %s ISM manifest' % vid_format, + errnote='Failed to download %s ISM manifest' % vid_format, + fatal=False)) elif ext in ('mp4', 'wvm'): # wvm are drm-protected files formats.append({ diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/tonline.py youtube-dl-2019.05.20/youtube_dl/extractor/tonline.py --- youtube-dl-2016.02.22/youtube_dl/extractor/tonline.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/tonline.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,59 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import int_or_none + + +class TOnlineIE(InfoExtractor): + IE_NAME = 't-online.de' + _VALID_URL = r'https?://(?:www\.)?t-online\.de/tv/(?:[^/]+/)*id_(?P\d+)' + _TEST = { + 'url': 'http://www.t-online.de/tv/sport/fussball/id_79166266/drittes-remis-zidane-es-muss-etwas-passieren-.html', + 'md5': '7d94dbdde5f9d77c5accc73c39632c29', + 'info_dict': { + 'id': '79166266', + 'ext': 'mp4', + 'title': 'Drittes Remis! Zidane: "Es muss etwas passieren"', + 'description': 'Es läuft nicht rund bei Real Madrid. 
Das 1:1 gegen den SD Eibar war das dritte Unentschieden in Folge in der Liga.', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + video_data = self._download_json( + 'http://www.t-online.de/tv/id_%s/tid_json_video' % video_id, video_id) + title = video_data['subtitle'] + + formats = [] + for asset in video_data.get('assets', []): + asset_source = asset.get('source') or asset.get('source2') + if not asset_source: + continue + formats_id = [] + for field_key in ('type', 'profile'): + field_value = asset.get(field_key) + if field_value: + formats_id.append(field_value) + formats.append({ + 'format_id': '-'.join(formats_id), + 'url': asset_source, + }) + + thumbnails = [] + for image in video_data.get('images', []): + image_source = image.get('source') + if not image_source: + continue + thumbnails.append({ + 'url': image_source, + }) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('description'), + 'duration': int_or_none(video_data.get('duration')), + 'thumbnails': thumbnails, + 'formats': formats, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/toongoggles.py youtube-dl-2019.05.20/youtube_dl/extractor/toongoggles.py --- youtube-dl-2016.02.22/youtube_dl/extractor/toongoggles.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/toongoggles.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,81 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_duration, +) + + +class ToonGogglesIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?toongoggles\.com/shows/(?P\d+)(?:/[^/]+/episodes/(?P\d+))?' + _TESTS = [{ + 'url': 'http://www.toongoggles.com/shows/217143/bernard-season-2/episodes/217147/football', + 'md5': '18289fc2b951eff6b953a9d8f01e6831', + 'info_dict': { + 'id': '217147', + 'ext': 'mp4', + 'title': 'Football', + 'uploader_id': '1', + 'description': 'Bernard decides to play football in order to be better than Lloyd and tries to beat him no matter how, he even cheats.', + 'upload_date': '20160718', + 'timestamp': 1468879330, + } + }, { + 'url': 'http://www.toongoggles.com/shows/227759/om-nom-stories-around-the-world', + 'info_dict': { + 'id': '227759', + 'title': 'Om Nom Stories Around The World', + }, + 'playlist_mincount': 11, + }] + + def _call_api(self, action, page_id, query): + query.update({ + 'for_ng': 1, + 'for_web': 1, + 'show_meta': 1, + 'version': 7.0, + }) + return self._download_json('http://api.toongoggles.com/' + action, page_id, query=query) + + def _parse_episode_data(self, episode_data): + title = episode_data['episode_name'] + + return { + '_type': 'url_transparent', + 'id': episode_data['episode_id'], + 'title': title, + 'url': 'kaltura:513551:' + episode_data['entry_id'], + 'thumbnail': episode_data.get('thumbnail_url'), + 'description': episode_data.get('description'), + 'duration': parse_duration(episode_data.get('hms')), + 'series': episode_data.get('show_name'), + 'season_number': int_or_none(episode_data.get('season_num')), + 'episode_id': episode_data.get('episode_id'), + 'episode': title, + 'episode_number': int_or_none(episode_data.get('episode_num')), + 'categories': episode_data.get('categories'), + 'ie_key': 'Kaltura', + } + + def _real_extract(self, url): + show_id, episode_id = re.match(self._VALID_URL, url).groups() + if episode_id: + episode_data = self._call_api('search', episode_id, { + 'filter': 'episode', + 'id': episode_id, + })['objects'][0] + 
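
The _parse_episode_data helper in the toongoggles.py file above is a compact example of the url_transparent pattern: the site's own API supplies the metadata, while resolving the actual media is delegated to the Kaltura extractor through a kaltura:<partner_id>:<entry_id> pseudo-URL (partner id 513551 in this extractor). A minimal sketch of that result shape, with made-up episode values:

    def kaltura_entry(episode):
        # with _type == 'url_transparent', fields set here override
        # whatever the delegated (Kaltura) extractor reports
        return {
            '_type': 'url_transparent',
            'ie_key': 'Kaltura',
            'url': 'kaltura:513551:' + episode['entry_id'],
            'id': episode['episode_id'],
            'title': episode['episode_name'],
        }

    print(kaltura_entry({'entry_id': '1_deadbeef',   # hypothetical entry id
                         'episode_id': '217147',
                         'episode_name': 'Football'}))
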
return self._parse_episode_data(episode_data) + else: + show_data = self._call_api('getepisodesbyshow', show_id, { + 'max': 1000000000, + 'showid': show_id, + }) + entries = [] + for episode_data in show_data.get('objects', []): + entries.append(self._parse_episode_data(episode_data)) + return self.playlist_result(entries, show_id, show_data.get('show_name')) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/toutv.py youtube-dl-2019.05.20/youtube_dl/extractor/toutv.py --- youtube-dl-2016.02.22/youtube_dl/extractor/toutv.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/toutv.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,74 +1,93 @@ # coding: utf-8 from __future__ import unicode_literals -import re +import json -from .common import InfoExtractor +from .radiocanada import RadioCanadaIE +from ..compat import compat_HTTPError from ..utils import ( ExtractorError, - unified_strdate, + int_or_none, + merge_dicts, ) -class TouTvIE(InfoExtractor): +class TouTvIE(RadioCanadaIE): + _NETRC_MACHINE = 'toutv' IE_NAME = 'tou.tv' - _VALID_URL = r'https?://www\.tou\.tv/(?P[a-zA-Z0-9_-]+(?:/(?PS[0-9]+E[0-9]+)))' + _VALID_URL = r'https?://ici\.tou\.tv/(?P[a-zA-Z0-9_-]+(?:/S[0-9]+[EC][0-9]+)?)' - _TEST = { - 'url': 'http://www.tou.tv/30-vies/S04E41', + _TESTS = [{ + 'url': 'http://ici.tou.tv/garfield-tout-court/S2015E17', 'info_dict': { - 'id': '30-vies_S04E41', + 'id': '122017', 'ext': 'mp4', - 'title': '30 vies Saison 4 / Épisode 41', - 'description': 'md5:da363002db82ccbe4dafeb9cab039b09', - 'age_limit': 8, - 'uploader': 'Groupe des Nouveaux Médias', - 'duration': 1296, - 'upload_date': '20131118', - 'thumbnail': 'http://static.tou.tv/medias/images/2013-11-18_19_00_00_30VIES_0341_01_L.jpeg', + 'title': 'Saison 2015 Épisode 17', + 'description': 'La photo de famille 2', + 'upload_date': '20100717', }, 'params': { - 'skip_download': True, # Requires rtmpdump + # m3u8 download + 'skip_download': True, }, - 'skip': 'Only available in Canada' - } + 'skip': '404 Not Found', + }, { + 'url': 'http://ici.tou.tv/hackers', + 'only_matching': True, + }, { + 'url': 'https://ici.tou.tv/l-age-adulte/S01C501', + 'only_matching': True, + }] + _CLIENT_KEY = '4dd36440-09d5-4468-8923-b6d91174ad36' + + def _real_initialize(self): + email, password = self._get_login_info() + if email is None: + return + try: + self._access_token = self._download_json( + 'https://services.radio-canada.ca/toutv/profiling/accounts/login', + None, 'Logging in', data=json.dumps({ + 'ClientId': self._CLIENT_KEY, + 'ClientSecret': '34026772-244b-49b6-8b06-317b30ac9a20', + 'Email': email, + 'Password': password, + 'Scope': 'id.write media-validation.read', + }).encode(), headers={ + 'Authorization': 'client-key ' + self._CLIENT_KEY, + 'Content-Type': 'application/json;charset=utf-8', + })['access_token'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + error = self._parse_json(e.cause.read().decode(), None)['Message'] + raise ExtractorError(error, expected=True) + raise + self._claims = self._call_api('validation/v2/getClaims')['claims'] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) - - mediaId = self._search_regex( - r'"idMedia":\s*"([^"]+)"', webpage, 'media ID') - - streams_url = 'http://release.theplatform.com/content.select?pid=' + mediaId - streams_doc = self._download_xml( - streams_url, video_id, note='Downloading stream list') - - video_url = 
next(n.text - for n in streams_doc.findall('.//choice/url') - if '//ad.doubleclick' not in n.text) - if video_url.endswith('/Unavailable.flv'): - raise ExtractorError( - 'Access to this video is blocked from outside of Canada', - expected=True) - - duration_str = self._html_search_meta( - 'video:duration', webpage, 'duration') - duration = int(duration_str) if duration_str else None - upload_date_str = self._html_search_meta( - 'video:release_date', webpage, 'upload date') - upload_date = unified_strdate(upload_date_str) if upload_date_str else None + path = self._match_id(url) + metadata = self._download_json( + 'https://services.radio-canada.ca/toutv/presentation/%s' % path, path, query={ + 'client_key': self._CLIENT_KEY, + 'device': 'web', + 'version': 4, + }) + # IsDrm does not necessarily mean the video is DRM protected (see + # https://github.com/ytdl-org/youtube-dl/issues/13994). + if metadata.get('IsDrm'): + self.report_warning('This video is probably DRM protected.', path) + video_id = metadata['IdMedia'] + details = metadata['Details'] - return { + return merge_dicts({ 'id': video_id, - 'title': self._og_search_title(webpage), - 'url': video_url, - 'description': self._og_search_description(webpage), - 'uploader': self._dc_search_uploader(webpage), - 'thumbnail': self._og_search_thumbnail(webpage), - 'age_limit': self._media_rating_search(webpage), - 'duration': duration, - 'upload_date': upload_date, - 'ext': 'mp4', - } + 'title': details.get('OriginalTitle'), + 'description': details.get('Description'), + 'thumbnail': details.get('ImageUrl'), + 'duration': int_or_none(details.get('LengthInSeconds')), + 'series': metadata.get('ProgramTitle'), + 'season_number': int_or_none(metadata.get('SeasonNumber')), + 'season': metadata.get('SeasonTitle'), + 'episode_number': int_or_none(metadata.get('EpisodeNumber')), + 'episode': metadata.get('EpisodeTitle'), + }, self._extract_info(metadata.get('AppCode', 'toutv'), video_id)) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/toypics.py youtube-dl-2019.05.20/youtube_dl/extractor/toypics.py --- youtube-dl-2016.02.22/youtube_dl/extractor/toypics.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/toypics.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,4 +1,4 @@ -# -*- coding:utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor @@ -6,42 +6,48 @@ class ToypicsIE(InfoExtractor): - IE_DESC = 'Toypics user profile' - _VALID_URL = r'https?://videos\.toypics\.net/view/(?P[0-9]+)/.*' + IE_DESC = 'Toypics video' + _VALID_URL = r'https?://videos\.toypics\.net/view/(?P[0-9]+)' _TEST = { 'url': 'http://videos.toypics.net/view/514/chancebulged,-2-1/', 'md5': '16e806ad6d6f58079d210fe30985e08b', 'info_dict': { 'id': '514', 'ext': 'mp4', - 'title': 'Chance-Bulge\'d, 2', + 'title': "Chance-Bulge'd, 2", 'age_limit': 18, 'uploader': 'kidsune', } } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - page = self._download_webpage(url, video_id) - video_url = self._html_search_regex( - r'src:\s+"(http://static[0-9]+\.toypics\.net/flvideo/[^"]+)"', page, 'video URL') - title = self._html_search_regex( - r'Toypics - ([^<]+)', page, 'title') - username = self._html_search_regex( - r'toypics.net/([^/"]+)" class="user-name">', page, 'username') + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + formats = self._parse_html5_media_entries( + url, webpage, video_id)[0]['formats'] + title = 
self._html_search_regex([ + r']+class=["\']view-video-title[^>]+>([^<]+)([^<]+) - Toypics', + ], webpage, 'title') + + uploader = self._html_search_regex( + r'More videos from ([^<]+)', webpage, 'uploader', + fatal=False) + return { 'id': video_id, - 'url': video_url, + 'formats': formats, 'title': title, - 'uploader': username, + 'uploader': uploader, 'age_limit': 18, } class ToypicsUserIE(InfoExtractor): IE_DESC = 'Toypics user profile' - _VALID_URL = r'http://videos\.toypics\.net/(?P[^/?]+)(?:$|[?#])' + _VALID_URL = r'https?://videos\.toypics\.net/(?!view)(?P[^/?#&]+)' _TEST = { 'url': 'http://videos.toypics.net/Mikey', 'info_dict': { @@ -51,8 +57,7 @@ } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - username = mobj.group('username') + username = self._match_id(url) profile_page = self._download_webpage( url, username, note='Retrieving profile page') @@ -71,7 +76,7 @@ note='Downloading page %d/%d' % (n, page_count)) urls.extend( re.findall( - r'

    \s+', + r']+class=["\']preview[^>]+>\s*]+href="(https?://videos\.toypics\.net/view/[^"]+)"', lpage)) return { diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/traileraddict.py youtube-dl-2019.05.20/youtube_dl/extractor/traileraddict.py --- youtube-dl-2016.02.22/youtube_dl/extractor/traileraddict.py 2016-02-14 22:26:17.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/traileraddict.py 2019-06-07 18:49:07.000000000 +0000 @@ -7,7 +7,7 @@ class TrailerAddictIE(InfoExtractor): _WORKING = False - _VALID_URL = r'(?:http://)?(?:www\.)?traileraddict\.com/(?:trailer|clip)/(?P.+?)/(?P.+)' + _VALID_URL = r'(?:https?://)?(?:www\.)?traileraddict\.com/(?:trailer|clip)/(?P.+?)/(?P.+)' _TEST = { 'url': 'http://www.traileraddict.com/trailer/prince-avalanche/trailer', 'md5': '41365557f3c8c397d091da510e73ceb4', diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/trollvids.py youtube-dl-2019.05.20/youtube_dl/extractor/trollvids.py --- youtube-dl-2016.02.22/youtube_dl/extractor/trollvids.py 2016-01-23 10:08:46.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/trollvids.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,36 +0,0 @@ -# encoding: utf-8 -from __future__ import unicode_literals - -import re - -from .nuevo import NuevoBaseIE - - -class TrollvidsIE(NuevoBaseIE): - _VALID_URL = r'http://(?:www\.)?trollvids\.com/video/(?P\d+)/(?P[^/?#&]+)' - IE_NAME = 'trollvids' - _TEST = { - 'url': 'http://trollvids.com/video/2349002/%E3%80%90MMD-R-18%E3%80%91%E3%82%AC%E3%83%BC%E3%83%AB%E3%83%95%E3%83%AC%E3%83%B3%E3%83%89-carrymeoff', - 'md5': '1d53866b2c514b23ed69e4352fdc9839', - 'info_dict': { - 'id': '2349002', - 'ext': 'mp4', - 'title': '【MMD R-18】ガールフレンド carry_me_off', - 'age_limit': 18, - 'duration': 216.78, - }, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') - - info = self._extract_nuevo( - 'http://trollvids.com/nuevo/player/config.php?v=%s' % video_id, - video_id) - info.update({ - 'display_id': display_id, - 'age_limit': 18 - }) - return info diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/trunews.py youtube-dl-2019.05.20/youtube_dl/extractor/trunews.py --- youtube-dl-2016.02.22/youtube_dl/extractor/trunews.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/trunews.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,75 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + dict_get, + float_or_none, + int_or_none, + unified_timestamp, + update_url_query, + url_or_none, +) + + +class TruNewsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?trunews\.com/stream/(?P[^/?#&]+)' + _TEST = { + 'url': 'https://www.trunews.com/stream/will-democrats-stage-a-circus-during-president-trump-s-state-of-the-union-speech', + 'md5': 'a19c024c3906ff954fac9b96ce66bb08', + 'info_dict': { + 'id': '5c5a21e65d3c196e1c0020cc', + 'display_id': 'will-democrats-stage-a-circus-during-president-trump-s-state-of-the-union-speech', + 'ext': 'mp4', + 'title': "Will Democrats Stage a Circus During President Trump's State of the Union Speech?", + 'description': 'md5:c583b72147cc92cf21f56a31aff7a670', + 'duration': 3685, + 'timestamp': 1549411440, + 'upload_date': '20190206', + }, + 'add_ie': ['Zype'], + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + video = self._download_json( + 'https://api.zype.com/videos', display_id, query={ + 'app_key': 
'PUVKp9WgGUb3-JUw6EqafLx8tFVP6VKZTWbUOR-HOm__g4fNDt1bCsm_LgYf_k9H', + 'per_page': 1, + 'active': 'true', + 'friendly_title': display_id, + })['response'][0] + + zype_id = video['_id'] + + thumbnails = [] + thumbnails_list = video.get('thumbnails') + if isinstance(thumbnails_list, list): + for thumbnail in thumbnails_list: + if not isinstance(thumbnail, dict): + continue + thumbnail_url = url_or_none(thumbnail.get('url')) + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + + return { + '_type': 'url_transparent', + 'url': update_url_query( + 'https://player.zype.com/embed/%s.js' % zype_id, + {'api_key': 'X5XnahkjCwJrT_l5zUqypnaLEObotyvtUKJWWlONxDoHVjP8vqxlArLV8llxMbyt'}), + 'ie_key': 'Zype', + 'id': zype_id, + 'display_id': display_id, + 'title': video.get('title'), + 'description': dict_get(video, ('description', 'ott_description', 'short_description')), + 'duration': int_or_none(video.get('duration')), + 'timestamp': unified_timestamp(video.get('published_at')), + 'average_rating': float_or_none(video.get('rating')), + 'view_count': int_or_none(video.get('request_count')), + 'thumbnails': thumbnails, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/trutube.py youtube-dl-2019.05.20/youtube_dl/extractor/trutube.py --- youtube-dl-2016.02.22/youtube_dl/extractor/trutube.py 2016-01-23 10:08:46.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/trutube.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,26 +0,0 @@ -from __future__ import unicode_literals - -from .nuevo import NuevoBaseIE - - -class TruTubeIE(NuevoBaseIE): - _VALID_URL = r'https?://(?:www\.)?trutube\.tv/(?:video/|nuevo/player/embed\.php\?v=)(?P\d+)' - _TESTS = [{ - 'url': 'http://trutube.tv/video/14880/Ramses-II-Proven-To-Be-A-Red-Headed-Caucasoid-', - 'md5': 'c5b6e301b0a2040b074746cbeaa26ca1', - 'info_dict': { - 'id': '14880', - 'ext': 'flv', - 'title': 'Ramses II - Proven To Be A Red Headed Caucasoid', - 'thumbnail': 're:^http:.*\.jpg$', - } - }, { - 'url': 'https://trutube.tv/nuevo/player/embed.php?v=14880', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - return self._extract_nuevo( - 'https://trutube.tv/nuevo/player/config.php?v=%s' % video_id, - video_id) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/trutv.py youtube-dl-2019.05.20/youtube_dl/extractor/trutv.py --- youtube-dl-2016.02.22/youtube_dl/extractor/trutv.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/trutv.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,75 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .turner import TurnerBaseIE +from ..utils import ( + int_or_none, + parse_iso8601, +) + + +class TruTVIE(TurnerBaseIE): + _VALID_URL = r'https?://(?:www\.)?trutv\.com/(?:shows|full-episodes)/(?P[0-9A-Za-z-]+)/(?:videos/(?P[0-9A-Za-z-]+)|(?P\d+))' + _TEST = { + 'url': 'https://www.trutv.com/shows/the-carbonaro-effect/videos/sunlight-activated-flower.html', + 'info_dict': { + 'id': 'f16c03beec1e84cd7d1a51f11d8fcc29124cc7f1', + 'ext': 'mp4', + 'title': 'Sunlight-Activated Flower', + 'description': "A customer is stunned when he sees Michael's sunlight-activated flower.", + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + series_slug, clip_slug, video_id = re.match(self._VALID_URL, url).groups() + + if video_id: + path = 'episode' + 
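
trutv.py is choosing its API endpoint at this point: a numeric id in the URL means a full episode, a slug means a clip (the else branch continues just below). The same dispatch isolated as a sketch, with the _VALID_URL pattern copied from above and the group names written out:

    import re

    VALID_URL = (r'https?://(?:www\.)?trutv\.com/(?:shows|full-episodes)/'
                 r'(?P<series_slug>[0-9A-Za-z-]+)/'
                 r'(?:videos/(?P<clip_slug>[0-9A-Za-z-]+)|(?P<id>\d+))')

    def api_url(url):
        series_slug, clip_slug, video_id = re.match(VALID_URL, url).groups()
        if video_id:
            path, display_id = 'episode', video_id
        else:
            path, display_id = 'series/clip', clip_slug
        return 'https://api.trutv.com/v2/web/%s/%s/%s' % (
            path, series_slug, display_id)

    print(api_url('https://www.trutv.com/shows/the-carbonaro-effect/videos/sunlight-activated-flower.html'))
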
display_id = video_id + else: + path = 'series/clip' + display_id = clip_slug + + data = self._download_json( + 'https://api.trutv.com/v2/web/%s/%s/%s' % (path, series_slug, display_id), + display_id) + video_data = data['episode'] if video_id else data['info'] + media_id = video_data['mediaId'] + title = video_data['title'].strip() + + info = self._extract_ngtv_info( + media_id, {}, { + 'url': url, + 'site_name': 'truTV', + 'auth_required': video_data.get('isAuthRequired'), + }) + + thumbnails = [] + for image in video_data.get('images', []): + image_url = image.get('srcUrl') + if not image_url: + continue + thumbnails.append({ + 'url': image_url, + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + }) + + info.update({ + 'id': media_id, + 'display_id': display_id, + 'title': title, + 'description': video_data.get('description'), + 'thumbnails': thumbnails, + 'timestamp': parse_iso8601(video_data.get('publicationDate')), + 'series': video_data.get('showTitle'), + 'season_number': int_or_none(video_data.get('seasonNum')), + 'episode_number': int_or_none(video_data.get('episodeNum')), + }) + return info diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/tube8.py youtube-dl-2019.05.20/youtube_dl/extractor/tube8.py --- youtube-dl-2016.02.22/youtube_dl/extractor/tube8.py 2016-01-23 10:08:46.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/tube8.py 2019-06-07 18:49:07.000000000 +0000 @@ -2,17 +2,14 @@ import re -from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( int_or_none, - sanitized_Request, str_to_int, ) -from ..aes import aes_decrypt_text +from .keezmovies import KeezMoviesIE -class Tube8IE(InfoExtractor): +class Tube8IE(KeezMoviesIE): _VALID_URL = r'https?://(?:www\.)?tube8\.com/(?:[^/]+/)+(?P[^/]+)/(?P\d+)' _TESTS = [{ 'url': 'http://www.tube8.com/teen/kasia-music-video/229795/', @@ -26,78 +23,64 @@ 'title': 'Kasia music video', 'age_limit': 18, 'duration': 230, - } + 'categories': ['Teen'], + 'tags': ['dancing'], + }, }, { 'url': 'http://www.tube8.com/shemale/teen/blonde-cd-gets-kidnapped-by-two-blacks-and-punished-for-being-a-slutty-girl/19569151/', 'only_matching': True, }] + @staticmethod + def _extract_urls(webpage): + return re.findall( + r']+\bsrc=["\']((?:https?:)?//(?:www\.)?tube8\.com/embed/(?:[^/]+/)+\d+)', + webpage) + def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') - - req = sanitized_Request(url) - req.add_header('Cookie', 'age_verified=1') - webpage = self._download_webpage(req, display_id) - - flashvars = self._parse_json( - self._search_regex( - r'flashvars\s*=\s*({.+?});\r?\n', webpage, 'flashvars'), - video_id) - - formats = [] - for key, video_url in flashvars.items(): - if not isinstance(video_url, compat_str) or not video_url.startswith('http'): - continue - height = self._search_regex( - r'quality_(\d+)[pP]', key, 'height', default=None) - if not height: - continue - if flashvars.get('encrypted') is True: - video_url = aes_decrypt_text( - video_url, flashvars['video_title'], 32).decode('utf-8') - formats.append({ - 'url': video_url, - 'format_id': '%sp' % height, - 'height': int(height), - }) - self._sort_formats(formats) + webpage, info = self._extract_info(url) - thumbnail = flashvars.get('image_url') + if not info['title']: + info['title'] = self._html_search_regex( + r'videoTitle\s*=\s*"([^"]+)', webpage, 'title') - title = self._html_search_regex( - 
r'videoTitle\s*=\s*"([^"]+)', webpage, 'title') description = self._html_search_regex( - r'>Description:\s*(.+?)\s*<', webpage, 'description', fatal=False) + r'(?s)Description:\s*

    <dd>(.+?)</dd>
    ', webpage, 'description', fatal=False) uploader = self._html_search_regex( r'\s*(.+?)\s*<', webpage, 'uploader', fatal=False) - duration = int_or_none(flashvars.get('video_duration')) like_count = int_or_none(self._search_regex( r'rupVar\s*=\s*"(\d+)"', webpage, 'like count', fatal=False)) dislike_count = int_or_none(self._search_regex( r'rdownVar\s*=\s*"(\d+)"', webpage, 'dislike count', fatal=False)) view_count = str_to_int(self._search_regex( - r'Views: ([\d,\.]+)\s*', + r'Views:\s*\s*
    ([\d,\.]+)', webpage, 'view count', fatal=False)) comment_count = str_to_int(self._search_regex( r'(\d+)', webpage, 'comment count', fatal=False)) - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, + category = self._search_regex( + r'Category:\s*\s*
    \s*<a[^>]+href=[^>]+>([^<]+)',
+            webpage, 'category', fatal=False)
+        categories = [category] if category else None
+
+        tags_str = self._search_regex(
+            r'(?s)Tags:\s*</dt>\s*<dd>
    (.+?)]+href=[^>]+>([^<]+)', tags_str)] if tags_str else None + + info.update({ 'description': description, - 'thumbnail': thumbnail, 'uploader': uploader, - 'duration': duration, 'view_count': view_count, 'like_count': like_count, 'dislike_count': dislike_count, 'comment_count': comment_count, - 'age_limit': 18, - 'formats': formats, - } + 'categories': categories, + 'tags': tags, + }) + + return info diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/tubitv.py youtube-dl-2019.05.20/youtube_dl/extractor/tubitv.py --- youtube-dl-2016.02.22/youtube_dl/extractor/tubitv.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/tubitv.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,39 +1,42 @@ # coding: utf-8 from __future__ import unicode_literals -import codecs import re from .common import InfoExtractor -from ..compat import compat_urllib_parse from ..utils import ( ExtractorError, int_or_none, sanitized_Request, + urlencode_postdata, ) class TubiTvIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tubitv\.com/video\?id=(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?tubitv\.com/(?:video|movies|tv-shows)/(?P[0-9]+)' _LOGIN_URL = 'http://tubitv.com/login' _NETRC_MACHINE = 'tubitv' - _TEST = { - 'url': 'http://tubitv.com/video?id=54411&title=The_Kitchen_Musical_-_EP01', + _GEO_COUNTRIES = ['US'] + _TESTS = [{ + 'url': 'http://tubitv.com/video/283829/the_comedian_at_the_friday', + 'md5': '43ac06be9326f41912dc64ccf7a80320', 'info_dict': { - 'id': '54411', + 'id': '283829', 'ext': 'mp4', - 'title': 'The Kitchen Musical - EP01', - 'thumbnail': 're:^https?://.*\.png$', - 'description': 'md5:37532716166069b353e8866e71fefae7', - 'duration': 2407, + 'title': 'The Comedian at The Friday', + 'description': 'A stand up comedian is forced to look at the decisions in his life while on a one week trip to the west coast.', + 'uploader_id': 'bc168bee0d18dd1cb3b86c68706ab434', }, - 'params': { - 'skip_download': 'HLS download', - }, - } + }, { + 'url': 'http://tubitv.com/tv-shows/321886/s01_e01_on_nom_stories', + 'only_matching': True, + }, { + 'url': 'http://tubitv.com/movies/383676/tracker', + 'only_matching': True, + }] def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return self.report_login() @@ -41,7 +44,7 @@ 'username': username, 'password': password, } - payload = compat_urllib_parse.urlencode(form_data).encode('utf-8') + payload = urlencode_postdata(form_data) request = sanitized_Request(self._LOGIN_URL, payload) request.add_header('Content-Type', 'application/x-www-form-urlencoded') login_page = self._download_webpage( @@ -55,26 +58,39 @@ def _real_extract(self, url): video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - if re.search(r"<(?:DIV|div) class='login-required-screen'>", webpage): - self.raise_login_required('This video requires login') - - title = self._og_search_title(webpage) - description = self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) - duration = int_or_none(self._html_search_meta( - 'video:duration', webpage, 'duration')) - - apu = self._search_regex(r"apu='([^']+)'", webpage, 'apu') - m3u8_url = codecs.decode(apu, 'rot_13')[::-1] - formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') + video_data = self._download_json( + 'http://tubitv.com/oz/videos/%s/content' % video_id, video_id) + title = video_data['title'] + + formats = self._extract_m3u8_formats( + 
self._proto_relative_url(video_data['url']), + video_id, 'mp4', 'm3u8_native') + self._sort_formats(formats) + + thumbnails = [] + for thumbnail_url in video_data.get('thumbnails', []): + if not thumbnail_url: + continue + thumbnails.append({ + 'url': self._proto_relative_url(thumbnail_url), + }) + + subtitles = {} + for sub in video_data.get('subtitles', []): + sub_url = sub.get('url') + if not sub_url: + continue + subtitles.setdefault(sub.get('lang', 'English'), []).append({ + 'url': self._proto_relative_url(sub_url), + }) return { 'id': video_id, 'title': title, 'formats': formats, - 'thumbnail': thumbnail, - 'description': description, - 'duration': duration, + 'subtitles': subtitles, + 'thumbnails': thumbnails, + 'description': video_data.get('description'), + 'duration': int_or_none(video_data.get('duration')), + 'uploader_id': video_data.get('publisher_id'), } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/tudou.py youtube-dl-2019.05.20/youtube_dl/extractor/tudou.py --- youtube-dl-2016.02.22/youtube_dl/extractor/tudou.py 2016-02-14 22:26:17.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/tudou.py 2019-06-07 18:49:07.000000000 +0000 @@ -3,109 +3,6 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - int_or_none, - float_or_none, - unescapeHTML, -) - - -class TudouIE(InfoExtractor): - IE_NAME = 'tudou' - _VALID_URL = r'https?://(?:www\.)?tudou\.com/(?:(?:programs|wlplay)/view|(?:listplay|albumplay)/[\w-]{11})/(?P[\w-]{11})' - _TESTS = [{ - 'url': 'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html', - 'md5': '140a49ed444bd22f93330985d8475fcb', - 'info_dict': { - 'id': '159448201', - 'ext': 'f4v', - 'title': '卡马乔国足开大脚长传冲吊集锦', - 'thumbnail': 're:^https?://.*\.jpg$', - 'timestamp': 1372113489000, - 'description': '卡马乔卡家军,开大脚先进战术不完全集锦!', - 'duration': 289.04, - 'view_count': int, - 'filesize': int, - } - }, { - 'url': 'http://www.tudou.com/programs/view/ajX3gyhL0pc/', - 'info_dict': { - 'id': '117049447', - 'ext': 'f4v', - 'title': 'La Sylphide-Bolshoi-Ekaterina Krysanova & Vyacheslav Lopatin 2012', - 'thumbnail': 're:^https?://.*\.jpg$', - 'timestamp': 1349207518000, - 'description': 'md5:294612423894260f2dcd5c6c04fe248b', - 'duration': 5478.33, - 'view_count': int, - 'filesize': int, - } - }] - - _PLAYER_URL = 'http://js.tudouui.com/bin/lingtong/PortalPlayer_177.swf' - - def _url_for_id(self, video_id, quality=None): - info_url = 'http://v2.tudou.com/f?id=' + compat_str(video_id) - if quality: - info_url += '&hd' + quality - xml_data = self._download_xml(info_url, video_id, 'Opening the info XML page') - final_url = xml_data.text - return final_url - - def _real_extract(self, url): - video_id = self._match_id(url) - item_data = self._download_json( - 'http://www.tudou.com/tvp/getItemInfo.action?ic=%s' % video_id, video_id) - - youku_vcode = item_data.get('vcode') - if youku_vcode: - return self.url_result('youku:' + youku_vcode, ie='Youku') - - title = unescapeHTML(item_data['kw']) - description = item_data.get('desc') - thumbnail_url = item_data.get('pic') - view_count = int_or_none(item_data.get('playTimes')) - timestamp = int_or_none(item_data.get('pt')) - - segments = self._parse_json(item_data['itemSegs'], video_id) - # It looks like the keys are the arguments that have to be passed as - # the hd field in the request url, we pick the higher - # Also, filter non-number qualities (see issue #3643). 
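
The deleted tudou.py lines just below this comment picked the playback quality by taking the highest all-digit key of itemSegs, skipping non-numeric keys such as subtitle entries (issue #3643). An equivalent one-liner, with an illustrative segments dict:

    segments = {'2': ['...'], '3': ['...'], 'subtitle': ['...']}  # made up
    quality = max((k for k in segments if k.isdigit()), key=int)
    assert quality == '3'
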
- quality = sorted(filter(lambda k: k.isdigit(), segments.keys()), - key=lambda k: int(k))[-1] - parts = segments[quality] - result = [] - len_parts = len(parts) - if len_parts > 1: - self.to_screen('%s: found %s parts' % (video_id, len_parts)) - for part in parts: - part_id = part['k'] - final_url = self._url_for_id(part_id, quality) - ext = (final_url.split('?')[0]).split('.')[-1] - part_info = { - 'id': '%s' % part_id, - 'url': final_url, - 'ext': ext, - 'title': title, - 'thumbnail': thumbnail_url, - 'description': description, - 'view_count': view_count, - 'timestamp': timestamp, - 'duration': float_or_none(part.get('seconds'), 1000), - 'filesize': int_or_none(part.get('size')), - 'http_headers': { - 'Referer': self._PLAYER_URL, - }, - } - result.append(part_info) - - return { - '_type': 'multi_video', - 'entries': result, - 'id': video_id, - 'title': title, - } class TudouPlaylistIE(InfoExtractor): diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/tumblr.py youtube-dl-2019.05.20/youtube_dl/extractor/tumblr.py --- youtube-dl-2016.02.22/youtube_dl/extractor/tumblr.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/tumblr.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,14 +1,21 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import int_or_none +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + urlencode_postdata +) class TumblrIE(InfoExtractor): - _VALID_URL = r'http://(?P.*?)\.tumblr\.com/(?:post|video)/(?P[0-9]+)(?:$|[/?#])' + _VALID_URL = r'https?://(?P[^/?#&]+)\.tumblr\.com/(?:post|video)/(?P[0-9]+)(?:$|[/?#])' + _NETRC_MACHINE = 'tumblr' + _LOGIN_URL = 'https://www.tumblr.com/login' _TESTS = [{ 'url': 'http://tatianamaslanydaily.tumblr.com/post/54196191430/orphan-black-dvd-extra-behind-the-scenes', 'md5': '479bb068e5b16462f5176a6828829767', @@ -17,7 +24,7 @@ 'ext': 'mp4', 'title': 'tatiana maslany news, Orphan Black || DVD extra - behind the scenes ↳...', 'description': 'md5:37db8211e40b50c7c44e95da14f630b7', - 'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', } }, { 'url': 'http://5sostrum.tumblr.com/post/90208453769/yall-forgetting-the-greatest-keek-of-them-all', @@ -27,7 +34,7 @@ 'ext': 'mp4', 'title': '5SOS STRUM ;]', 'description': 'md5:dba62ac8639482759c8eb10ce474586a', - 'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', } }, { 'url': 'http://hdvideotest.tumblr.com/post/130323439814/test-description-for-my-hd-video', @@ -37,7 +44,7 @@ 'ext': 'mp4', 'title': 'HD Video Testing \u2014 Test description for my HD video', 'description': 'md5:97cc3ab5fcd27ee4af6356701541319c', - 'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', }, 'params': { 'format': 'hd', @@ -67,8 +74,75 @@ 'uploader_id': 'user32021558', }, 'add_ie': ['Vimeo'], + }, { + 'url': 'http://sutiblr.tumblr.com/post/139638707273', + 'md5': '2dd184b3669e049ba40563a7d423f95c', + 'info_dict': { + 'id': 'ir7qBEIKqvq', + 'ext': 'mp4', + 'title': 'Vine by sutiblr', + 'alt_title': 'Vine by sutiblr', + 'uploader': 'sutiblr', + 'uploader_id': '1198993975374495744', + 'upload_date': '20160220', + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + }, + 'add_ie': ['Vine'], + }, { + 'url': 'http://vitasidorkina.tumblr.com/post/134652425014/joskriver-victoriassecret-invisibility-or', + 'md5': '01c12ceb82cbf6b2fe0703aa56b3ad72', + 'info_dict': { + 'id': '-7LnUPGlSo', + 'ext': 'mp4', + 
'title': 'Video by victoriassecret', + 'description': 'Invisibility or flight…which superpower would YOU choose? #VSFashionShow #ThisOrThat', + 'uploader_id': 'victoriassecret', + 'thumbnail': r're:^https?://.*\.jpg' + }, + 'add_ie': ['Instagram'], }] + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login page') + + login_form = self._hidden_inputs(login_page) + login_form.update({ + 'user[email]': username, + 'user[password]': password + }) + + response, urlh = self._download_webpage_handle( + self._LOGIN_URL, None, 'Logging in', + data=urlencode_postdata(login_form), headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': self._LOGIN_URL, + }) + + # Successful login + if '/dashboard' in urlh.geturl(): + return + + login_errors = self._parse_json( + self._search_regex( + r'RegistrationForm\.errors\s*=\s*(\[.+?\])\s*;', response, + 'login errors', default='[]'), + None, fatal=False) + if login_errors: + raise ExtractorError( + 'Unable to login: %s' % login_errors[0], expected=True) + + self.report_warning('Login has probably failed') + def _real_extract(self, url): m_url = re.match(self._VALID_URL, url) video_id = m_url.group('id') @@ -77,11 +151,19 @@ url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id) webpage, urlh = self._download_webpage_handle(url, video_id) + redirect_url = compat_str(urlh.geturl()) + if 'tumblr.com/safe-mode' in redirect_url or redirect_url.startswith('/safe-mode'): + raise ExtractorError( + 'This Tumblr may contain sensitive media. ' + 'Disable safe mode in your account settings ' + 'at https://www.tumblr.com/settings/account#safe_mode', + expected=True) + iframe_url = self._search_regex( r'src=\'(https?://www\.tumblr\.com/video/[^\']+)\'', webpage, 'iframe url', default=None) if iframe_url is None: - return self.url_result(urlh.geturl(), 'Generic') + return self.url_result(redirect_url, 'Generic') iframe = self._download_webpage(iframe_url, video_id, 'Downloading iframe page') diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/tunein.py youtube-dl-2019.05.20/youtube_dl/extractor/tunein.py --- youtube-dl-2016.02.22/youtube_dl/extractor/tunein.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/tunein.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,7 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals -import json +import re from .common import InfoExtractor from ..utils import ExtractorError @@ -11,6 +11,12 @@ class TuneInBaseIE(InfoExtractor): _API_BASE_URL = 'http://tunein.com/tuner/tune/' + @staticmethod + def _extract_urls(webpage): + return re.findall( + r']+src=["\'](?P(?:https?://)?tunein\.com/embed/player/[pst]\d+)', + webpage) + def _real_extract(self, url): content_id = self._match_id(url) @@ -27,10 +33,9 @@ if not streams_url.startswith('http://'): streams_url = compat_urlparse.urljoin(url, streams_url) - stream_data = self._download_webpage( - streams_url, content_id, note='Downloading stream data') - streams = json.loads(self._search_regex( - r'\((.*)\);', stream_data, 'stream info'))['Streams'] + streams = self._download_json( + streams_url, content_id, note='Downloading stream data', + transform_source=lambda s: re.sub(r'^\s*\((.*)\);\s*$', r'\1', s))['Streams'] is_live = None formats = [] @@ -57,7 +62,7 @@ return { 'id': content_id, - 'title': title, + 'title': self._live_title(title) if is_live else 
title, 'formats': formats, 'thumbnail': thumbnail, 'location': location, @@ -70,82 +75,83 @@ _VALID_URL = r'https?://(?:www\.)?tunein\.com/station/.*?audioClipId\=(?P\d+)' _API_URL_QUERY = '?tuneType=AudioClip&audioclipId=%s' - _TESTS = [ - { - 'url': 'http://tunein.com/station/?stationId=246119&audioClipId=816', - 'md5': '99f00d772db70efc804385c6b47f4e77', - 'info_dict': { - 'id': '816', - 'title': '32m', - 'ext': 'mp3', - }, + _TESTS = [{ + 'url': 'http://tunein.com/station/?stationId=246119&audioClipId=816', + 'md5': '99f00d772db70efc804385c6b47f4e77', + 'info_dict': { + 'id': '816', + 'title': '32m', + 'ext': 'mp3', }, - ] + }] class TuneInStationIE(TuneInBaseIE): IE_NAME = 'tunein:station' - _VALID_URL = r'https?://(?:www\.)?tunein\.com/(?:radio/.*?-s|station/.*?StationId\=)(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?tunein\.com/(?:radio/.*?-s|station/.*?StationId=|embed/player/s)(?P\d+)' _API_URL_QUERY = '?tuneType=Station&stationId=%s' @classmethod def suitable(cls, url): return False if TuneInClipIE.suitable(url) else super(TuneInStationIE, cls).suitable(url) - _TESTS = [ - { - 'url': 'http://tunein.com/radio/Jazz24-885-s34682/', - 'info_dict': { - 'id': '34682', - 'title': 'Jazz 24 on 88.5 Jazz24 - KPLU-HD2', - 'ext': 'mp3', - 'location': 'Tacoma, WA', - }, - 'params': { - 'skip_download': True, # live stream - }, + _TESTS = [{ + 'url': 'http://tunein.com/radio/Jazz24-885-s34682/', + 'info_dict': { + 'id': '34682', + 'title': 'Jazz 24 on 88.5 Jazz24 - KPLU-HD2', + 'ext': 'mp3', + 'location': 'Tacoma, WA', + }, + 'params': { + 'skip_download': True, # live stream }, - ] + }, { + 'url': 'http://tunein.com/embed/player/s6404/', + 'only_matching': True, + }] class TuneInProgramIE(TuneInBaseIE): IE_NAME = 'tunein:program' - _VALID_URL = r'https?://(?:www\.)?tunein\.com/(?:radio/.*?-p|program/.*?ProgramId\=)(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?tunein\.com/(?:radio/.*?-p|program/.*?ProgramId=|embed/player/p)(?P\d+)' _API_URL_QUERY = '?tuneType=Program&programId=%s' - _TESTS = [ - { - 'url': 'http://tunein.com/radio/Jazz-24-p2506/', - 'info_dict': { - 'id': '2506', - 'title': 'Jazz 24 on 91.3 WUKY-HD3', - 'ext': 'mp3', - 'location': 'Lexington, KY', - }, - 'params': { - 'skip_download': True, # live stream - }, + _TESTS = [{ + 'url': 'http://tunein.com/radio/Jazz-24-p2506/', + 'info_dict': { + 'id': '2506', + 'title': 'Jazz 24 on 91.3 WUKY-HD3', + 'ext': 'mp3', + 'location': 'Lexington, KY', + }, + 'params': { + 'skip_download': True, # live stream }, - ] + }, { + 'url': 'http://tunein.com/embed/player/p191660/', + 'only_matching': True, + }] class TuneInTopicIE(TuneInBaseIE): IE_NAME = 'tunein:topic' - _VALID_URL = r'https?://(?:www\.)?tunein\.com/topic/.*?TopicId\=(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?tunein\.com/(?:topic/.*?TopicId=|embed/player/t)(?P\d+)' _API_URL_QUERY = '?tuneType=Topic&topicId=%s' - _TESTS = [ - { - 'url': 'http://tunein.com/topic/?TopicId=101830576', - 'md5': 'c31a39e6f988d188252eae7af0ef09c9', - 'info_dict': { - 'id': '101830576', - 'title': 'Votez pour moi du 29 octobre 2015 (29/10/15)', - 'ext': 'mp3', - 'location': 'Belgium', - }, + _TESTS = [{ + 'url': 'http://tunein.com/topic/?TopicId=101830576', + 'md5': 'c31a39e6f988d188252eae7af0ef09c9', + 'info_dict': { + 'id': '101830576', + 'title': 'Votez pour moi du 29 octobre 2015 (29/10/15)', + 'ext': 'mp3', + 'location': 'Belgium', }, - ] + }, { + 'url': 'http://tunein.com/embed/player/t101830576/', + 'only_matching': True, + }] class TuneInShortenerIE(InfoExtractor): diff -Nru 
youtube-dl-2016.02.22/youtube_dl/extractor/tunepk.py youtube-dl-2019.05.20/youtube_dl/extractor/tunepk.py --- youtube-dl-2016.02.22/youtube_dl/extractor/tunepk.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/tunepk.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,90 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + try_get, + unified_timestamp, +) + + +class TunePkIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?tune\.pk/(?:video/|player/embed_player.php?.*?\bvid=)| + embed\.tune\.pk/play/ + ) + (?P\d+) + ''' + _TESTS = [{ + 'url': 'https://tune.pk/video/6919541/maudie-2017-international-trailer-1-ft-ethan-hawke-sally-hawkins', + 'md5': '0c537163b7f6f97da3c5dd1e3ef6dd55', + 'info_dict': { + 'id': '6919541', + 'ext': 'mp4', + 'title': 'Maudie (2017) | International Trailer # 1 ft Ethan Hawke, Sally Hawkins', + 'description': 'md5:eb5a04114fafef5cec90799a93a2d09c', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1487327564, + 'upload_date': '20170217', + 'uploader': 'Movie Trailers', + 'duration': 107, + 'view_count': int, + } + }, { + 'url': 'https://tune.pk/player/embed_player.php?vid=6919541&folder=2017/02/17/&width=600&height=350&autoplay=no', + 'only_matching': True, + }, { + 'url': 'https://embed.tune.pk/play/6919541?autoplay=no&ssl=yes&inline=true', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'https://tune.pk/video/%s' % video_id, video_id) + + details = self._parse_json( + self._search_regex( + r'new\s+TunePlayer\(({.+?})\)\s*;\s*\n', webpage, 'tune player'), + video_id)['details'] + + video = details['video'] + title = video.get('title') or self._og_search_title( + webpage, default=None) or self._html_search_meta( + 'title', webpage, 'title', fatal=True) + + formats = self._parse_jwplayer_formats( + details['player']['sources'], video_id) + self._sort_formats(formats) + + description = self._og_search_description( + webpage, default=None) or self._html_search_meta( + 'description', webpage, 'description') + + thumbnail = video.get('thumb') or self._og_search_thumbnail( + webpage, default=None) or self._html_search_meta( + 'thumbnail', webpage, 'thumbnail') + + timestamp = unified_timestamp(video.get('date_added')) + uploader = try_get( + video, lambda x: x['uploader']['name'], + compat_str) or self._html_search_meta('author', webpage, 'author') + + duration = int_or_none(video.get('duration')) + view_count = int_or_none(video.get('views')) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'uploader': uploader, + 'duration': duration, + 'view_count': view_count, + 'formats': formats, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/turbo.py youtube-dl-2019.05.20/youtube_dl/extractor/turbo.py --- youtube-dl-2016.02.22/youtube_dl/extractor/turbo.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/turbo.py 2019-06-07 18:49:07.000000000 +0000 @@ -4,6 +4,7 @@ import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( ExtractorError, int_or_none, @@ -24,7 +25,7 @@ 'duration': 3715, 'title': 'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia... 
', 'description': 'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia...', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', } } @@ -49,7 +50,7 @@ for child in item: m = re.search(r'url_video_(?P.+)', child.tag) if m: - quality = m.group('quality') + quality = compat_str(m.group('quality')) formats.append({ 'format_id': quality, 'url': child.text, diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/turner.py youtube-dl-2019.05.20/youtube_dl/extractor/turner.py --- youtube-dl-2016.02.22/youtube_dl/extractor/turner.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/turner.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,234 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .adobepass import AdobePassIE +from ..compat import compat_str +from ..utils import ( + xpath_text, + int_or_none, + determine_ext, + float_or_none, + parse_duration, + xpath_attr, + update_url_query, + ExtractorError, + strip_or_none, + url_or_none, +) + + +class TurnerBaseIE(AdobePassIE): + _AKAMAI_SPE_TOKEN_CACHE = {} + + def _extract_timestamp(self, video_data): + return int_or_none(xpath_attr(video_data, 'dateCreated', 'uts')) + + def _add_akamai_spe_token(self, tokenizer_src, video_url, content_id, ap_data, custom_tokenizer_query=None): + secure_path = self._search_regex(r'https?://[^/]+(.+/)', video_url, 'secure path') + '*' + token = self._AKAMAI_SPE_TOKEN_CACHE.get(secure_path) + if not token: + query = { + 'path': secure_path, + } + if custom_tokenizer_query: + query.update(custom_tokenizer_query) + else: + query['videoId'] = content_id + if ap_data.get('auth_required'): + query['accessToken'] = self._extract_mvpd_auth(ap_data['url'], content_id, ap_data['site_name'], ap_data['site_name']) + auth = self._download_xml( + tokenizer_src, content_id, query=query) + error_msg = xpath_text(auth, 'error/msg') + if error_msg: + raise ExtractorError(error_msg, expected=True) + token = xpath_text(auth, 'token') + if not token: + return video_url + self._AKAMAI_SPE_TOKEN_CACHE[secure_path] = token + return video_url + '?hdnea=' + token + + def _extract_cvp_info(self, data_src, video_id, path_data={}, ap_data={}): + video_data = self._download_xml(data_src, video_id) + video_id = video_data.attrib['id'] + title = xpath_text(video_data, 'headline', fatal=True) + content_id = xpath_text(video_data, 'contentId') or video_id + # rtmp_src = xpath_text(video_data, 'akamai/src') + # if rtmp_src: + # splited_rtmp_src = rtmp_src.split(',') + # if len(splited_rtmp_src) == 2: + # rtmp_src = splited_rtmp_src[1] + # aifp = xpath_text(video_data, 'akamai/aifp', default='') + + urls = [] + formats = [] + rex = re.compile( + r'(?P[0-9]+)x(?P[0-9]+)(?:_(?P[0-9]+))?') + # Possible formats locations: files/file, files/groupFiles/files + # and maybe others + for video_file in video_data.findall('.//file'): + video_url = video_file.text.strip() + if not video_url: + continue + ext = determine_ext(video_url) + if video_url.startswith('/mp4:protected/'): + continue + # TODO Correct extraction for these files + # protected_path_data = path_data.get('protected') + # if not protected_path_data or not rtmp_src: + # continue + # protected_path = self._search_regex( + # r'/mp4:(.+)\.[a-z0-9]', video_url, 'secure path') + # auth = self._download_webpage( + # protected_path_data['tokenizer_src'], query={ + # 'path': protected_path, + # 'videoId': content_id, + # 'aifp': aifp, + # }) + # token = xpath_text(auth, 
'token') + # if not token: + # continue + # video_url = rtmp_src + video_url + '?' + token + elif video_url.startswith('/secure/'): + secure_path_data = path_data.get('secure') + if not secure_path_data: + continue + video_url = self._add_akamai_spe_token( + secure_path_data['tokenizer_src'], + secure_path_data['media_src'] + video_url, + content_id, ap_data) + elif not re.match('https?://', video_url): + base_path_data = path_data.get(ext, path_data.get('default', {})) + media_src = base_path_data.get('media_src') + if not media_src: + continue + video_url = media_src + video_url + if video_url in urls: + continue + urls.append(video_url) + format_id = video_file.get('bitrate') + if ext == 'smil': + formats.extend(self._extract_smil_formats( + video_url, video_id, fatal=False)) + elif ext == 'm3u8': + m3u8_formats = self._extract_m3u8_formats( + video_url, video_id, 'mp4', + m3u8_id=format_id or 'hls', fatal=False) + if '/secure/' in video_url and '?hdnea=' in video_url: + for f in m3u8_formats: + f['_seekable'] = False + formats.extend(m3u8_formats) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + update_url_query(video_url, {'hdcore': '3.7.0'}), + video_id, f4m_id=format_id or 'hds', fatal=False)) + else: + f = { + 'format_id': format_id, + 'url': video_url, + 'ext': ext, + } + mobj = rex.search(format_id + video_url) + if mobj: + f.update({ + 'width': int(mobj.group('width')), + 'height': int(mobj.group('height')), + 'tbr': int_or_none(mobj.group('bitrate')), + }) + elif isinstance(format_id, compat_str): + if format_id.isdigit(): + f['tbr'] = int(format_id) + else: + mobj = re.match(r'ios_(audio|[0-9]+)$', format_id) + if mobj: + if mobj.group(1) == 'audio': + f.update({ + 'vcodec': 'none', + 'ext': 'm4a', + }) + else: + f['tbr'] = int(mobj.group(1)) + formats.append(f) + self._sort_formats(formats) + + subtitles = {} + for source in video_data.findall('closedCaptions/source'): + for track in source.findall('track'): + track_url = url_or_none(track.get('url')) + if not track_url or track_url.endswith('/big'): + continue + lang = track.get('lang') or track.get('label') or 'en' + subtitles.setdefault(lang, []).append({ + 'url': track_url, + 'ext': { + 'scc': 'scc', + 'webvtt': 'vtt', + 'smptett': 'tt', + }.get(source.get('format')) + }) + + thumbnails = [{ + 'id': image.get('cut'), + 'url': image.text, + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + } for image in video_data.findall('images/image')] + + is_live = xpath_text(video_data, 'isLive') == 'true' + + return { + 'id': video_id, + 'title': self._live_title(title) if is_live else title, + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': thumbnails, + 'thumbnail': xpath_text(video_data, 'poster'), + 'description': strip_or_none(xpath_text(video_data, 'description')), + 'duration': parse_duration(xpath_text(video_data, 'length') or xpath_text(video_data, 'trt')), + 'timestamp': self._extract_timestamp(video_data), + 'upload_date': xpath_attr(video_data, 'metas', 'version'), + 'series': xpath_text(video_data, 'showTitle'), + 'season_number': int_or_none(xpath_text(video_data, 'seasonNumber')), + 'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')), + 'is_live': is_live, + } + + def _extract_ngtv_info(self, media_id, tokenizer_query, ap_data=None): + streams_data = self._download_json( + 'http://medium.ngtv.io/media/%s/tv' % media_id, + media_id)['media']['tv'] + duration = None + chapters = [] + formats = [] + for supported_type in 
('unprotected', 'bulkaes'): + stream_data = streams_data.get(supported_type, {}) + m3u8_url = stream_data.get('secureUrl') or stream_data.get('url') + if not m3u8_url: + continue + if stream_data.get('playlistProtection') == 'spe': + m3u8_url = self._add_akamai_spe_token( + 'http://token.ngtv.io/token/token_spe', + m3u8_url, media_id, ap_data or {}, tokenizer_query) + formats.extend(self._extract_m3u8_formats( + m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False)) + + duration = float_or_none(stream_data.get('totalRuntime')) + + if not chapters: + for chapter in stream_data.get('contentSegments', []): + start_time = float_or_none(chapter.get('start')) + chapter_duration = float_or_none(chapter.get('duration')) + if start_time is None or chapter_duration is None: + continue + chapters.append({ + 'start_time': start_time, + 'end_time': start_time + chapter_duration, + }) + self._sort_formats(formats) + + return { + 'formats': formats, + 'chapters': chapters, + 'duration': duration, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/tutv.py youtube-dl-2019.05.20/youtube_dl/extractor/tutv.py --- youtube-dl-2016.02.22/youtube_dl/extractor/tutv.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/tutv.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,9 +1,10 @@ from __future__ import unicode_literals -import base64 - from .common import InfoExtractor -from ..compat import compat_parse_qs +from ..compat import ( + compat_b64decode, + compat_parse_qs, +) class TutvIE(InfoExtractor): @@ -26,7 +27,7 @@ data_content = self._download_webpage( 'http://tu.tv/flvurl.php?codVideo=%s' % internal_id, video_id, 'Downloading video info') - video_url = base64.b64decode(compat_parse_qs(data_content)['kpt'][0].encode('utf-8')).decode('utf-8') + video_url = compat_b64decode(compat_parse_qs(data_content)['kpt'][0]).decode('utf-8') return { 'id': internal_id, diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/tv2hu.py youtube-dl-2019.05.20/youtube_dl/extractor/tv2hu.py --- youtube-dl-2016.02.22/youtube_dl/extractor/tv2hu.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/tv2hu.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,62 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import int_or_none + + +class TV2HuIE(InfoExtractor): + IE_NAME = 'tv2.hu' + _VALID_URL = r'https?://(?:www\.)?tv2\.hu/(?:[^/]+/)+(?P<id>\d+)_[^/?#]+?\.html' + _TESTS = [{ + 'url': 'http://tv2.hu/ezek_megorultek/217679_ezek-megorultek---1.-adas-1.-resz.html', + 'md5': '585e58e2e090f34603804bb2c48e98d8', + 'info_dict': { + 'id': '217679', + 'ext': 'mp4', + 'title': 'Ezek megőrültek! - 1. adás 1. 
rész', + 'upload_date': '20160826', + 'thumbnail': r're:^https?://.*\.jpg$' + } + }, { + 'url': 'http://tv2.hu/ezek_megorultek/teljes_adasok/217677_ezek-megorultek---1.-adas-2.-resz.html', + 'only_matching': True + }, { + 'url': 'http://tv2.hu/musoraink/aktiv/aktiv_teljes_adas/217963_aktiv-teljes-adas---2016.08.30..html', + 'only_matching': True + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + json_url = self._search_regex( + r'jsonUrl\s*=\s*"([^"]+)"', webpage, 'json url') + json_data = self._download_json(json_url, video_id) + + formats = [] + for b in ('bitrates', 'backupBitrates'): + bitrates = json_data.get(b, {}) + m3u8_url = bitrates.get('hls') + if m3u8_url: + formats.extend(self._extract_wowza_formats( + m3u8_url, video_id, skip_protocols=['rtmp', 'rtsp'])) + + for mp4_url in bitrates.get('mp4', []): + height = int_or_none(self._search_regex( + r'\.(\d+)p\.mp4', mp4_url, 'height', default=None)) + formats.append({ + 'format_id': 'http' + ('-%d' % height if height else ''), + 'url': mp4_url, + 'height': height, + 'width': int_or_none(height / 9.0 * 16.0 if height else None), + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._og_search_title(webpage).strip(), + 'thumbnail': self._og_search_thumbnail(webpage), + 'upload_date': self._search_regex( + r'/vod/(\d{8})/', json_url, 'upload_date', default=None), + 'formats': formats, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/tv2.py youtube-dl-2019.05.20/youtube_dl/extractor/tv2.py --- youtube-dl-2016.02.22/youtube_dl/extractor/tv2.py 2016-01-31 11:57:01.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/tv2.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re @@ -8,13 +8,14 @@ determine_ext, int_or_none, float_or_none, + js_to_json, parse_iso8601, remove_end, ) class TV2IE(InfoExtractor): - _VALID_URL = 'http://(?:www\.)?tv2\.no/v/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?tv2\.no/v/(?P<id>\d+)' _TEST = { 'url': 'http://www.tv2.no/v/916509/', 'info_dict': { @@ -54,10 +55,11 @@ ext = determine_ext(video_url) if ext == 'f4m': formats.extend(self._extract_f4m_formats( - video_url, video_id, f4m_id=format_id)) + video_url, video_id, f4m_id=format_id, fatal=False)) elif ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', m3u8_id=format_id)) + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False)) elif ext == 'ism' or video_url.endswith('.ism/Manifest'): pass else: @@ -100,12 +102,12 @@ class TV2ArticleIE(InfoExtractor): - _VALID_URL = 'http://(?:www\.)?tv2\.no/(?:a|\d{4}/\d{2}/\d{2}(/[^/]+)+)/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?tv2\.no/(?:a|\d{4}/\d{2}/\d{2}(/[^/]+)+)/(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.tv2.no/2015/05/16/nyheter/alesund/krim/pingvin/6930542', 'info_dict': { 'id': '6930542', - 'title': 'Russen hetses etter pingvintyveri – innrømmer å ha åpnet luken på buret', + 'title': 'Russen hetses etter pingvintyveri - innrømmer å ha åpnet luken på buret', 'description': 'md5:339573779d3eea3542ffe12006190954', }, 'playlist_count': 2, @@ -119,9 +121,23 @@ webpage = self._download_webpage(url, playlist_id) + # Old embed pattern (looks unused nowadays) + assets = re.findall(r'data-assetid=["\'](\d+)', webpage) + + if not assets: + # New embed pattern + for v in re.findall(r'TV2ContentboxVideo\(({.+?})\)', webpage): + video = self._parse_json( 
v, playlist_id, transform_source=js_to_json, fatal=False) + if not video: + continue + asset = video.get('assetId') + if asset: + assets.append(asset) + entries = [ - self.url_result('http://www.tv2.no/v/%s' % video_id, 'TV2') - for video_id in re.findall(r'data-assetid="(\d+)"', webpage)] + self.url_result('http://www.tv2.no/v/%s' % asset_id, 'TV2') + for asset_id in assets] title = remove_end(self._og_search_title(webpage), ' - TV2.no') description = remove_end(self._og_search_description(webpage), ' - TV2.no') diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/tv4.py youtube-dl-2019.05.20/youtube_dl/extractor/tv4.py --- youtube-dl-2016.02.22/youtube_dl/extractor/tv4.py 2016-01-14 09:35:12.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/tv4.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,9 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( - ExtractorError, + int_or_none, parse_iso8601, ) @@ -15,33 +17,34 @@ tv4\.se/(?:[^/]+)/klipp/(?:.*)-| tv4play\.se/ (?: - (?:program|barn)/(?:[^\?]+)\?video_id=| + (?:program|barn)/(?:[^/]+/|(?:[^\?]+)\?video_id=)| iframe/video/| film/| sport/| ) )(?P<id>[0-9]+)''' + _GEO_COUNTRIES = ['SE'] _TESTS = [ { 'url': 'http://www.tv4.se/kalla-fakta/klipp/kalla-fakta-5-english-subtitles-2491650', - 'md5': '909d6454b87b10a25aa04c4bdd416a9b', + 'md5': 'cb837212f342d77cec06e6dad190e96d', 'info_dict': { 'id': '2491650', 'ext': 'mp4', 'title': 'Kalla Fakta 5 (english subtitles)', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'timestamp': int, 'upload_date': '20131125', }, }, { 'url': 'http://www.tv4play.se/iframe/video/3054113', - 'md5': '77f851c55139ffe0ebd41b6a5552489b', + 'md5': 'cb837212f342d77cec06e6dad190e96d', 'info_dict': { 'id': '3054113', 'ext': 'mp4', 'title': 'Så här jobbar ficktjuvarna - se avslöjande bilder', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'description': 'Unika bilder avslöjar hur turisternas fickor vittjas mitt på Stockholms central. 
Två experter på ficktjuvarna avslöjar knepen du ska se upp för.', 'timestamp': int, 'upload_date': '20150130', @@ -59,42 +62,54 @@ 'url': 'http://www.tv4play.se/barn/looney-tunes?video_id=3062412', 'only_matching': True, }, + { + 'url': 'http://www.tv4play.se/program/farang/3922081', + 'only_matching': True, + } ] def _real_extract(self, url): video_id = self._match_id(url) info = self._download_json( - 'http://www.tv4play.se/player/assets/%s.json' % video_id, video_id, 'Downloading video info JSON') + 'http://www.tv4play.se/player/assets/%s.json' % video_id, + video_id, 'Downloading video info JSON') + + title = info['title'] + + manifest_url = self._download_json( + 'https://playback-api.b17g.net/media/' + video_id, + video_id, query={ + 'service': 'tv4', + 'device': 'browser', + 'protocol': 'hls', + })['playbackItem']['manifestUrl'] + formats = self._extract_m3u8_formats( + manifest_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False) + formats.extend(self._extract_mpd_formats( + manifest_url.replace('.m3u8', '.mpd'), + video_id, mpd_id='dash', fatal=False)) + formats.extend(self._extract_f4m_formats( + manifest_url.replace('.m3u8', '.f4m'), + video_id, f4m_id='hds', fatal=False)) + formats.extend(self._extract_ism_formats( + re.sub(r'\.ism/.+?\.m3u8', r'.ism/Manifest', manifest_url), + video_id, ism_id='mss', fatal=False)) + + if not formats and info.get('is_geo_restricted'): + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) - # If is_geo_restricted is true, it doesn't necessarily mean we can't download it - if info['is_geo_restricted']: - self.report_warning('This content might not be available in your country due to licensing restrictions.') - if info['requires_subscription']: - raise ExtractorError('This content requires subscription.', expected=True) - - sources_data = self._download_json( - 'https://prima.tv4play.se/api/web/asset/%s/play.json?protocol=http&videoFormat=MP4' % video_id, video_id, 'Downloading sources JSON') - sources = sources_data['playback'] - - formats = [] - for item in sources.get('items', {}).get('item', []): - ext, bitrate = item['mediaFormat'], item['bitrate'] - formats.append({ - 'format_id': '%s_%s' % (ext, bitrate), - 'tbr': bitrate, - 'ext': ext, - 'url': item['url'], - }) self._sort_formats(formats) return { 'id': video_id, - 'title': info['title'], + 'title': title, 'formats': formats, + # 'subtitles': subtitles, 'description': info.get('description'), 'timestamp': parse_iso8601(info.get('broadcast_date_time')), - 'duration': info.get('duration'), + 'duration': int_or_none(info.get('duration')), 'thumbnail': info.get('image'), - 'is_live': sources.get('live'), + 'is_live': info.get('is_live') is True, } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/tv5mondeplus.py youtube-dl-2019.05.20/youtube_dl/extractor/tv5mondeplus.py --- youtube-dl-2016.02.22/youtube_dl/extractor/tv5mondeplus.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/tv5mondeplus.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,79 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_html, + determine_ext, + extract_attributes, + get_element_by_class, + int_or_none, + parse_duration, + parse_iso8601, +) + + +class TV5MondePlusIE(InfoExtractor): + IE_DESC = 'TV5MONDE+' + _VALID_URL = r'https?://(?:www\.)?tv5mondeplus\.com/toutes-les-videos/[^/]+/(?P<id>[^/?#]+)' + _TEST = { + 'url': 
'http://www.tv5mondeplus.com/toutes-les-videos/documentaire/tdah-mon-amour-tele-quebec-tdah-mon-amour-ep001-enfants', + 'md5': '12130fc199f020673138a83466542ec6', + 'info_dict': { + 'id': 'tdah-mon-amour-tele-quebec-tdah-mon-amour-ep001-enfants', + 'ext': 'mp4', + 'title': 'Tdah, mon amour - Enfants', + 'description': 'md5:230e3aca23115afcf8006d1bece6df74', + 'upload_date': '20170401', + 'timestamp': 1491022860, + } + } + _GEO_BYPASS = False + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + if ">Ce programme n'est malheureusement pas disponible pour votre zone géographique.<" in webpage: + self.raise_geo_restricted(countries=['FR']) + + series = get_element_by_class('video-detail__title', webpage) + title = episode = get_element_by_class( + 'video-detail__subtitle', webpage) or series + if series and series != title: + title = '%s - %s' % (series, title) + vpl_data = extract_attributes(self._search_regex( + r'(<[^>]+class="video_player_loader"[^>]+>)', + webpage, 'video player loader')) + + video_files = self._parse_json( + vpl_data['data-broadcast'], display_id).get('files', []) + formats = [] + for video_file in video_files: + v_url = video_file.get('url') + if not v_url: + continue + video_format = video_file.get('format') or determine_ext(v_url) + if video_format == 'm3u8': + formats.extend(self._extract_m3u8_formats( + v_url, display_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': v_url, + 'format_id': video_format, + }) + self._sort_formats(formats) + + return { + 'id': display_id, + 'display_id': display_id, + 'title': title, + 'description': clean_html(get_element_by_class('video-detail__description', webpage)), + 'thumbnail': vpl_data.get('data-image'), + 'duration': int_or_none(vpl_data.get('data-duration')) or parse_duration(self._html_search_meta('duration', webpage)), + 'timestamp': parse_iso8601(self._html_search_meta('uploadDate', webpage)), + 'formats': formats, + 'episode': episode, + 'series': series, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/tvanouvelles.py youtube-dl-2019.05.20/youtube_dl/extractor/tvanouvelles.py --- youtube-dl-2016.02.22/youtube_dl/extractor/tvanouvelles.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/tvanouvelles.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,65 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .brightcove import BrightcoveNewIE + + +class TVANouvellesIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tvanouvelles\.ca/videos/(?P<id>\d+)' + _TEST = { + 'url': 'http://www.tvanouvelles.ca/videos/5117035533001', + 'info_dict': { + 'id': '5117035533001', + 'ext': 'mp4', + 'title': 'L’industrie du taxi dénonce l’entente entre Québec et Uber: explications', + 'description': 'md5:479653b7c8cf115747bf5118066bd8b3', + 'uploader_id': '1741764581', + 'timestamp': 1473352030, + 'upload_date': '20160908', + }, + 'add_ie': ['BrightcoveNew'], + } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1741764581/default_default/index.html?videoId=%s' + + def _real_extract(self, url): + brightcove_id = self._match_id(url) + return self.url_result( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, + BrightcoveNewIE.ie_key(), brightcove_id) + + +class TVANouvellesArticleIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tvanouvelles\.ca/(?:[^/]+/)+(?P<id>[^/?#&]+)' + _TEST = { + 'url': 
'http://www.tvanouvelles.ca/2016/11/17/des-policiers-qui-ont-la-meche-un-peu-courte', + 'info_dict': { + 'id': 'des-policiers-qui-ont-la-meche-un-peu-courte', + 'title': 'Des policiers qui ont «la mèche un peu courte»?', + 'description': 'md5:92d363c8eb0f0f030de9a4a84a90a3a0', + }, + 'playlist_mincount': 4, + } + + @classmethod + def suitable(cls, url): + return False if TVANouvellesIE.suitable(url) else super(TVANouvellesArticleIE, cls).suitable(url) + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + entries = [ + self.url_result( + 'http://www.tvanouvelles.ca/videos/%s' % mobj.group('id'), + ie=TVANouvellesIE.ie_key(), video_id=mobj.group('id')) + for mobj in re.finditer( + r'data-video-id=(["\'])?(?P<id>\d+)', webpage)] + + title = self._og_search_title(webpage, fatal=False) + description = self._og_search_description(webpage) + + return self.playlist_result(entries, display_id, title, description) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/tva.py youtube-dl-2019.05.20/youtube_dl/extractor/tva.py --- youtube-dl-2016.02.22/youtube_dl/extractor/tva.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/tva.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,54 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + float_or_none, + smuggle_url, +) + + +class TVAIE(InfoExtractor): + _VALID_URL = r'https?://videos\.tva\.ca/details/_(?P<id>\d+)' + _TEST = { + 'url': 'https://videos.tva.ca/details/_5596811470001', + 'info_dict': { + 'id': '5596811470001', + 'ext': 'mp4', + 'title': 'Un extrait de l\'épisode du dimanche 8 octobre 2017 !', + 'uploader_id': '5481942443001', + 'upload_date': '20171003', + 'timestamp': 1507064617, + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5481942443001/default_default/index.html?videoId=%s' + + def _real_extract(self, url): + video_id = self._match_id(url) + video_data = self._download_json( + 'https://videos.tva.ca/proxy/item/_' + video_id, video_id, headers={ + 'Accept': 'application/json', + }, query={ + 'appId': '5955fc5f23eec60006c951f1', + }) + + def get_attribute(key): + for attribute in video_data.get('attributes', []): + if attribute.get('key') == key: + return attribute.get('value') + return None + + return { + '_type': 'url_transparent', + 'id': video_id, + 'title': get_attribute('title'), + 'url': smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, {'geo_countries': ['CA']}), + 'description': get_attribute('description'), + 'thumbnail': get_attribute('image-background') or get_attribute('image-landscape'), + 'duration': float_or_none(get_attribute('video-duration'), 1000), + 'ie_key': 'BrightcoveNew', + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/tvc.py youtube-dl-2019.05.20/youtube_dl/extractor/tvc.py --- youtube-dl-2016.02.22/youtube_dl/extractor/tvc.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/tvc.py 2019-06-07 18:49:07.000000000 +0000 @@ -11,7 +11,7 @@ class TVCIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?tvc\.ru/video/iframe/id/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?tvc\.ru/video/iframe/id/(?P<id>\d+)' _TEST = { 'url': 'http://www.tvc.ru/video/iframe/id/74622/isPlay/false/id_stat/channel/?acc_video_id=/channel/brand/id/17/show/episodes/episode_id/39702', 'md5': 'bbc5ff531d1e90e856f60fc4b3afd708', 'info_dict': { 'id': '74622', 
'ext': 'mp4', 'title': 'События. "События". Эфир от 22.05.2015 14:30', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 1122, }, } @@ -64,7 +64,7 @@ class TVCArticleIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?tvc\.ru/(?!video/iframe/id/)(?P<id>[^?#]+)' + _VALID_URL = r'https?://(?:www\.)?tvc\.ru/(?!video/iframe/id/)(?P<id>[^?#]+)' _TESTS = [{ 'url': 'http://www.tvc.ru/channel/brand/id/29/show/episodes/episode_id/39702/', 'info_dict': { @@ -72,7 +72,7 @@ 'ext': 'mp4', 'title': 'События. "События". Эфир от 22.05.2015 14:30', 'description': 'md5:ad7aa7db22903f983e687b8a3e98c6dd', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 1122, }, }, { @@ -82,7 +82,7 @@ 'ext': 'mp4', 'title': 'Эксперты: в столице встал вопрос о максимально безопасных остановках', 'description': 'md5:f2098f71e21f309e89f69b525fd9846e', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 278, }, }, { @@ -92,7 +92,7 @@ 'ext': 'mp4', 'title': 'Ещё не поздно. Эфир от 03.08.2013', 'description': 'md5:51fae9f3f8cfe67abce014e428e5b027', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 3316, }, }] diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/tvigle.py youtube-dl-2019.05.20/youtube_dl/extractor/tvigle.py --- youtube-dl-2016.02.22/youtube_dl/extractor/tvigle.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/tvigle.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re @@ -17,6 +17,9 @@ IE_DESC = 'Интернет-телевидение Tvigle.ru' _VALID_URL = r'https?://(?:www\.)?(?:tvigle\.ru/(?:[^/]+/)+(?P<display_id>[^/]+)/$|cloud\.tvigle\.ru/video/(?P<id>\d+))' + _GEO_BYPASS = False + _GEO_COUNTRIES = ['RU'] + _TESTS = [ { 'url': 'http://www.tvigle.ru/video/sokrat/', @@ -58,7 +61,9 @@ if not video_id: webpage = self._download_webpage(url, display_id) video_id = self._html_search_regex( - r'class="video-preview current_playing" id="(\d+)">', + (r'<div[^>]+class=["\']player["\'][^>]+id=["\'](\d+)', + r'var\s+cloudId\s*=\s*["\'](\d+)', + r'class="video-preview current_playing" id="(\d+)"'), webpage, 'video id') video_data = self._download_json( @@ -70,8 +75,13 @@ error_message = item.get('errorMessage') if not videos and error_message: - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, error_message), expected=True) + if item.get('isGeoBlocked') is True: + self.raise_geo_restricted( + msg=error_message, countries=self._GEO_COUNTRIES) + else: + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, error_message), + expected=True) title = item['title'] description = item.get('description') @@ -81,10 +91,10 @@ formats = [] for vcodec, fmts in item['videos'].items(): + if vcodec == 'hls': + continue for format_id, video_url in fmts.items(): if format_id == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', m3u8_id=vcodec)) continue height = self._search_regex( r'^(\d+)[pP]$', format_id, 'height', default=None) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/tvland.py youtube-dl-2019.05.20/youtube_dl/extractor/tvland.py --- youtube-dl-2016.02.22/youtube_dl/extractor/tvland.py 2016-01-01 11:16:56.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/tvland.py 2019-06-07 18:49:07.000000000 +0000 @@ -6,59 +6,29 @@ class TVLandIE(MTVServicesInfoExtractor): IE_NAME = 'tvland.com' - _VALID_URL = 
r'https?://(?:www\.)?tvland\.com/(?:video-clips|episodes)/(?P<id>[^/?#.]+)' + _VALID_URL = r'https?://(?:www\.)?tvland\.com/(?:video-clips|(?:full-)?episodes)/(?P<id>[^/?#.]+)' _FEED_URL = 'http://www.tvland.com/feeds/mrss/' _TESTS = [{ + # Geo-restricted. Without a proxy metadata are still there. With a + # proxy it redirects to http://m.tvland.com/app/ 'url': 'http://www.tvland.com/episodes/hqhps2/everybody-loves-raymond-the-invasion-ep-048', - 'playlist': [ - { - 'md5': '227e9723b9669c05bf51098b10287aa7', - 'info_dict': { - 'id': 'bcbd3a83-3aca-4dca-809b-f78a87dcccdd', - 'ext': 'mp4', - 'title': 'Everybody Loves Raymond|Everybody Loves Raymond 048 HD, Part 1 of 5', - } - }, - { - 'md5': '9fa2b764ec0e8194fb3ebb01a83df88b', - 'info_dict': { - 'id': 'f4279548-6e13-40dd-92e8-860d27289197', - 'ext': 'mp4', - 'title': 'Everybody Loves Raymond|Everybody Loves Raymond 048 HD, Part 2 of 5', - } - }, - { - 'md5': 'fde4c3bccd7cc7e3576b338734153cec', - 'info_dict': { - 'id': '664e4a38-53ef-4115-9bc9-d0f789ec6334', - 'ext': 'mp4', - 'title': 'Everybody Loves Raymond|Everybody Loves Raymond 048 HD, Part 3 of 5', - } - }, - { - 'md5': '247f6780cda6891f2e49b8ae2b10e017', - 'info_dict': { - 'id': '9146ecf5-b15a-4d78-879c-6679b77f4960', - 'ext': 'mp4', - 'title': 'Everybody Loves Raymond|Everybody Loves Raymond 048 HD, Part 4 of 5', - } - }, - { - 'md5': 'fd269f33256e47bad5eb6c40de089ff6', - 'info_dict': { - 'id': '04334a2e-9a47-4214-a8c2-ae5792e2fab7', - 'ext': 'mp4', - 'title': 'Everybody Loves Raymond|Everybody Loves Raymond 048 HD, Part 5 of 5', - } - } - ], + 'info_dict': { + 'description': 'md5:80973e81b916a324e05c14a3fb506d29', + 'title': 'The Invasion', + }, + 'playlist': [], }, { 'url': 'http://www.tvland.com/video-clips/zea2ev/younger-younger--hilary-duff---little-lies', 'md5': 'e2c6389401cf485df26c79c247b08713', 'info_dict': { 'id': 'b8697515-4bbe-4e01-83d5-fa705ce5fa88', 'ext': 'mp4', - 'title': 'Younger|Younger: Hilary Duff - Little Lies', - 'description': 'md5:7d192f56ca8d958645c83f0de8ef0269' + 'title': 'Younger|December 28, 2015|2|NO-EPISODE#|Younger: Hilary Duff - Little Lies', + 'description': 'md5:7d192f56ca8d958645c83f0de8ef0269', + 'upload_date': '20151228', + 'timestamp': 1451289600, }, + }, { + 'url': 'http://www.tvland.com/full-episodes/iu0hz6/younger-a-kiss-is-just-a-kiss-season-3-ep-301', + 'only_matching': True, }] diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/tvn24.py youtube-dl-2019.05.20/youtube_dl/extractor/tvn24.py --- youtube-dl-2016.02.22/youtube_dl/extractor/tvn24.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/tvn24.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,79 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + unescapeHTML, +) + + +class TVN24IE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:[^/]+)\.)?tvn24(?:bis)?\.pl/(?:[^/]+/)*(?P<id>[^/]+)' + _TESTS = [{ + 'url': 'http://www.tvn24.pl/wiadomosci-z-kraju,3/oredzie-artura-andrusa,702428.html', + 'md5': 'fbdec753d7bc29d96036808275f2130c', + 'info_dict': { + 'id': '1584444', + 'ext': 'mp4', + 'title': '"Święta mają być wesołe, dlatego, ludziska, wszyscy pod jemiołę"', + 'description': 'Wyjątkowe orędzie Artura Andrusa, jednego z gości "Szkła kontaktowego".', + 'thumbnail': 're:https?://.*[.]jpeg', + } + }, { + 'url': 'http://fakty.tvn24.pl/ogladaj-online,60/53-konferencja-bezpieczenstwa-w-monachium,716431.html', + 'only_matching': True, + }, { + 'url': 
'http://sport.tvn24.pl/pilka-nozna,105/ligue-1-kamil-glik-rozcial-glowe-monaco-tylko-remisuje-z-bastia,716522.html', + 'only_matching': True, + }, { + 'url': 'http://tvn24bis.pl/poranek,146,m/gen-koziej-w-tvn24-bis-wracamy-do-czasow-zimnej-wojny,715660.html', + 'only_matching': True, + }, { + 'url': 'https://www.tvn24.pl/magazyn-tvn24/angie-w-jednej-czwartej-polka-od-szarej-myszki-do-cesarzowej-europy,119,2158', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = self._og_search_title(webpage) + + def extract_json(attr, name, fatal=True): + return self._parse_json( + self._search_regex( + r'\b%s=(["\'])(?P<json>(?!\1).+?)\1' % attr, webpage, + name, group='json', fatal=fatal) or '{}', + video_id, transform_source=unescapeHTML, fatal=fatal) + + quality_data = extract_json('data-quality', 'formats') + + formats = [] + for format_id, url in quality_data.items(): + formats.append({ + 'url': url, + 'format_id': format_id, + 'height': int_or_none(format_id.rstrip('p')), + }) + self._sort_formats(formats) + + description = self._og_search_description(webpage) + thumbnail = self._og_search_thumbnail( + webpage, default=None) or self._html_search_regex( + r'\bdata-poster=(["\'])(?P<url>(?!\1).+?)\1', webpage, + 'thumbnail', group='url') + + share_params = extract_json( + 'data-share-params', 'share params', fatal=False) + if isinstance(share_params, dict): + video_id = share_params.get('id') or video_id + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'formats': formats, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/tvnet.py youtube-dl-2019.05.20/youtube_dl/extractor/tvnet.py --- youtube-dl-2016.02.22/youtube_dl/extractor/tvnet.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/tvnet.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,147 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + unescapeHTML, + url_or_none, +) + + +class TVNetIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^/]+)\.tvnet\.gov\.vn/[^/]+/(?:\d+/)?(?P<id>\d+)(?:/|$)' + _TESTS = [{ + # video + 'url': 'http://de.tvnet.gov.vn/video/109788/vtv1---bac-tuyet-tai-lao-cai-va-ha-giang/tin-nong-24h', + 'md5': 'b4d7abe0252c9b47774760b7519c7558', + 'info_dict': { + 'id': '109788', + 'ext': 'mp4', + 'title': 'VTV1 - Bắc tuyết tại Lào Cai và Hà Giang', + 'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)', + 'is_live': False, + 'view_count': int, + }, + }, { + # audio + 'url': 'http://vn.tvnet.gov.vn/radio/27017/vov1---ban-tin-chieu-10062018/doi-song-va-xa-hoi', + 'md5': 'b5875ce9b0a2eecde029216d0e6db2ae', + 'info_dict': { + 'id': '27017', + 'ext': 'm4a', + 'title': 'VOV1 - Bản tin chiều (10/06/2018)', + 'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)', + 'is_live': False, + }, + }, { + 'url': 'http://us.tvnet.gov.vn/video/118023/129999/ngay-0705', + 'info_dict': { + 'id': '129999', + 'ext': 'mp4', + 'title': 'VTV1 - Quốc hội với cử tri (11/06/2018)', + 'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)', + 'is_live': False, + }, + 'params': { + 'skip_download': True, + }, + }, { + # live stream + 'url': 'http://us.tvnet.gov.vn/kenh-truyen-hinh/1011/vtv1', + 'info_dict': { + 'id': '1011', + 'ext': 'mp4', + 'title': r're:^VTV1 \| LiveTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)', + 'is_live': True, + }, + 
'params': { + 'skip_download': True, + }, + }, { + # radio live stream + 'url': 'http://vn.tvnet.gov.vn/kenh-truyen-hinh/1014', + 'info_dict': { + 'id': '1014', + 'ext': 'm4a', + 'title': r're:VOV1 \| LiveTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://us.tvnet.gov.vn/phim/6136/25510/vtv3---ca-mot-doi-an-oan-tap-1-50/phim-truyen-hinh', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = self._og_search_title( + webpage, default=None) or self._html_search_meta( + 'title', webpage, default=None) or self._search_regex( + r'<title>([^<]+)<', webpage, 'title') + title = re.sub(r'\s*-\s*TV Net\s*$', '', title) + + if '/video/' in url or '/radio/' in url: + is_live = False + elif '/kenh-truyen-hinh/' in url: + is_live = True + else: + is_live = None + + data_file = unescapeHTML(self._search_regex( + r'data-file=(["\'])(?P<url>(?:https?:)?//.+?)\1', webpage, + 'data file', group='url')) + + stream_urls = set() + formats = [] + for stream in self._download_json(data_file, video_id): + if not isinstance(stream, dict): + continue + stream_url = url_or_none(stream.get('url')) + if stream_url in stream_urls or not stream_url: + continue + stream_urls.add(stream_url) + formats.extend(self._extract_m3u8_formats( + stream_url, video_id, 'mp4', + entry_protocol='m3u8' if is_live else 'm3u8_native', + m3u8_id='hls', fatal=False)) + self._sort_formats(formats) + + # better support for radio streams + if title.startswith('VOV'): + for f in formats: + f.update({ + 'ext': 'm4a', + 'vcodec': 'none', + }) + + thumbnail = self._og_search_thumbnail( + webpage, default=None) or unescapeHTML( + self._search_regex( + r'data-image=(["\'])(?P<url>(?:https?:)?//.+?)\1', webpage, + 'thumbnail', default=None, group='url')) + + if is_live: + title = self._live_title(title) + + view_count = int_or_none(self._search_regex( + r'(?s)<div[^>]+\bclass=["\'].*?view-count[^>]+>.*?(\d+).*?</div>', + webpage, 'view count', default=None)) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'is_live': is_live, + 'view_count': view_count, + 'formats': formats, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/tvnoe.py youtube-dl-2019.05.20/youtube_dl/extractor/tvnoe.py --- youtube-dl-2016.02.22/youtube_dl/extractor/tvnoe.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/tvnoe.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,48 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_html, + get_element_by_class, + js_to_json, +) + + +class TVNoeIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tvnoe\.cz/video/(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.tvnoe.cz/video/10362', + 'md5': 'aee983f279aab96ec45ab6e2abb3c2ca', + 'info_dict': { + 'id': '10362', + 'ext': 'mp4', + 'series': 'Noční univerzita', + 'title': 'prof. Tomáš Halík, Th.D. 
- Návrat náboženství a střet civilizací', + 'description': 'md5:f337bae384e1a531a52c55ebc50fff41', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + iframe_url = self._search_regex( + r'<iframe[^>]+src="([^"]+)"', webpage, 'iframe URL') + + ifs_page = self._download_webpage(iframe_url, video_id) + jwplayer_data = self._find_jwplayer_data( + ifs_page, video_id, transform_source=js_to_json) + info_dict = self._parse_jwplayer_data( + jwplayer_data, video_id, require_title=False, base_url=iframe_url) + + info_dict.update({ + 'id': video_id, + 'title': clean_html(get_element_by_class( + 'field-name-field-podnazev', webpage)), + 'description': clean_html(get_element_by_class( + 'field-name-body', webpage)), + 'series': clean_html(get_element_by_class('title', webpage)) + }) + + return info_dict diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/tvnow.py youtube-dl-2019.05.20/youtube_dl/extractor/tvnow.py --- youtube-dl-2016.02.22/youtube_dl/extractor/tvnow.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/tvnow.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,486 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + parse_iso8601, + parse_duration, + str_or_none, + update_url_query, + urljoin, +) + + +class TVNowBaseIE(InfoExtractor): + _VIDEO_FIELDS = ( + 'id', 'title', 'free', 'geoblocked', 'articleLong', 'articleShort', + 'broadcastStartDate', 'isDrm', 'duration', 'season', 'episode', + 'manifest.dashclear', 'manifest.hlsclear', 'manifest.smoothclear', + 'format.title', 'format.defaultImage169Format', 'format.defaultImage169Logo') + + def _call_api(self, path, video_id, query): + return self._download_json( + 'https://api.tvnow.de/v3/' + path, video_id, query=query) + + def _extract_video(self, info, display_id): + video_id = compat_str(info['id']) + title = info['title'] + + paths = [] + for manifest_url in (info.get('manifest') or {}).values(): + if not manifest_url: + continue + manifest_url = update_url_query(manifest_url, {'filter': ''}) + path = self._search_regex(r'https?://[^/]+/(.+?)\.ism/', manifest_url, 'path') + if path in paths: + continue + paths.append(path) + + def url_repl(proto, suffix): + return re.sub( + r'(?:hls|dash|hss)([.-])', proto + r'\1', re.sub( + r'\.ism/(?:[^.]*\.(?:m3u8|mpd)|[Mm]anifest)', + '.ism/' + suffix, manifest_url)) + + def make_urls(proto, suffix): + urls = [url_repl(proto, suffix)] + hd_url = urls[0].replace('/manifest/', '/ngvod/') + if hd_url != urls[0]: + urls.append(hd_url) + return urls + + for man_url in make_urls('dash', '.mpd'): + formats = self._extract_mpd_formats( + man_url, video_id, mpd_id='dash', fatal=False) + for man_url in make_urls('hss', 'Manifest'): + formats.extend(self._extract_ism_formats( + man_url, video_id, ism_id='mss', fatal=False)) + for man_url in make_urls('hls', '.m3u8'): + formats.extend(self._extract_m3u8_formats( + man_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', + fatal=False)) + if formats: + break + else: + if info.get('isDrm'): + raise ExtractorError( + 'Video %s is DRM protected' % video_id, expected=True) + if info.get('geoblocked'): + raise self.raise_geo_restricted() + if not info.get('free', True): + raise ExtractorError( + 'Video %s is not available for free' % video_id, expected=True) + self._sort_formats(formats) + + description = 
info.get('articleLong') or info.get('articleShort') + timestamp = parse_iso8601(info.get('broadcastStartDate'), ' ') + duration = parse_duration(info.get('duration')) + + f = info.get('format', {}) + + thumbnails = [{ + 'url': 'https://aistvnow-a.akamaihd.net/tvnow/movie/%s' % video_id, + }] + thumbnail = f.get('defaultImage169Format') or f.get('defaultImage169Logo') + if thumbnail: + thumbnails.append({ + 'url': thumbnail, + }) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnails': thumbnails, + 'timestamp': timestamp, + 'duration': duration, + 'series': f.get('title'), + 'season_number': int_or_none(info.get('season')), + 'episode_number': int_or_none(info.get('episode')), + 'episode': title, + 'formats': formats, + } + + +class TVNowIE(TVNowBaseIE): + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?tvnow\.(?:de|at|ch)/(?P<station>[^/]+)/ + (?P<show_id>[^/]+)/ + (?!(?:list|jahr)(?:/|$))(?P<id>[^/?\#&]+) + ''' + + @classmethod + def suitable(cls, url): + return (False if TVNowNewIE.suitable(url) or TVNowSeasonIE.suitable(url) or TVNowAnnualIE.suitable(url) or TVNowShowIE.suitable(url) + else super(TVNowIE, cls).suitable(url)) + + _TESTS = [{ + 'url': 'https://www.tvnow.de/rtl2/grip-das-motormagazin/der-neue-porsche-911-gt-3/player', + 'info_dict': { + 'id': '331082', + 'display_id': 'grip-das-motormagazin/der-neue-porsche-911-gt-3', + 'ext': 'mp4', + 'title': 'Der neue Porsche 911 GT 3', + 'description': 'md5:6143220c661f9b0aae73b245e5d898bb', + 'timestamp': 1495994400, + 'upload_date': '20170528', + 'duration': 5283, + 'series': 'GRIP - Das Motormagazin', + 'season_number': 14, + 'episode_number': 405, + 'episode': 'Der neue Porsche 911 GT 3', + }, + }, { + # rtl2 + 'url': 'https://www.tvnow.de/rtl2/armes-deutschland/episode-0008/player', + 'only_matching': True, + }, { + # rtlnitro + 'url': 'https://www.tvnow.de/nitro/alarm-fuer-cobra-11-die-autobahnpolizei/auf-eigene-faust-pilot/player', + 'only_matching': True, + }, { + # superrtl + 'url': 'https://www.tvnow.de/superrtl/die-lustigsten-schlamassel-der-welt/u-a-ketchup-effekt/player', + 'only_matching': True, + }, { + # ntv + 'url': 'https://www.tvnow.de/ntv/startup-news/goetter-in-weiss/player', + 'only_matching': True, + }, { + # vox + 'url': 'https://www.tvnow.de/vox/auto-mobil/neues-vom-automobilmarkt-2017-11-19-17-00-00/player', + 'only_matching': True, + }, { + # rtlplus + 'url': 'https://www.tvnow.de/rtlplus/op-ruft-dr-bruckner/die-vernaehte-frau/player', + 'only_matching': True, + }, { + 'url': 'https://www.tvnow.de/rtl2/grip-das-motormagazin/der-neue-porsche-911-gt-3', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id = '%s/%s' % mobj.group(2, 3) + + info = self._call_api( + 'movies/' + display_id, display_id, query={ + 'fields': ','.join(self._VIDEO_FIELDS), + }) + + return self._extract_video(info, display_id) + + +class TVNowNewIE(InfoExtractor): + _VALID_URL = r'''(?x) + (?P<base_url>https?:// + (?:www\.)?tvnow\.(?:de|at|ch)/ + (?:shows|serien))/ + (?P<show>[^/]+)-\d+/ + [^/]+/ + episode-\d+-(?P<episode>[^/?$&]+)-(?P<id>\d+) + ''' + + _TESTS = [{ + 'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669/2017-05/episode-405-der-neue-porsche-911-gt-3-331082', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + base_url = re.sub(r'(?:shows|serien)', '_', mobj.group('base_url')) + show, episode = mobj.group('show', 'episode') + return 
self.url_result( + # Rewrite new URLs to the old format and use extraction via old API + # at api.tvnow.de as a loophole for bypassing premium content checks + '%s/%s/%s' % (base_url, show, episode), + ie=TVNowIE.ie_key(), video_id=mobj.group('id')) + + +class TVNowNewBaseIE(InfoExtractor): + def _call_api(self, path, video_id, query={}): + result = self._download_json( + 'https://apigw.tvnow.de/module/' + path, video_id, query=query) + error = result.get('error') + if error: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error), expected=True) + return result + + +r""" +TODO: new apigw.tvnow.de based version of TVNowIE. Replace old TVNowIE with it +when api.tvnow.de is shut down. This version can't bypass premium checks though. +class TVNowIE(TVNowNewBaseIE): + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?tvnow\.(?:de|at|ch)/ + (?:shows|serien)/[^/]+/ + (?:[^/]+/)+ + (?P<display_id>[^/?$&]+)-(?P<id>\d+) + ''' + + _TESTS = [{ + # episode with annual navigation + 'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669/2017-05/episode-405-der-neue-porsche-911-gt-3-331082', + 'info_dict': { + 'id': '331082', + 'display_id': 'grip-das-motormagazin/der-neue-porsche-911-gt-3', + 'ext': 'mp4', + 'title': 'Der neue Porsche 911 GT 3', + 'description': 'md5:6143220c661f9b0aae73b245e5d898bb', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1495994400, + 'upload_date': '20170528', + 'duration': 5283, + 'series': 'GRIP - Das Motormagazin', + 'season_number': 14, + 'episode_number': 405, + 'episode': 'Der neue Porsche 911 GT 3', + }, + }, { + # rtl2, episode with season navigation + 'url': 'https://www.tvnow.de/shows/armes-deutschland-11471/staffel-3/episode-14-bernd-steht-seit-der-trennung-von-seiner-frau-allein-da-526124', + 'only_matching': True, + }, { + # rtlnitro + 'url': 'https://www.tvnow.de/serien/alarm-fuer-cobra-11-die-autobahnpolizei-1815/staffel-13/episode-5-auf-eigene-faust-pilot-366822', + 'only_matching': True, + }, { + # superrtl + 'url': 'https://www.tvnow.de/shows/die-lustigsten-schlamassel-der-welt-1221/staffel-2/episode-14-u-a-ketchup-effekt-364120', + 'only_matching': True, + }, { + # ntv + 'url': 'https://www.tvnow.de/shows/startup-news-10674/staffel-2/episode-39-goetter-in-weiss-387630', + 'only_matching': True, + }, { + # vox + 'url': 'https://www.tvnow.de/shows/auto-mobil-174/2017-11/episode-46-neues-vom-automobilmarkt-2017-11-19-17-00-00-380072', + 'only_matching': True, + }, { + 'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669/2017-05/episode-405-der-neue-porsche-911-gt-3-331082', + 'only_matching': True, + }] + + def _extract_video(self, info, url, display_id): + config = info['config'] + source = config['source'] + + video_id = compat_str(info.get('id') or source['videoId']) + title = source['title'].strip() + + paths = [] + for manifest_url in (info.get('manifest') or {}).values(): + if not manifest_url: + continue + manifest_url = update_url_query(manifest_url, {'filter': ''}) + path = self._search_regex(r'https?://[^/]+/(.+?)\.ism/', manifest_url, 'path') + if path in paths: + continue + paths.append(path) + + def url_repl(proto, suffix): + return re.sub( + r'(?:hls|dash|hss)([.-])', proto + r'\1', re.sub( + r'\.ism/(?:[^.]*\.(?:m3u8|mpd)|[Mm]anifest)', + '.ism/' + suffix, manifest_url)) + + formats = self._extract_mpd_formats( + url_repl('dash', '.mpd'), video_id, + mpd_id='dash', fatal=False) + formats.extend(self._extract_ism_formats( + url_repl('hss', 'Manifest'), + video_id, ism_id='mss', fatal=False)) + 
formats.extend(self._extract_m3u8_formats( + url_repl('hls', '.m3u8'), video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + if formats: + break + else: + if try_get(info, lambda x: x['rights']['isDrm']): + raise ExtractorError( + 'Video %s is DRM protected' % video_id, expected=True) + if try_get(config, lambda x: x['boards']['geoBlocking']['block']): + raise self.raise_geo_restricted() + if not info.get('free', True): + raise ExtractorError( + 'Video %s is not available for free' % video_id, expected=True) + self._sort_formats(formats) + + description = source.get('description') + thumbnail = url_or_none(source.get('poster')) + timestamp = unified_timestamp(source.get('previewStart')) + duration = parse_duration(source.get('length')) + + series = source.get('format') + season_number = int_or_none(self._search_regex( + r'staffel-(\d+)', url, 'season number', default=None)) + episode_number = int_or_none(self._search_regex( + r'episode-(\d+)', url, 'episode number', default=None)) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'series': series, + 'season_number': season_number, + 'episode_number': episode_number, + 'episode': title, + 'formats': formats, + } + + def _real_extract(self, url): + display_id, video_id = re.match(self._VALID_URL, url).groups() + info = self._call_api('player/' + video_id, video_id) + return self._extract_video(info, video_id, display_id) +""" + + +class TVNowListBaseIE(TVNowNewBaseIE): + _SHOW_VALID_URL = r'''(?x) + (?P<base_url> + https?:// + (?:www\.)?tvnow\.(?:de|at|ch)/(?:shows|serien)/ + [^/?#&]+-(?P<show_id>\d+) + ) + ''' + + @classmethod + def suitable(cls, url): + return (False if TVNowNewIE.suitable(url) + else super(TVNowListBaseIE, cls).suitable(url)) + + def _extract_items(self, url, show_id, list_id, query): + items = self._call_api( + 'teaserrow/format/episode/' + show_id, list_id, + query=query)['items'] + + entries = [] + for item in items: + if not isinstance(item, dict): + continue + item_url = urljoin(url, item.get('url')) + if not item_url: + continue + video_id = str_or_none(item.get('id') or item.get('videoId')) + item_title = item.get('subheadline') or item.get('text') + entries.append(self.url_result( + item_url, ie=TVNowNewIE.ie_key(), video_id=video_id, + video_title=item_title)) + + return self.playlist_result(entries, '%s/%s' % (show_id, list_id)) + + +class TVNowSeasonIE(TVNowListBaseIE): + _VALID_URL = r'%s/staffel-(?P<id>\d+)' % TVNowListBaseIE._SHOW_VALID_URL + _TESTS = [{ + 'url': 'https://www.tvnow.de/serien/alarm-fuer-cobra-11-die-autobahnpolizei-1815/staffel-13', + 'info_dict': { + 'id': '1815/13', + }, + 'playlist_mincount': 22, + }] + + def _real_extract(self, url): + _, show_id, season_id = re.match(self._VALID_URL, url).groups() + return self._extract_items( + url, show_id, season_id, {'season': season_id}) + + +class TVNowAnnualIE(TVNowListBaseIE): + _VALID_URL = r'%s/(?P<year>\d{4})-(?P<month>\d{2})' % TVNowListBaseIE._SHOW_VALID_URL + _TESTS = [{ + 'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669/2017-05', + 'info_dict': { + 'id': '1669/2017-05', + }, + 'playlist_mincount': 2, + }] + + def _real_extract(self, url): + _, show_id, year, month = re.match(self._VALID_URL, url).groups() + return self._extract_items( + url, show_id, '%s-%s' % (year, month), { + 'year': int(year), + 'month': int(month), + }) + + +class TVNowShowIE(TVNowListBaseIE): + _VALID_URL = 
TVNowListBaseIE._SHOW_VALID_URL + _TESTS = [{ + # annual navigationType + 'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669', + 'info_dict': { + 'id': '1669', + }, + 'playlist_mincount': 73, + }, { + # season navigationType + 'url': 'https://www.tvnow.de/shows/armes-deutschland-11471', + 'info_dict': { + 'id': '11471', + }, + 'playlist_mincount': 3, + }] + + @classmethod + def suitable(cls, url): + return (False if TVNowNewIE.suitable(url) or TVNowSeasonIE.suitable(url) or TVNowAnnualIE.suitable(url) + else super(TVNowShowIE, cls).suitable(url)) + + def _real_extract(self, url): + base_url, show_id = re.match(self._VALID_URL, url).groups() + + result = self._call_api( + 'teaserrow/format/navigation/' + show_id, show_id) + + items = result['items'] + + entries = [] + navigation = result.get('navigationType') + if navigation == 'annual': + for item in items: + if not isinstance(item, dict): + continue + year = int_or_none(item.get('year')) + if year is None: + continue + months = item.get('months') + if not isinstance(months, list): + continue + for month_dict in months: + if not isinstance(month_dict, dict) or not month_dict: + continue + month_number = int_or_none(list(month_dict.keys())[0]) + if month_number is None: + continue + entries.append(self.url_result( + '%s/%04d-%02d' % (base_url, year, month_number), + ie=TVNowAnnualIE.ie_key())) + elif navigation == 'season': + for item in items: + if not isinstance(item, dict): + continue + season_number = int_or_none(item.get('season')) + if season_number is None: + continue + entries.append(self.url_result( + '%s/staffel-%d' % (base_url, season_number), + ie=TVNowSeasonIE.ie_key())) + else: + raise ExtractorError('Unknown navigationType') + + return self.playlist_result(entries, show_id) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/tvplayer.py youtube-dl-2019.05.20/youtube_dl/extractor/tvplayer.py --- youtube-dl-2016.02.22/youtube_dl/extractor/tvplayer.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/tvplayer.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,86 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_HTTPError, + compat_str, +) +from ..utils import ( + extract_attributes, + try_get, + urlencode_postdata, + ExtractorError, +) + + +class TVPlayerIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tvplayer\.com/watch/(?P<id>[^/?#]+)' + _TEST = { + 'url': 'http://tvplayer.com/watch/bbcone', + 'info_dict': { + 'id': '89', + 'ext': 'mp4', + 'title': r're:^BBC One [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + current_channel = extract_attributes(self._search_regex( + r'(<div[^>]+class="[^"]*current-channel[^"]*"[^>]*>)', + webpage, 'channel element')) + title = current_channel['data-name'] + + resource_id = current_channel['data-id'] + + token = self._search_regex( + r'data-token=(["\'])(?P<token>(?!\1).+)\1', webpage, + 'token', group='token') + + context = self._download_json( + 'https://tvplayer.com/watch/context', display_id, + 'Downloading JSON context', query={ + 'resource': resource_id, + 'gen': token, + }) + + validate = context['validate'] + platform = try_get( + context, lambda x: x['platform']['key'], compat_str) or 'firefox' + + try: + response = self._download_json( + 
'http://api.tvplayer.com/api/v2/stream/live', + display_id, 'Downloading JSON stream', headers={ + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + }, data=urlencode_postdata({ + 'id': resource_id, + 'service': 1, + 'platform': platform, + 'validate': validate, + }))['tvplayer']['response'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError): + response = self._parse_json( + e.cause.read().decode(), resource_id)['tvplayer']['response'] + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, response['error']), expected=True) + raise + + formats = self._extract_m3u8_formats(response['stream'], display_id, 'mp4') + self._sort_formats(formats) + + return { + 'id': resource_id, + 'display_id': display_id, + 'title': self._live_title(title), + 'formats': formats, + 'is_live': True, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/tvplay.py youtube-dl-2019.05.20/youtube_dl/extractor/tvplay.py --- youtube-dl-2016.02.22/youtube_dl/extractor/tvplay.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/tvplay.py 2019-06-07 18:49:07.000000000 +0000 @@ -4,47 +4,61 @@ import re from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_HTTPError, + compat_str, + compat_urlparse, +) from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, parse_iso8601, qualities, + smuggle_url, + try_get, + unsmuggle_url, + update_url_query, + url_or_none, ) class TVPlayIE(InfoExtractor): - IE_DESC = 'TV3Play and related services' - _VALID_URL = r'''(?x)http://(?:www\.)? - (?:tvplay\.lv/parraides| - tv3play\.lt/programos| - play\.tv3\.lt/programos| - tv3play\.ee/sisu| - tv3play\.se/program| - tv6play\.se/program| - tv8play\.se/program| - tv10play\.se/program| - tv3play\.no/programmer| - viasat4play\.no/programmer| - tv6play\.no/programmer| - tv3play\.dk/programmer| - play\.novatv\.bg/programi - )/[^/]+/(?P<id>\d+) - ''' + IE_NAME = 'mtg' + IE_DESC = 'MTG services' + _VALID_URL = r'''(?x) + (?: + mtg:| + https?:// + (?:www\.)? + (?: + tvplay(?:\.skaties)?\.lv(?:/parraides)?| + (?:tv3play|play\.tv3)\.lt(?:/programos)?| + tv3play(?:\.tv3)?\.ee/sisu| + (?:tv(?:3|6|8|10)play|viafree)\.se/program| + (?:(?:tv3play|viasat4play|tv6play|viafree)\.no|(?:tv3play|viafree)\.dk)/programmer| + play\.nova(?:tv)?\.bg/programi + ) + /(?:[^/]+/)+ + ) + (?P<id>\d+) + ''' _TESTS = [ { 'url': 'http://www.tvplay.lv/parraides/vinas-melo-labak/418113?autostart=true', + 'md5': 'a1612fe0849455423ad8718fe049be21', 'info_dict': { 'id': '418113', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Kādi ir īri? 
- Viņas melo labāk', 'description': 'Baiba apsmej īrus, kādi tie ir un ko viņi dara.', + 'series': 'Viņas melo labāk', + 'season': '2.sezona', + 'season_number': 2, 'duration': 25, 'timestamp': 1406097056, 'upload_date': '20140723', }, - 'params': { - # rtmp download - 'skip_download': True, - }, }, { 'url': 'http://play.tv3.lt/programos/moterys-meluoja-geriau/409229?autostart=true', @@ -53,6 +67,10 @@ 'ext': 'flv', 'title': 'Moterys meluoja geriau', 'description': 'md5:9aec0fc68e2cbc992d2a140bd41fa89e', + 'series': 'Moterys meluoja geriau', + 'episode_number': 47, + 'season': '1 sezonas', + 'season_number': 1, 'duration': 1330, 'timestamp': 1403769181, 'upload_date': '20140626', @@ -82,7 +100,7 @@ 'url': 'http://www.tv3play.se/program/husraddarna/395385?autostart=true', 'info_dict': { 'id': '395385', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Husräddarna S02E07', 'description': 'md5:f210c6c89f42d4fc39faa551be813777', 'duration': 2574, @@ -90,7 +108,6 @@ 'upload_date': '20140520', }, 'params': { - # rtmp download 'skip_download': True, }, }, @@ -98,7 +115,7 @@ 'url': 'http://www.tv6play.se/program/den-sista-dokusapan/266636?autostart=true', 'info_dict': { 'id': '266636', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Den sista dokusåpan S01E08', 'description': 'md5:295be39c872520221b933830f660b110', 'duration': 1492, @@ -107,7 +124,6 @@ 'age_limit': 18, }, 'params': { - # rtmp download 'skip_download': True, }, }, @@ -115,7 +131,7 @@ 'url': 'http://www.tv8play.se/program/antikjakten/282756?autostart=true', 'info_dict': { 'id': '282756', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Antikjakten S01E10', 'description': 'md5:1b201169beabd97e20c5ad0ad67b13b8', 'duration': 2646, @@ -123,7 +139,6 @@ 'upload_date': '20120925', }, 'params': { - # rtmp download 'skip_download': True, }, }, @@ -131,7 +146,7 @@ 'url': 'http://www.tv3play.no/programmer/anna-anka-soker-assistent/230898?autostart=true', 'info_dict': { 'id': '230898', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Anna Anka søker assistent - Ep. 8', 'description': 'md5:f80916bf5bbe1c5f760d127f8dd71474', 'duration': 2656, @@ -139,7 +154,6 @@ 'upload_date': '20100628', }, 'params': { - # rtmp download 'skip_download': True, }, }, @@ -147,7 +161,7 @@ 'url': 'http://www.viasat4play.no/programmer/budbringerne/21873?autostart=true', 'info_dict': { 'id': '21873', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Budbringerne program 10', 'description': 'md5:4db78dc4ec8a85bb04fd322a3ee5092d', 'duration': 1297, @@ -155,7 +169,6 @@ 'upload_date': '20090929', }, 'params': { - # rtmp download 'skip_download': True, }, }, @@ -163,7 +176,7 @@ 'url': 'http://www.tv6play.no/programmer/hotelinspektor-alex-polizzi/361883?autostart=true', 'info_dict': { 'id': '361883', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Hotelinspektør Alex Polizzi - Ep. 
10', 'description': 'md5:3ecf808db9ec96c862c8ecb3a7fdaf81', 'duration': 2594, @@ -171,7 +184,6 @@ 'upload_date': '20140224', }, 'params': { - # rtmp download 'skip_download': True, }, }, @@ -191,59 +203,355 @@ 'skip_download': True, }, }, + { + 'url': 'https://play.nova.bg/programi/zdravei-bulgariya/764300?autostart=true', + 'only_matching': True, + }, + { + 'url': 'http://tvplay.skaties.lv/parraides/vinas-melo-labak/418113?autostart=true', + 'only_matching': True, + }, + { + 'url': 'https://tvplay.skaties.lv/vinas-melo-labak/418113/?autostart=true', + 'only_matching': True, + }, + { + # views is null + 'url': 'http://tvplay.skaties.lv/parraides/tv3-zinas/760183', + 'only_matching': True, + }, + { + 'url': 'http://tv3play.tv3.ee/sisu/kodu-keset-linna/238551?autostart=true', + 'only_matching': True, + }, + { + 'url': 'http://www.viafree.se/program/underhallning/i-like-radio-live/sasong-1/676869', + 'only_matching': True, + }, + { + 'url': 'mtg:418113', + 'only_matching': True, + } ] def _real_extract(self, url): - video_id = self._match_id(url) + url, smuggled_data = unsmuggle_url(url, {}) + self._initialize_geo_bypass({ + 'countries': smuggled_data.get('geo_countries'), + }) + video_id = self._match_id(url) + geo_country = self._search_regex( + r'https?://[^/]+\.([a-z]{2})', url, + 'geo country', default=None) + if geo_country: + self._initialize_geo_bypass({'countries': [geo_country.upper()]}) video = self._download_json( - 'http://playapi.mtgx.tv/v1/videos/%s' % video_id, video_id, 'Downloading video JSON') + 'http://playapi.mtgx.tv/v3/videos/%s' % video_id, video_id, 'Downloading video JSON') - if video['is_geo_blocked']: - self.report_warning( - 'This content might not be available in your country due to copyright reasons') + title = video['title'] - streams = self._download_json( - 'http://playapi.mtgx.tv/v1/videos/stream/%s' % video_id, video_id, 'Downloading streams JSON') + try: + streams = self._download_json( + 'http://playapi.mtgx.tv/v3/videos/stream/%s' % video_id, + video_id, 'Downloading streams JSON') + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + msg = self._parse_json(e.cause.read().decode('utf-8'), video_id) + raise ExtractorError(msg['msg'], expected=True) + raise quality = qualities(['hls', 'medium', 'high']) formats = [] - for format_id, video_url in streams['streams'].items(): - if not video_url or not isinstance(video_url, compat_str): + for format_id, video_url in streams.get('streams', {}).items(): + video_url = url_or_none(video_url) + if not video_url: continue - fmt = { - 'format_id': format_id, - 'preference': quality(format_id), - } - if video_url.startswith('rtmp'): - m = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', video_url) - if not m: - continue - fmt.update({ - 'ext': 'flv', - 'url': m.group('url'), - 'app': m.group('app'), - 'play_path': m.group('playpath'), - }) - elif video_url.endswith('.f4m'): + ext = determine_ext(video_url) + if ext == 'f4m': formats.extend(self._extract_f4m_formats( - video_url + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81', video_id)) - continue + update_url_query(video_url, { + 'hdcore': '3.5.0', + 'plugin': 'aasp-3.5.0.151.81' + }), video_id, f4m_id='hds', fatal=False)) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) else: - fmt.update({ - 'url': video_url, - }) - formats.append(fmt) + fmt = { + 'format_id': format_id, + 'quality': quality(format_id), + 'ext': 
ext, + } + if video_url.startswith('rtmp'): + if smuggled_data.get('skip_rtmp'): + continue + m = re.search( + r'^(?P<url>rtmp://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', video_url) + if not m: + continue + fmt.update({ + 'ext': 'flv', + 'url': m.group('url'), + 'app': m.group('app'), + 'play_path': m.group('playpath'), + 'preference': -1, + }) + else: + fmt.update({ + 'url': video_url, + }) + formats.append(fmt) + + if not formats and video.get('is_geo_blocked'): + self.raise_geo_restricted( + 'This content might not be available in your country due to copyright reasons') + + self._sort_formats(formats) + # TODO: webvtt in m3u8 + subtitles = {} + sami_path = video.get('sami_path') + if sami_path: + lang = self._search_regex( + r'_([a-z]{2})\.xml', sami_path, 'lang', + default=compat_urlparse.urlparse(url).netloc.rsplit('.', 1)[-1]) + subtitles[lang] = [{ + 'url': sami_path, + }] + + series = video.get('format_title') + episode_number = int_or_none(video.get('format_position', {}).get('episode')) + season = video.get('_embedded', {}).get('season', {}).get('title') + season_number = int_or_none(video.get('format_position', {}).get('season')) + + return { + 'id': video_id, + 'title': title, + 'description': video.get('description'), + 'series': series, + 'episode_number': episode_number, + 'season': season, + 'season_number': season_number, + 'duration': int_or_none(video.get('duration')), + 'timestamp': parse_iso8601(video.get('created_at')), + 'view_count': try_get(video, lambda x: x['views']['total'], int), + 'age_limit': int_or_none(video.get('age_limit', 0)), + 'formats': formats, + 'subtitles': subtitles, + } + + +class ViafreeIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?:www\.)? + viafree\. + (?: + (?:dk|no)/programmer| + se/program + ) + /(?:[^/]+/)+(?P<id>[^/?#&]+) + ''' + _TESTS = [{ + 'url': 'http://www.viafree.se/program/livsstil/husraddarna/sasong-2/avsnitt-2', + 'info_dict': { + 'id': '395375', + 'ext': 'mp4', + 'title': 'Husräddarna S02E02', + 'description': 'md5:4db5c933e37db629b5a2f75dfb34829e', + 'series': 'Husräddarna', + 'season': 'Säsong 2', + 'season_number': 2, + 'duration': 2576, + 'timestamp': 1400596321, + 'upload_date': '20140520', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [TVPlayIE.ie_key()], + }, { + # with relatedClips + 'url': 'http://www.viafree.se/program/reality/sommaren-med-youtube-stjarnorna/sasong-1/avsnitt-1', + 'info_dict': { + 'id': '758770', + 'ext': 'mp4', + 'title': 'Sommaren med YouTube-stjärnorna S01E01', + 'description': 'md5:2bc69dce2c4bb48391e858539bbb0e3f', + 'series': 'Sommaren med YouTube-stjärnorna', + 'season': 'Säsong 1', + 'season_number': 1, + 'duration': 1326, + 'timestamp': 1470905572, + 'upload_date': '20160811', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [TVPlayIE.ie_key()], + }, { + # Different og:image URL schema + 'url': 'http://www.viafree.se/program/reality/sommaren-med-youtube-stjarnorna/sasong-1/avsnitt-2', + 'only_matching': True, + }, { + 'url': 'http://www.viafree.no/programmer/underholdning/det-beste-vorspielet/sesong-2/episode-1', + 'only_matching': True, + }, { + 'url': 'http://www.viafree.dk/programmer/reality/paradise-hotel/saeson-7/episode-5', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if TVPlayIE.suitable(url) else super(ViafreeIE, cls).suitable(url) + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + data = self._parse_json( + self._search_regex( + 
r'(?s)window\.App\s*=\s*({.+?})\s*;\s*</script', + webpage, 'data', default='{}'), + video_id, transform_source=lambda x: re.sub( + r'(?s)function\s+[a-zA-Z_][\da-zA-Z_]*\s*\([^)]*\)\s*{[^}]*}\s*', + 'null', x), fatal=False) + + video_id = None + + if data: + video_id = try_get( + data, lambda x: x['context']['dispatcher']['stores'][ + 'ContentPageProgramStore']['currentVideo']['id'], + compat_str) + + # Fallback #1 (extract from og:image URL schema) + if not video_id: + thumbnail = self._og_search_thumbnail(webpage, default=None) + if thumbnail: + video_id = self._search_regex( + # Patterns seen: + # http://cdn.playapi.mtgx.tv/imagecache/600x315/cloud/content-images/inbox/765166/a2e95e5f1d735bab9f309fa345cc3f25.jpg + # http://cdn.playapi.mtgx.tv/imagecache/600x315/cloud/content-images/seasons/15204/758770/4a5ba509ca8bc043e1ebd1a76131cdf2.jpg + r'https?://[^/]+/imagecache/(?:[^/]+/)+(\d{6,})/', + thumbnail, 'video id', default=None) + + # Fallback #2. Extract from raw JSON string. + # May extract wrong video id if relatedClips is present. + if not video_id: + video_id = self._search_regex( + r'currentVideo["\']\s*:\s*.+?["\']id["\']\s*:\s*["\'](\d{6,})', + webpage, 'video id') + + return self.url_result( + smuggle_url( + 'mtg:%s' % video_id, + { + 'geo_countries': [ + compat_urlparse.urlparse(url).netloc.rsplit('.', 1)[-1]], + # rtmp host mtgfs.fplive.net for viafree is unresolvable + 'skip_rtmp': True, + }), + ie=TVPlayIE.ie_key(), video_id=video_id) + + +class TVPlayHomeIE(InfoExtractor): + _VALID_URL = r'https?://tvplay\.(?:tv3\.lt|skaties\.lv|tv3\.ee)/[^/]+/[^/?#&]+-(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://tvplay.tv3.lt/aferistai-n-7/aferistai-10047125/', + 'info_dict': { + 'id': '366367', + 'ext': 'mp4', + 'title': 'Aferistai', + 'description': 'Aferistai. 
Kalėdinė pasaka.', + 'series': 'Aferistai [N-7]', + 'season': '1 sezonas', + 'season_number': 1, + 'duration': 464, + 'timestamp': 1394209658, + 'upload_date': '20140307', + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [TVPlayIE.ie_key()], + }, { + 'url': 'https://tvplay.skaties.lv/vinas-melo-labak/vinas-melo-labak-10280317/', + 'only_matching': True, + }, { + 'url': 'https://tvplay.tv3.ee/cool-d-ga-mehhikosse/cool-d-ga-mehhikosse-10044354/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video_id = self._search_regex( + r'data-asset-id\s*=\s*["\'](\d{5,})\b', webpage, 'video id') + + if len(video_id) < 8: + return self.url_result( + 'mtg:%s' % video_id, ie=TVPlayIE.ie_key(), video_id=video_id) + + m3u8_url = self._search_regex( + r'data-file\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'm3u8 url', group='url') + + formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') self._sort_formats(formats) + title = self._search_regex( + r'data-title\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, + 'title', default=None, group='value') or self._html_search_meta( + 'title', webpage, default=None) or self._og_search_title( + webpage) + + description = self._html_search_meta( + 'description', webpage, + default=None) or self._og_search_description(webpage) + + thumbnail = self._search_regex( + r'data-image\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'thumbnail', default=None, group='url') or self._html_search_meta( + 'thumbnail', webpage, default=None) or self._og_search_thumbnail( + webpage) + + duration = int_or_none(self._search_regex( + r'data-duration\s*=\s*["\'](\d+)', webpage, 'duration', + fatal=False)) + + season = self._search_regex( + (r'data-series-title\s*=\s*(["\'])[^/]+/(?P<value>(?:(?!\1).)+)\1', + r'\bseason\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage, + 'season', default=None, group='value') + season_number = int_or_none(self._search_regex( + r'(\d+)(?:[.\s]+sezona|\s+HOOAEG)', season or '', 'season number', + default=None)) + episode = self._search_regex( + (r'\bepisode\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1', + r'data-subtitle\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage, + 'episode', default=None, group='value') + episode_number = int_or_none(self._search_regex( + r'(?:S[eē]rija|Osa)\s+(\d+)', episode or '', 'episode number', + default=None)) + return { 'id': video_id, - 'title': video['title'], - 'description': video['description'], - 'duration': video['duration'], - 'timestamp': parse_iso8601(video['created_at']), - 'view_count': video['views']['total'], - 'age_limit': video.get('age_limit', 0), + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'season': season, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, 'formats': formats, } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/tvp.py youtube-dl-2019.05.20/youtube_dl/extractor/tvp.py --- youtube-dl-2016.02.22/youtube_dl/extractor/tvp.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/tvp.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,47 +1,117 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals +import itertools import re from .common import InfoExtractor +from ..utils import ( + clean_html, + determine_ext, + ExtractorError, + 
get_element_by_attribute, + orderedSet, +) -class TvpIE(InfoExtractor): - IE_NAME = 'tvp.pl' - _VALID_URL = r'https?://(?:vod|www)\.tvp\.pl/.*/(?P<id>\d+)$' +class TVPIE(InfoExtractor): + IE_NAME = 'tvp' + IE_DESC = 'Telewizja Polska' + _VALID_URL = r'https?://[^/]+\.tvp\.(?:pl|info)/(?:video/(?:[^,\s]*,)*|(?:(?!\d+/)[^/]+/)*)(?P<id>\d+)' _TESTS = [{ - 'url': 'http://vod.tvp.pl/filmy-fabularne/filmy-za-darmo/ogniem-i-mieczem/wideo/odc-2/4278035', - 'md5': 'cdd98303338b8a7f7abab5cd14092bf2', - 'info_dict': { - 'id': '4278035', - 'ext': 'wmv', - 'title': 'Ogniem i mieczem, odc. 2', - }, - }, { - 'url': 'http://vod.tvp.pl/seriale/obyczajowe/czas-honoru/sezon-1-1-13/i-seria-odc-13/194536', - 'md5': '8aa518c15e5cc32dfe8db400dc921fbb', + 'url': 'https://vod.tvp.pl/video/czas-honoru,i-seria-odc-13,194536', + 'md5': 'a21eb0aa862f25414430f15fdfb9e76c', 'info_dict': { 'id': '194536', 'ext': 'mp4', - 'title': 'Czas honoru, I seria – odc. 13', + 'title': 'Czas honoru, odc. 13 – Władek', + 'description': 'md5:437f48b93558370b031740546b696e24', }, }, { 'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176', - 'md5': 'c3b15ed1af288131115ff17a17c19dda', + 'md5': 'b0005b542e5b4de643a9690326ab1257', 'info_dict': { 'id': '17916176', 'ext': 'mp4', 'title': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata', + 'description': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata', + }, + }, { + # page id is not the same as video id(#7799) + 'url': 'https://wiadomosci.tvp.pl/33908820/28092017-1930', + 'md5': '84cd3c8aec4840046e5ab712416b73d0', + 'info_dict': { + 'id': '33908820', + 'ext': 'mp4', + 'title': 'Wiadomości, 28.09.2017, 19:30', + 'description': 'Wydanie główne codziennego serwisu informacyjnego.' }, + 'skip': 'HTTP Error 404: Not Found', }, { 'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272', - 'md5': 'c3b15ed1af288131115ff17a17c19dda', + 'only_matching': True, + }, { + 'url': 'http://wiadomosci.tvp.pl/25169746/24052016-1200', + 'only_matching': True, + }, { + 'url': 'http://krakow.tvp.pl/25511623/25lecie-mck-wyjatkowe-miejsce-na-mapie-krakowa', + 'only_matching': True, + }, { + 'url': 'http://teleexpress.tvp.pl/25522307/wierni-wzieli-udzial-w-procesjach', + 'only_matching': True, + }, { + 'url': 'http://sport.tvp.pl/25522165/krychowiak-uspokaja-w-sprawie-kontuzji-dwa-tygodnie-to-maksimum', + 'only_matching': True, + }, { + 'url': 'http://www.tvp.info/25511919/trwa-rewolucja-wladza-zdecydowala-sie-na-pogwalcenie-konstytucji', + 'only_matching': True, + }] + + def _real_extract(self, url): + page_id = self._match_id(url) + webpage = self._download_webpage(url, page_id) + video_id = self._search_regex([ + r'<iframe[^>]+src="[^"]*?object_id=(\d+)', + r"object_id\s*:\s*'(\d+)'", + r'data-video-id="(\d+)"'], webpage, 'video id', default=page_id) + return { + '_type': 'url_transparent', + 'url': 'tvp:' + video_id, + 'description': self._og_search_description( + webpage, default=None) or self._html_search_meta( + 'description', webpage, default=None), + 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'ie_key': 'TVPEmbed', + } + + +class TVPEmbedIE(InfoExtractor): + IE_NAME = 'tvp:embed' + IE_DESC = 'Telewizja Polska' + _VALID_URL = r'(?:tvp:|https?://[^/]+\.tvp\.(?:pl|info)/sess/tvplayer\.php\?.*?object_id=)(?P<id>\d+)' + + _TESTS = [{ + 'url': 'tvp:194536', + 'md5': 'a21eb0aa862f25414430f15fdfb9e76c', + 'info_dict': { + 'id': '194536', + 'ext': 'mp4', + 'title': 'Czas honoru, odc. 
13 – Władek', + }, + }, { + # not available + 'url': 'http://www.tvp.pl/sess/tvplayer.php?object_id=22670268', + 'md5': '8c9cd59d16edabf39331f93bf8a766c7', 'info_dict': { - 'id': '17834272', + 'id': '22670268', 'ext': 'mp4', - 'title': 'Na sygnale, odc. 39', + 'title': 'Panorama, 07.12.2015, 15:40', }, + 'skip': 'Transmisja została zakończona lub materiał niedostępny', + }, { + 'url': 'tvp:22670268', + 'only_matching': True, }] def _real_extract(self, url): @@ -50,6 +120,14 @@ webpage = self._download_webpage( 'http://www.tvp.pl/sess/tvplayer.php?object_id=%s' % video_id, video_id) + error = self._html_search_regex( + r'(?s)<p[^>]+\bclass=["\']notAvailable__text["\'][^>]*>(.+?)</p>', + webpage, 'error', default=None) or clean_html( + get_element_by_attribute('class', 'msg error', webpage)) + if error: + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, clean_html(error)), expected=True) + title = self._search_regex( r'name\s*:\s*([\'"])Title\1\s*,\s*value\s*:\s*\1(?P<title>.+?)\1', webpage, 'title', group='title') @@ -63,24 +141,52 @@ r"poster\s*:\s*'([^']+)'", webpage, 'thumbnail', default=None) video_url = self._search_regex( - r'0:{src:([\'"])(?P<url>.*?)\1', webpage, 'formats', group='url', default=None) - if not video_url: + r'0:{src:([\'"])(?P<url>.*?)\1', webpage, + 'formats', group='url', default=None) + if not video_url or 'material_niedostepny.mp4' in video_url: video_url = self._download_json( 'http://www.tvp.pl/pub/stat/videofileinfo?video_id=%s' % video_id, video_id)['video_url'] - ext = video_url.rsplit('.', 1)[-1] - if ext != 'ism/manifest': - if '/' in ext: - ext = 'mp4' + formats = [] + video_url_base = self._search_regex( + r'(https?://.+?/video)(?:\.(?:ism|f4m|m3u8)|-\d+\.mp4)', + video_url, 'video base url', default=None) + if video_url_base: + # TODO: <Group> found instead of <AdaptationSet> in MPD manifest. + # It's not mentioned in MPEG-DASH standard. Figure that out. 
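+            # A possible workaround (untested sketch, not part of this
+            # change; _download_xml and _parse_mpd_formats are the stock
+            # helpers from common.py): fetch the manifest, rename the
+            # non-standard <Group> elements to <AdaptationSet>, and hand
+            # the patched document to the regular MPD parser:
+            #   mpd_doc = self._download_xml(
+            #       video_url_base + '.ism/video.mpd', video_id, fatal=False)
+            #   if mpd_doc:
+            #       ns = '{urn:mpeg:dash:schema:mpd:2011}'
+            #       for group in mpd_doc.iter(ns + 'Group'):
+            #           group.tag = ns + 'AdaptationSet'
+            #       formats.extend(self._parse_mpd_formats(
+            #           mpd_doc, mpd_id='dash',
+            #           mpd_base_url=video_url_base + '.ism'))
+            # The original attempt via _extract_mpd_formats is kept below: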
+ # formats.extend(self._extract_mpd_formats( + # video_url_base + '.ism/video.mpd', + # video_id, mpd_id='dash', fatal=False)) + formats.extend(self._extract_ism_formats( + video_url_base + '.ism/Manifest', + video_id, 'mss', fatal=False)) + formats.extend(self._extract_f4m_formats( + video_url_base + '.ism/video.f4m', + video_id, f4m_id='hds', fatal=False)) + m3u8_formats = self._extract_m3u8_formats( + video_url_base + '.ism/video.m3u8', video_id, + 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) + self._sort_formats(m3u8_formats) + m3u8_formats = list(filter( + lambda f: f.get('vcodec') != 'none', m3u8_formats)) + formats.extend(m3u8_formats) + for i, m3u8_format in enumerate(m3u8_formats, 2): + http_url = '%s-%d.mp4' % (video_url_base, i) + if self._is_valid_url(http_url, video_id): + f = m3u8_format.copy() + f.update({ + 'url': http_url, + 'format_id': f['format_id'].replace('hls', 'http'), + 'protocol': 'http', + }) + formats.append(f) + else: formats = [{ 'format_id': 'direct', 'url': video_url, - 'ext': ext, + 'ext': determine_ext(video_url, 'mp4'), }] - else: - m3u8_url = re.sub('([^/]*)\.ism/manifest', r'\1.ism/\1.m3u8', video_url) - formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4') self._sort_formats(formats) @@ -92,48 +198,55 @@ } -class TvpSeriesIE(InfoExtractor): - IE_NAME = 'tvp.pl:Series' - _VALID_URL = r'https?://vod\.tvp\.pl/(?:[^/]+/){2}(?P<id>[^/]+)/?$' +class TVPWebsiteIE(InfoExtractor): + IE_NAME = 'tvp:series' + _VALID_URL = r'https?://vod\.tvp\.pl/website/(?P<display_id>[^,]+),(?P<id>\d+)' _TESTS = [{ - 'url': 'http://vod.tvp.pl/filmy-fabularne/filmy-za-darmo/ogniem-i-mieczem', + # series + 'url': 'https://vod.tvp.pl/website/lzy-cennet,38678312/video', 'info_dict': { - 'title': 'Ogniem i mieczem', - 'id': '4278026', + 'id': '38678312', }, - 'playlist_count': 4, + 'playlist_count': 115, }, { - 'url': 'http://vod.tvp.pl/audycje/podroze/boso-przez-swiat', + # film + 'url': 'https://vod.tvp.pl/website/gloria,35139666', 'info_dict': { - 'title': 'Boso przez świat', - 'id': '9329207', + 'id': '36637049', + 'ext': 'mp4', + 'title': 'Gloria, Gloria', + }, + 'params': { + 'skip_download': True, }, - 'playlist_count': 86, + 'add_ie': ['TVPEmbed'], + }, { + 'url': 'https://vod.tvp.pl/website/lzy-cennet,38678312', + 'only_matching': True, }] - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id, tries=5) - - title = self._html_search_regex( - r'(?s) id=[\'"]path[\'"]>(?:.*? 
/ ){2}(.*?)</span>', webpage, 'series') - playlist_id = self._search_regex(r'nodeId:\s*(\d+)', webpage, 'playlist id') - playlist = self._download_webpage( - 'http://vod.tvp.pl/vod/seriesAjax?type=series&nodeId=%s&recommend' - 'edId=0&sort=&page=0&pageSize=10000' % playlist_id, display_id, tries=5, - note='Downloading playlist') - - videos_paths = re.findall( - '(?s)class="shortTitle">.*?href="(/[^"]+)', playlist) - entries = [ - self.url_result('http://vod.tvp.pl%s' % v_path, ie=TvpIE.ie_key()) - for v_path in videos_paths] + def _entries(self, display_id, playlist_id): + url = 'https://vod.tvp.pl/website/%s,%s/video' % (display_id, playlist_id) + for page_num in itertools.count(1): + page = self._download_webpage( + url, display_id, 'Downloading page %d' % page_num, + query={'page': page_num}) + + video_ids = orderedSet(re.findall( + r'<a[^>]+\bhref=["\']/video/%s,[^,]+,(\d+)' % display_id, + page)) + + if not video_ids: + break + + for video_id in video_ids: + yield self.url_result( + 'tvp:%s' % video_id, ie=TVPEmbedIE.ie_key(), + video_id=video_id) - return { - '_type': 'playlist', - 'id': playlist_id, - 'display_id': display_id, - 'title': title, - 'entries': entries, - } + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id, playlist_id = mobj.group('display_id', 'id') + return self.playlist_result( + self._entries(display_id, playlist_id), playlist_id) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/tweakers.py youtube-dl-2019.05.20/youtube_dl/extractor/tweakers.py --- youtube-dl-2016.02.22/youtube_dl/extractor/tweakers.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/tweakers.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,25 +1,62 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import ( + int_or_none, + determine_ext, + mimetype2ext, +) class TweakersIE(InfoExtractor): _VALID_URL = r'https?://tweakers\.net/video/(?P<id>\d+)' _TEST = { 'url': 'https://tweakers.net/video/9926/new-nintendo-3ds-xl-op-alle-fronten-beter.html', - 'md5': '3147e4ddad366f97476a93863e4557c8', + 'md5': 'fe73e417c093a788e0160c4025f88b15', 'info_dict': { 'id': '9926', 'ext': 'mp4', 'title': 'New Nintendo 3DS XL - Op alle fronten beter', - 'description': 'md5:f97324cc71e86e11c853f0763820e3ba', - 'thumbnail': 're:^https?://.*\.jpe?g$', + 'description': 'md5:3789b21fed9c0219e9bcaacd43fab280', + 'thumbnail': r're:^https?://.*\.jpe?g$', 'duration': 386, + 'uploader_id': 's7JeEm', } } def _real_extract(self, url): - playlist_id = self._match_id(url) - entries = self._extract_xspf_playlist( - 'https://tweakers.net/video/s1playlist/%s/playlist.xspf' % playlist_id, playlist_id) - return self.playlist_result(entries, playlist_id) + video_id = self._match_id(url) + video_data = self._download_json( + 'https://tweakers.net/video/s1playlist/%s/1920/1080/playlist.json' % video_id, + video_id)['items'][0] + + title = video_data['title'] + + formats = [] + for location in video_data.get('locations', {}).get('progressive', []): + format_id = location.get('label') + width = int_or_none(location.get('width')) + height = int_or_none(location.get('height')) + for source in location.get('sources', []): + source_url = source.get('src') + if not source_url: + continue + ext = mimetype2ext(source.get('type')) or determine_ext(source_url) + formats.append({ + 'format_id': format_id, + 'url': source_url, + 'width': width, + 'height': height, + 'ext': ext, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 
'title': title, + 'description': video_data.get('description'), + 'thumbnail': video_data.get('poster'), + 'duration': int_or_none(video_data.get('duration')), + 'uploader_id': video_data.get('account'), + 'formats': formats, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/twentyfourvideo.py youtube-dl-2019.05.20/youtube_dl/extractor/twentyfourvideo.py --- youtube-dl-2016.02.22/youtube_dl/extractor/twentyfourvideo.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/twentyfourvideo.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( parse_iso8601, @@ -12,48 +14,71 @@ class TwentyFourVideoIE(InfoExtractor): IE_NAME = '24video' - _VALID_URL = r'https?://(?:www\.)?24video\.net/(?:video/(?:view|xml)/|player/new24_play\.swf\?id=)(?P<id>\d+)' - - _TESTS = [ - { - 'url': 'http://www.24video.net/video/view/1044982', - 'md5': 'e09fc0901d9eaeedac872f154931deeb', - 'info_dict': { - 'id': '1044982', - 'ext': 'mp4', - 'title': 'Эротика каменного века', - 'description': 'Как смотрели порно в каменном веке.', - 'thumbnail': 're:^https?://.*\.jpg$', - 'uploader': 'SUPERTELO', - 'duration': 31, - 'timestamp': 1275937857, - 'upload_date': '20100607', - 'age_limit': 18, - 'like_count': int, - 'dislike_count': int, - }, + _VALID_URL = r'''(?x) + https?:// + (?P<host> + (?:(?:www|porno)\.)?24video\. + (?:net|me|xxx|sexy?|tube|adult|site) + )/ + (?: + video/(?:(?:view|xml)/)?| + player/new24_play\.swf\?id= + ) + (?P<id>\d+) + ''' + + _TESTS = [{ + 'url': 'http://www.24video.net/video/view/1044982', + 'md5': 'e09fc0901d9eaeedac872f154931deeb', + 'info_dict': { + 'id': '1044982', + 'ext': 'mp4', + 'title': 'Эротика каменного века', + 'description': 'Как смотрели порно в каменном веке.', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'SUPERTELO', + 'duration': 31, + 'timestamp': 1275937857, + 'upload_date': '20100607', + 'age_limit': 18, + 'like_count': int, + 'dislike_count': int, }, - { - 'url': 'http://www.24video.net/player/new24_play.swf?id=1044982', - 'only_matching': True, - } - ] + }, { + 'url': 'http://www.24video.net/player/new24_play.swf?id=1044982', + 'only_matching': True, + }, { + 'url': 'http://www.24video.me/video/view/1044982', + 'only_matching': True, + }, { + 'url': 'http://www.24video.tube/video/view/2363750', + 'only_matching': True, + }, { + 'url': 'https://www.24video.site/video/view/2640421', + 'only_matching': True, + }, { + 'url': 'https://porno.24video.net/video/2640421-vsya-takaya-gibkaya-i-v-masle', + 'only_matching': True, + }] def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + host = mobj.group('host') webpage = self._download_webpage( - 'http://www.24video.net/video/view/%s' % video_id, video_id) + 'http://%s/video/view/%s' % (host, video_id), video_id) title = self._og_search_title(webpage) description = self._html_search_regex( - r'<span itemprop="description">([^<]+)</span>', webpage, 'description', fatal=False) + r'<(p|span)[^>]+itemprop="description"[^>]*>(?P<description>[^<]+)</\1>', + webpage, 'description', fatal=False, group='description') thumbnail = self._og_search_thumbnail(webpage) duration = int_or_none(self._og_search_property( 'duration', webpage, 'duration', fatal=False)) timestamp = parse_iso8601(self._search_regex( - r'<time id="video-timeago" datetime="([^"]+)" itemprop="uploadDate">', - webpage, 'upload 
date')) + r'<time[^>]+\bdatetime="([^"]+)"[^>]+itemprop="uploadDate"', + webpage, 'upload date', fatal=False)) uploader = self._html_search_regex( r'class="video-uploaded"[^>]*>\s*<a href="/jsecUser/movies/[^"]+"[^>]*>([^<]+)</a>', @@ -63,16 +88,16 @@ r'<span class="video-views">(\d+) просмотр', webpage, 'view count', fatal=False)) comment_count = int_or_none(self._html_search_regex( - r'<div class="comments-title" id="comments-count">(\d+) комментари', - webpage, 'comment count', fatal=False)) + r'<a[^>]+href="#tab-comments"[^>]*>(\d+) комментари', + webpage, 'comment count', default=None)) # Sets some cookies self._download_xml( - r'http://www.24video.net/video/xml/%s?mode=init' % video_id, + r'http://%s/video/xml/%s?mode=init' % (host, video_id), video_id, 'Downloading init XML') video_xml = self._download_xml( - 'http://www.24video.net/video/xml/%s?mode=play' % video_id, + 'http://%s/video/xml/%s?mode=play' % (host, video_id), video_id, 'Downloading video XML') video = xpath_element(video_xml, './/video', 'video', fatal=True) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/twentymin.py youtube-dl-2019.05.20/youtube_dl/extractor/twentymin.py --- youtube-dl-2016.02.22/youtube_dl/extractor/twentymin.py 2016-01-09 00:15:59.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/twentymin.py 2019-06-07 18:49:07.000000000 +0000 @@ -4,70 +4,88 @@ import re from .common import InfoExtractor -from ..utils import remove_end +from ..utils import ( + int_or_none, + try_get, +) class TwentyMinutenIE(InfoExtractor): IE_NAME = '20min' - _VALID_URL = r'https?://(?:www\.)?20min\.ch/(?:videotv/*\?.*\bvid=(?P<id>\d+)|(?:[^/]+/)*(?P<display_id>[^/#?]+))' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?20min\.ch/ + (?: + videotv/*\?.*?\bvid=| + videoplayer/videoplayer\.html\?.*?\bvideoId@ + ) + (?P<id>\d+) + ''' _TESTS = [{ - # regular video 'url': 'http://www.20min.ch/videotv/?vid=469148&cid=2', - 'md5': 'b52d6bc6ea6398e6a38f12cfd418149c', + 'md5': 'e7264320db31eed8c38364150c12496e', 'info_dict': { 'id': '469148', - 'ext': 'flv', + 'ext': 'mp4', 'title': '85 000 Franken für 15 perfekte Minuten', - 'description': 'Was die Besucher vom Silvesterzauber erwarten können. (Video: Alice Grosjean/Murat Temel)', - 'thumbnail': 'http://thumbnails.20min-tv.ch/server063/469148/frame-72-469148.jpg' - } + 'thumbnail': r're:https?://.*\.jpg$', + }, }, { - # news article with video - 'url': 'http://www.20min.ch/schweiz/news/story/-Wir-muessen-mutig-nach-vorne-schauen--22050469', - 'md5': 'cd4cbb99b94130cff423e967cd275e5e', + 'url': 'http://www.20min.ch/videoplayer/videoplayer.html?params=client@twentyDE|videoId@523629', 'info_dict': { - 'id': '469408', - 'display_id': '-Wir-muessen-mutig-nach-vorne-schauen--22050469', - 'ext': 'flv', - 'title': '«Wir müssen mutig nach vorne schauen»', - 'description': 'Kein Land sei innovativer als die Schweiz, sagte Johann Schneider-Ammann in seiner Neujahrsansprache. 
Das Land müsse aber seine Hausaufgaben machen.', - 'thumbnail': 'http://www.20min.ch/images/content/2/2/0/22050469/10/teaserbreit.jpg' - } + 'id': '523629', + 'ext': 'mp4', + 'title': 'So kommen Sie bei Eis und Schnee sicher an', + 'description': 'md5:117c212f64b25e3d95747e5276863f7d', + 'thumbnail': r're:https?://.*\.jpg$', + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://www.20min.ch/videotv/?cid=44&vid=468738', 'only_matching': True, - }, { - 'url': 'http://www.20min.ch/ro/sortir/cinema/story/Grandir-au-bahut--c-est-dur-18927411', - 'only_matching': True, }] + @staticmethod + def _extract_urls(webpage): + return [m.group('url') for m in re.finditer( + r'<iframe[^>]+src=(["\'])(?P<url>(?:(?:https?:)?//)?(?:www\.)?20min\.ch/videoplayer/videoplayer.html\?.*?\bvideoId@\d+.*?)\1', + webpage)] + def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') or video_id - - webpage = self._download_webpage(url, display_id) - - title = self._html_search_regex( - r'<h1>.*?<span>(.+?)</span></h1>', - webpage, 'title', default=None) - if not title: - title = remove_end(re.sub( - r'^20 [Mm]inuten.*? -', '', self._og_search_title(webpage)), ' - News') - - if not video_id: - video_id = self._search_regex( - r'"file\d?"\s*,\s*\"(\d+)', webpage, 'video id') - - description = self._html_search_meta( - 'description', webpage, 'description') - thumbnail = self._og_search_thumbnail(webpage) + video_id = self._match_id(url) + + video = self._download_json( + 'http://api.20min.ch/video/%s/show' % video_id, + video_id)['content'] + + title = video['title'] + + formats = [{ + 'format_id': format_id, + 'url': 'http://podcast.20min-tv.ch/podcast/20min/%s%s.mp4' % (video_id, p), + 'quality': quality, + } for quality, (format_id, p) in enumerate([('sd', ''), ('hd', 'h')])] + self._sort_formats(formats) + + description = video.get('lead') + thumbnail = video.get('thumbnail') + + def extract_count(kind): + return try_get( + video, + lambda x: int_or_none(x['communityobject']['thumbs_%s' % kind])) + + like_count = extract_count('up') + dislike_count = extract_count('down') return { 'id': video_id, - 'display_id': display_id, - 'url': 'http://speed.20min-tv.ch/%sm.flv' % video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, + 'like_count': like_count, + 'dislike_count': dislike_count, + 'formats': formats, } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/twentythreevideo.py youtube-dl-2019.05.20/youtube_dl/extractor/twentythreevideo.py --- youtube-dl-2016.02.22/youtube_dl/extractor/twentythreevideo.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/twentythreevideo.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,77 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import int_or_none + + +class TwentyThreeVideoIE(InfoExtractor): + IE_NAME = '23video' + _VALID_URL = r'https?://video\.(?P<domain>twentythree\.net|23video\.com|filmweb\.no)/v\.ihtml/player\.html\?(?P<query>.*?\bphoto(?:_|%5f)id=(?P<id>\d+).*)' + _TEST = { + 'url': 'https://video.twentythree.net/v.ihtml/player.html?showDescriptions=0&source=site&photo%5fid=20448876&autoPlay=1', + 'md5': '75fcf216303eb1dae9920d651f85ced4', + 'info_dict': { + 'id': '20448876', + 'ext': 'mp4', + 'title': 'Video Marketing Minute: Personalized Video', + 'timestamp': 1513855354, + 'upload_date': '20171221', + 'uploader_id': '12258964', + 'uploader': 'Rasmus 
Bysted', + } + } + + def _real_extract(self, url): + domain, query, photo_id = re.match(self._VALID_URL, url).groups() + base_url = 'https://video.%s' % domain + photo_data = self._download_json( + base_url + '/api/photo/list?' + query, photo_id, query={ + 'format': 'json', + }, transform_source=lambda s: self._search_regex(r'(?s)({.+})', s, 'photo data'))['photo'] + title = photo_data['title'] + + formats = [] + + audio_path = photo_data.get('audio_download') + if audio_path: + formats.append({ + 'format_id': 'audio', + 'url': base_url + audio_path, + 'filesize': int_or_none(photo_data.get('audio_size')), + 'vcodec': 'none', + }) + + def add_common_info_to_list(l, template, id_field, id_value): + f_base = template % id_value + f_path = photo_data.get(f_base + 'download') + if not f_path: + return + l.append({ + id_field: id_value, + 'url': base_url + f_path, + 'width': int_or_none(photo_data.get(f_base + 'width')), + 'height': int_or_none(photo_data.get(f_base + 'height')), + 'filesize': int_or_none(photo_data.get(f_base + 'size')), + }) + + for f in ('mobile_high', 'medium', 'hd', '1080p', '4k'): + add_common_info_to_list(formats, 'video_%s_', 'format_id', f) + + thumbnails = [] + for t in ('quad16', 'quad50', 'quad75', 'quad100', 'small', 'portrait', 'standard', 'medium', 'large', 'original'): + add_common_info_to_list(thumbnails, '%s_', 'id', t) + + return { + 'id': photo_id, + 'title': title, + 'timestamp': int_or_none(photo_data.get('creation_date_epoch')), + 'duration': int_or_none(photo_data.get('video_length')), + 'view_count': int_or_none(photo_data.get('view_count')), + 'comment_count': int_or_none(photo_data.get('number_of_comments')), + 'uploader_id': photo_data.get('user_id'), + 'uploader': photo_data.get('display_name'), + 'thumbnails': thumbnails, + 'formats': formats, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/twentytwotracks.py youtube-dl-2019.05.20/youtube_dl/extractor/twentytwotracks.py --- youtube-dl-2016.02.22/youtube_dl/extractor/twentytwotracks.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/twentytwotracks.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,86 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import int_or_none - -# 22Tracks regularly replace the audio tracks that can be streamed on their -# site. The tracks usually expire after 1 months, so we can't add tests. 
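Deleting this module is only part of the removal: the two classes below must also be dropped from the central extractor registry (youtube_dl/extractor/extractors.py in the 2019 tree), which happens elsewhere in this diff. The registry entry, reconstructed here for illustration only, was an import of the form:

from .twentytwotracks import (
    TwentyTwoTracksIE,
    TwentyTwoTracksGenreIE,
)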
- - -class TwentyTwoTracksIE(InfoExtractor): - _VALID_URL = r'https?://22tracks\.com/(?P<city>[a-z]+)/(?P<genre>[\da-z]+)/(?P<id>\d+)' - IE_NAME = '22tracks:track' - - _API_BASE = 'http://22tracks.com/api' - - def _extract_info(self, city, genre_name, track_id=None): - item_id = track_id if track_id else genre_name - - cities = self._download_json( - '%s/cities' % self._API_BASE, item_id, - 'Downloading cities info', - 'Unable to download cities info') - city_id = [x['id'] for x in cities if x['slug'] == city][0] - - genres = self._download_json( - '%s/genres/%s' % (self._API_BASE, city_id), item_id, - 'Downloading %s genres info' % city, - 'Unable to download %s genres info' % city) - genre = [x for x in genres if x['slug'] == genre_name][0] - genre_id = genre['id'] - - tracks = self._download_json( - '%s/tracks/%s' % (self._API_BASE, genre_id), item_id, - 'Downloading %s genre tracks info' % genre_name, - 'Unable to download track info') - - return [x for x in tracks if x['id'] == item_id][0] if track_id else [genre['title'], tracks] - - def _get_track_url(self, filename, track_id): - token = self._download_json( - 'http://22tracks.com/token.php?desktop=true&u=/128/%s' % filename, - track_id, 'Downloading token', 'Unable to download token') - return 'http://audio.22tracks.com%s?st=%s&e=%d' % (token['filename'], token['st'], token['e']) - - def _extract_track_info(self, track_info, track_id): - download_url = self._get_track_url(track_info['filename'], track_id) - title = '%s - %s' % (track_info['artist'].strip(), track_info['title'].strip()) - return { - 'id': track_id, - 'url': download_url, - 'ext': 'mp3', - 'title': title, - 'duration': int_or_none(track_info.get('duration')), - 'timestamp': int_or_none(track_info.get('published_at') or track_info.get('created')) - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - - city = mobj.group('city') - genre = mobj.group('genre') - track_id = mobj.group('id') - - track_info = self._extract_info(city, genre, track_id) - return self._extract_track_info(track_info, track_id) - - -class TwentyTwoTracksGenreIE(TwentyTwoTracksIE): - _VALID_URL = r'https?://22tracks\.com/(?P<city>[a-z]+)/(?P<genre>[\da-z]+)/?$' - IE_NAME = '22tracks:genre' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - - city = mobj.group('city') - genre = mobj.group('genre') - - genre_title, tracks = self._extract_info(city, genre) - - entries = [ - self._extract_track_info(track_info, track_info['id']) - for track_info in tracks] - - return self.playlist_result(entries, genre, genre_title) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/twitcasting.py youtube-dl-2019.05.20/youtube_dl/extractor/twitcasting.py --- youtube-dl-2016.02.22/youtube_dl/extractor/twitcasting.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/twitcasting.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,81 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import urlencode_postdata + +import re + + +class TwitCastingIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P<uploader_id>[^/]+)/movie/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://twitcasting.tv/ivetesangalo/movie/2357609', + 'md5': '745243cad58c4681dc752490f7540d7f', + 'info_dict': { + 'id': '2357609', + 'ext': 'mp4', + 'title': 'Live #2357609', + 'uploader_id': 'ivetesangalo', + 'description': "Moi! 
I'm live on TwitCasting from my iPhone.", + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://twitcasting.tv/mttbernardini/movie/3689740', + 'info_dict': { + 'id': '3689740', + 'ext': 'mp4', + 'title': 'Live playing something #3689740', + 'uploader_id': 'mttbernardini', + 'description': "I'm live on TwitCasting from my iPad. password: abc (Santa Marinella/Lazio, Italia)", + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': { + 'skip_download': True, + 'videopassword': 'abc', + }, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + uploader_id = mobj.group('uploader_id') + + video_password = self._downloader.params.get('videopassword') + request_data = None + if video_password: + request_data = urlencode_postdata({ + 'password': video_password, + }) + webpage = self._download_webpage(url, video_id, data=request_data) + + title = self._html_search_regex( + r'(?s)<[^>]+id=["\']movietitle[^>]+>(.+?)</', + webpage, 'title', default=None) or self._html_search_meta( + 'twitter:title', webpage, fatal=True) + + m3u8_url = self._search_regex( + (r'data-movie-url=(["\'])(?P<url>(?:(?!\1).)+)\1', + r'(["\'])(?P<url>http.+?\.m3u8.*?)\1'), + webpage, 'm3u8 url', group='url') + + formats = self._extract_m3u8_formats( + m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + + thumbnail = self._og_search_thumbnail(webpage) + description = self._og_search_description( + webpage, default=None) or self._html_search_meta( + 'twitter:description', webpage) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'uploader_id': uploader_id, + 'formats': formats, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/twitch.py youtube-dl-2019.05.20/youtube_dl/extractor/twitch.py --- youtube-dl-2016.02.22/youtube_dl/extractor/twitch.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/twitch.py 2019-06-07 18:49:07.000000000 +0000 @@ -4,31 +4,41 @@ import itertools import re import random +import json from .common import InfoExtractor from ..compat import ( + compat_kwargs, compat_parse_qs, compat_str, - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, - compat_urlparse, ) from ..utils import ( - encode_dict, + clean_html, ExtractorError, + float_or_none, int_or_none, + orderedSet, parse_duration, parse_iso8601, - sanitized_Request, + qualities, + try_get, + unified_timestamp, + update_url_query, + url_or_none, + urljoin, ) class TwitchBaseIE(InfoExtractor): - _VALID_URL_BASE = r'https?://(?:www\.)?twitch\.tv' + _VALID_URL_BASE = r'https?://(?:(?:www|go|m)\.)?twitch\.tv' _API_BASE = 'https://api.twitch.tv' - _USHER_BASE = 'http://usher.twitch.tv' - _LOGIN_URL = 'http://www.twitch.tv/login' + _USHER_BASE = 'https://usher.ttvnw.net' + _LOGIN_FORM_URL = 'https://www.twitch.tv/login' + _LOGIN_POST_URL = 'https://passport.twitch.tv/login' + _CLIENT_ID = 'kimne78kx3ncx6brgo4mv6wki5h1ko' _NETRC_MACHINE = 'twitch' def _handle_error(self, response): @@ -40,16 +50,13 @@ '%s returned error: %s - %s' % (self.IE_NAME, error, response.get('message')), expected=True) - def _download_json(self, url, video_id, note='Downloading JSON metadata'): - headers = { - 'Referer': 'http://api.twitch.tv/crossdomain/receiver.html?v=2', - 'X-Requested-With': 'XMLHttpRequest', - } - for cookie in self._downloader.cookiejar: - if cookie.name == 'api_token': - 
headers['Twitch-Api-Token'] = cookie.value - request = sanitized_Request(url, headers=headers) - response = super(TwitchBaseIE, self)._download_json(request, video_id, note) + def _call_api(self, path, item_id, *args, **kwargs): + headers = kwargs.get('headers', {}).copy() + headers['Client-ID'] = self._CLIENT_ID + kwargs['headers'] = headers + response = self._download_json( + '%s/%s' % (self._API_BASE, path), item_id, + *args, **compat_kwargs(kwargs)) self._handle_error(response) return response @@ -57,64 +64,97 @@ self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return + def fail(message): + raise ExtractorError( + 'Unable to login. Twitch said: %s' % message, expected=True) + + def login_step(page, urlh, note, data): + form = self._hidden_inputs(page) + form.update(data) + + page_url = urlh.geturl() + post_url = self._search_regex( + r'<form[^>]+action=(["\'])(?P<url>.+?)\1', page, + 'post url', default=self._LOGIN_POST_URL, group='url') + post_url = urljoin(page_url, post_url) + + headers = { + 'Referer': page_url, + 'Origin': page_url, + 'Content-Type': 'text/plain;charset=UTF-8', + } + + response = self._download_json( + post_url, None, note, data=json.dumps(form).encode(), + headers=headers, expected_status=400) + error = response.get('error_description') or response.get('error_code') + if error: + fail(error) + + if 'Authenticated successfully' in response.get('message', ''): + return None, None + + redirect_url = urljoin( + post_url, + response.get('redirect') or response['redirect_path']) + return self._download_webpage_handle( + redirect_url, None, 'Downloading login redirect page', + headers=headers) + login_page, handle = self._download_webpage_handle( - self._LOGIN_URL, None, 'Downloading login page') + self._LOGIN_FORM_URL, None, 'Downloading login page') - login_form = self._hidden_inputs(login_page) + # Some TOR nodes and public proxies are blocked completely + if 'blacklist_message' in login_page: + fail(clean_html(login_page)) + + redirect_page, handle = login_step( + login_page, handle, 'Logging in', { + 'username': username, + 'password': password, + 'client_id': self._CLIENT_ID, + }) - login_form.update({ - 'username': username, - 'password': password, - }) - - redirect_url = handle.geturl() - - post_url = self._search_regex( - r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, - 'post url', default=redirect_url, group='url') - - if not post_url.startswith('http'): - post_url = compat_urlparse.urljoin(redirect_url, post_url) - - request = sanitized_Request( - post_url, compat_urllib_parse.urlencode(encode_dict(login_form)).encode('utf-8')) - request.add_header('Referer', redirect_url) - response = self._download_webpage( - request, None, 'Logging in as %s' % username) - - error_message = self._search_regex( - r'<div[^>]+class="subwindow_notice"[^>]*>([^<]+)</div>', - response, 'error message', default=None) - if error_message: - raise ExtractorError( - 'Unable to login. 
Twitch said: %s' % error_message, expected=True) + # Successful login + if not redirect_page: + return - if '>Reset your password<' in response: - self.report_warning('Twitch asks you to reset your password, go to https://secure.twitch.tv/reset/submit') + if re.search(r'(?i)<form[^>]+id="two-factor-submit"', redirect_page) is not None: + # TODO: Add mechanism to request an SMS or phone call + tfa_token = self._get_tfa_info('two-factor authentication token') + login_step(redirect_page, handle, 'Submitting TFA token', { + 'authy_token': tfa_token, + 'remember_2fa': 'true', + }) def _prefer_source(self, formats): try: source = next(f for f in formats if f['format_id'] == 'Source') - source['preference'] = 10 + source['quality'] = 10 except StopIteration: - pass # No Source stream present + for f in formats: + if '/chunked/' in f['url']: + f.update({ + 'quality': 10, + 'format_note': 'Source', + }) self._sort_formats(formats) class TwitchItemBaseIE(TwitchBaseIE): def _download_info(self, item, item_id): - return self._extract_info(self._download_json( - '%s/kraken/videos/%s%s' % (self._API_BASE, item, item_id), item_id, + return self._extract_info(self._call_api( + 'kraken/videos/%s%s' % (item, item_id), item_id, 'Downloading %s info JSON' % self._ITEM_TYPE)) def _extract_media(self, item_id): info = self._download_info(self._ITEM_SHORTCUT, item_id) - response = self._download_json( - '%s/api/videos/%s%s' % (self._API_BASE, self._ITEM_SHORTCUT, item_id), item_id, + response = self._call_api( + 'api/videos/%s%s' % (self._ITEM_SHORTCUT, item_id), item_id, 'Downloading %s playlist JSON' % self._ITEM_TYPE) entries = [] chunks = response['chunks'] @@ -141,6 +181,13 @@ return self.playlist_result(entries, info['id'], info['title']) def _extract_info(self, info): + status = info.get('status') + if status == 'recording': + is_live = True + elif status == 'recorded': + is_live = False + else: + is_live = None return { 'id': info['_id'], 'title': info.get('title') or 'Untitled Broadcast', @@ -151,6 +198,7 @@ 'uploader_id': info.get('channel', {}).get('name'), 'timestamp': parse_iso8601(info.get('recorded_at')), 'view_count': int_or_none(info.get('views')), + 'is_live': is_live, } def _real_extract(self, url): @@ -170,6 +218,7 @@ 'title': 'Worlds Semifinals - Star Horn Royal Club vs. 
OMG', }, 'playlist_mincount': 12, + 'skip': 'HTTP Error 404: Not Found', } @@ -186,6 +235,7 @@ 'title': 'ACRL Off Season - Sports Cars @ Nordschleife', }, 'playlist_mincount': 3, + 'skip': 'HTTP Error 404: Not Found', }, { 'url': 'http://www.twitch.tv/tsm_theoddone/c/2349361', 'only_matching': True, @@ -194,7 +244,14 @@ class TwitchVodIE(TwitchItemBaseIE): IE_NAME = 'twitch:vod' - _VALID_URL = r'%s/[^/]+/v/(?P<id>\d+)' % TwitchBaseIE._VALID_URL_BASE + _VALID_URL = r'''(?x) + https?:// + (?: + (?:(?:www|go|m)\.)?twitch\.tv/(?:[^/]+/v(?:ideo)?|videos)/| + player\.twitch\.tv/\?.*?\bvideo=v + ) + (?P<id>\d+) + ''' _ITEM_TYPE = 'vod' _ITEM_SHORTCUT = 'v' @@ -204,7 +261,7 @@ 'id': 'v6528877', 'ext': 'mp4', 'title': 'LCK Summer Split - Week 6 Day 1', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 17208, 'timestamp': 1435131709, 'upload_date': '20150624', @@ -224,7 +281,7 @@ 'id': 'v11230755', 'ext': 'mp4', 'title': 'Untitled Broadcast', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 1638, 'timestamp': 1439746708, 'upload_date': '20150816', @@ -236,27 +293,41 @@ # m3u8 download 'skip_download': True, }, + 'skip': 'HTTP Error 404: Not Found', + }, { + 'url': 'http://player.twitch.tv/?t=5m10s&video=v6528877', + 'only_matching': True, + }, { + 'url': 'https://www.twitch.tv/videos/6528877', + 'only_matching': True, + }, { + 'url': 'https://m.twitch.tv/beagsandjam/v/247478721', + 'only_matching': True, + }, { + 'url': 'https://www.twitch.tv/northernlion/video/291940395', + 'only_matching': True, }] def _real_extract(self, url): item_id = self._match_id(url) info = self._download_info(self._ITEM_SHORTCUT, item_id) - access_token = self._download_json( - '%s/api/vods/%s/access_token' % (self._API_BASE, item_id), item_id, + access_token = self._call_api( + 'api/vods/%s/access_token' % item_id, item_id, 'Downloading %s access token' % self._ITEM_TYPE) formats = self._extract_m3u8_formats( '%s/vod/%s?%s' % ( self._USHER_BASE, item_id, - compat_urllib_parse.urlencode({ + compat_urllib_parse_urlencode({ 'allow_source': 'true', + 'allow_audio_only': 'true', 'allow_spectre': 'true', 'player': 'twitchweb', 'nauth': access_token['token'], 'nauthsig': access_token['sig'], })), - item_id, 'mp4') + item_id, 'mp4', entry_protocol='m3u8_native') self._prefer_source(formats) info['formats'] = formats @@ -266,34 +337,73 @@ if 't' in query: info['start_time'] = parse_duration(query['t'][0]) + if info.get('timestamp') is not None: + info['subtitles'] = { + 'rechat': [{ + 'url': update_url_query( + 'https://rechat.twitch.tv/rechat-messages', { + 'video_id': 'v%s' % item_id, + 'start': info['timestamp'], + }), + 'ext': 'json', + }], + } + return info class TwitchPlaylistBaseIE(TwitchBaseIE): - _PLAYLIST_URL = '%s/kraken/channels/%%s/videos/?offset=%%d&limit=%%d' % TwitchBaseIE._API_BASE + _PLAYLIST_PATH = 'kraken/channels/%s/videos/?offset=%d&limit=%d' _PAGE_LIMIT = 100 def _extract_playlist(self, channel_id): - info = self._download_json( - '%s/kraken/channels/%s' % (self._API_BASE, channel_id), + info = self._call_api( + 'kraken/channels/%s' % channel_id, channel_id, 'Downloading channel info JSON') channel_name = info.get('display_name') or info.get('name') entries = [] offset = 0 limit = self._PAGE_LIMIT + broken_paging_detected = False + counter_override = None for counter in itertools.count(1): - response = self._download_json( - self._PLAYLIST_URL % (channel_id, offset, limit), - channel_id, 'Downloading %s videos JSON page 
%d' % (self._PLAYLIST_TYPE, counter)) + response = self._call_api( + self._PLAYLIST_PATH % (channel_id, offset, limit), + channel_id, + 'Downloading %s JSON page %s' + % (self._PLAYLIST_TYPE, counter_override or counter)) page_entries = self._extract_playlist_page(response) if not page_entries: break + total = int_or_none(response.get('_total')) + # Since the beginning of March 2016 twitch's paging mechanism + # is completely broken on the twitch side. It simply ignores + # a limit and returns the whole offset number of videos. + # Working around by just requesting all videos at once. + # Upd: pagination bug was fixed by twitch on 15.03.2016. + if not broken_paging_detected and total and len(page_entries) > limit: + self.report_warning( + 'Twitch pagination is broken on twitch side, requesting all videos at once', + channel_id) + broken_paging_detected = True + offset = total + counter_override = '(all at once)' + continue entries.extend(page_entries) + if broken_paging_detected or total and len(page_entries) >= total: + break offset += limit return self.playlist_result( - [self.url_result(entry) for entry in set(entries)], + [self._make_url_result(entry) for entry in orderedSet(entries)], channel_id, channel_name) + def _make_url_result(self, url): + try: + video_id = 'v%s' % TwitchVodIE._match_id(url) + return self.url_result(url, TwitchVodIE.ie_key(), video_id=video_id) + except AssertionError: + return self.url_result(url) + def _extract_playlist_page(self, response): videos = response.get('videos') return [video['url'] for video in videos] if videos else [] @@ -307,60 +417,110 @@ _VALID_URL = r'%s/(?P<id>[^/]+)/profile/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE _PLAYLIST_TYPE = 'profile' - _TEST = { + _TESTS = [{ 'url': 'http://www.twitch.tv/vanillatv/profile', 'info_dict': { 'id': 'vanillatv', 'title': 'VanillaTV', }, 'playlist_mincount': 412, - } + }, { + 'url': 'http://m.twitch.tv/vanillatv/profile', + 'only_matching': True, + }] -class TwitchPastBroadcastsIE(TwitchPlaylistBaseIE): - IE_NAME = 'twitch:past_broadcasts' - _VALID_URL = r'%s/(?P<id>[^/]+)/profile/past_broadcasts/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE - _PLAYLIST_URL = TwitchPlaylistBaseIE._PLAYLIST_URL + '&broadcasts=true' - _PLAYLIST_TYPE = 'past broadcasts' +class TwitchVideosBaseIE(TwitchPlaylistBaseIE): + _VALID_URL_VIDEOS_BASE = r'%s/(?P<id>[^/]+)/videos' % TwitchBaseIE._VALID_URL_BASE + _PLAYLIST_PATH = TwitchPlaylistBaseIE._PLAYLIST_PATH + '&broadcast_type=' - _TEST = { - 'url': 'http://www.twitch.tv/spamfish/profile/past_broadcasts', + +class TwitchAllVideosIE(TwitchVideosBaseIE): + IE_NAME = 'twitch:videos:all' + _VALID_URL = r'%s/all' % TwitchVideosBaseIE._VALID_URL_VIDEOS_BASE + _PLAYLIST_PATH = TwitchVideosBaseIE._PLAYLIST_PATH + 'archive,upload,highlight' + _PLAYLIST_TYPE = 'all videos' + + _TESTS = [{ + 'url': 'https://www.twitch.tv/spamfish/videos/all', 'info_dict': { 'id': 'spamfish', 'title': 'Spamfish', }, - 'playlist_mincount': 54, - } + 'playlist_mincount': 869, + }, { + 'url': 'https://m.twitch.tv/spamfish/videos/all', + 'only_matching': True, + }] -class TwitchBookmarksIE(TwitchPlaylistBaseIE): - IE_NAME = 'twitch:bookmarks' - _VALID_URL = r'%s/(?P<id>[^/]+)/profile/bookmarks/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE - _PLAYLIST_URL = '%s/api/bookmark/?user=%%s&offset=%%d&limit=%%d' % TwitchBaseIE._API_BASE - _PLAYLIST_TYPE = 'bookmarks' +class TwitchUploadsIE(TwitchVideosBaseIE): + IE_NAME = 'twitch:videos:uploads' + _VALID_URL = r'%s/uploads' % TwitchVideosBaseIE._VALID_URL_VIDEOS_BASE + 
_PLAYLIST_PATH = TwitchVideosBaseIE._PLAYLIST_PATH + 'upload' + _PLAYLIST_TYPE = 'uploads' - _TEST = { - 'url': 'http://www.twitch.tv/ognos/profile/bookmarks', + _TESTS = [{ + 'url': 'https://www.twitch.tv/spamfish/videos/uploads', 'info_dict': { - 'id': 'ognos', - 'title': 'Ognos', + 'id': 'spamfish', + 'title': 'Spamfish', }, - 'playlist_mincount': 3, - } + 'playlist_mincount': 0, + }, { + 'url': 'https://m.twitch.tv/spamfish/videos/uploads', + 'only_matching': True, + }] + + +class TwitchPastBroadcastsIE(TwitchVideosBaseIE): + IE_NAME = 'twitch:videos:past-broadcasts' + _VALID_URL = r'%s/past-broadcasts' % TwitchVideosBaseIE._VALID_URL_VIDEOS_BASE + _PLAYLIST_PATH = TwitchVideosBaseIE._PLAYLIST_PATH + 'archive' + _PLAYLIST_TYPE = 'past broadcasts' + + _TESTS = [{ + 'url': 'https://www.twitch.tv/spamfish/videos/past-broadcasts', + 'info_dict': { + 'id': 'spamfish', + 'title': 'Spamfish', + }, + 'playlist_mincount': 0, + }, { + 'url': 'https://m.twitch.tv/spamfish/videos/past-broadcasts', + 'only_matching': True, + }] - def _extract_playlist_page(self, response): - entries = [] - for bookmark in response.get('bookmarks', []): - video = bookmark.get('video') - if not video: - continue - entries.append(video['url']) - return entries + +class TwitchHighlightsIE(TwitchVideosBaseIE): + IE_NAME = 'twitch:videos:highlights' + _VALID_URL = r'%s/highlights' % TwitchVideosBaseIE._VALID_URL_VIDEOS_BASE + _PLAYLIST_PATH = TwitchVideosBaseIE._PLAYLIST_PATH + 'highlight' + _PLAYLIST_TYPE = 'highlights' + + _TESTS = [{ + 'url': 'https://www.twitch.tv/spamfish/videos/highlights', + 'info_dict': { + 'id': 'spamfish', + 'title': 'Spamfish', + }, + 'playlist_mincount': 805, + }, { + 'url': 'https://m.twitch.tv/spamfish/videos/highlights', + 'only_matching': True, + }] class TwitchStreamIE(TwitchBaseIE): IE_NAME = 'twitch:stream' - _VALID_URL = r'%s/(?P<id>[^/#?]+)/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE + _VALID_URL = r'''(?x) + https?:// + (?: + (?:(?:www|go|m)\.)?twitch\.tv/| + player\.twitch\.tv/\?.*?\bchannel= + ) + (?P<id>[^/#?]+) + ''' _TESTS = [{ 'url': 'http://www.twitch.tv/shroomztv', @@ -384,20 +544,41 @@ }, { 'url': 'http://www.twitch.tv/miracle_doto#profile-0', 'only_matching': True, + }, { + 'url': 'https://player.twitch.tv/?channel=lotsofs', + 'only_matching': True, + }, { + 'url': 'https://go.twitch.tv/food', + 'only_matching': True, + }, { + 'url': 'https://m.twitch.tv/food', + 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return (False + if any(ie.suitable(url) for ie in ( + TwitchVideoIE, + TwitchChapterIE, + TwitchVodIE, + TwitchProfileIE, + TwitchAllVideosIE, + TwitchUploadsIE, + TwitchPastBroadcastsIE, + TwitchHighlightsIE, + TwitchClipsIE)) + else super(TwitchStreamIE, cls).suitable(url)) + def _real_extract(self, url): channel_id = self._match_id(url) - stream = self._download_json( - '%s/kraken/streams/%s' % (self._API_BASE, channel_id), channel_id, + stream = self._call_api( + 'kraken/streams/%s?stream_type=all' % channel_id, channel_id, 'Downloading stream JSON').get('stream') - # Fallback on profile extraction if stream is offline if not stream: - return self.url_result( - 'http://www.twitch.tv/%s/profile' % channel_id, - 'TwitchProfile', channel_id) + raise ExtractorError('%s is offline' % channel_id, expected=True) # Channel name may be typed if different case than the original channel name # (e.g. http://www.twitch.tv/TWITCHPLAYSPOKEMON) that will lead to constructing @@ -405,12 +586,14 @@ # JSON and fallback to lowercase if it's not available. 
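        # (the corrected name also feeds the case-sensitive usher
        # '/api/channel/hls/%s.m3u8' URL built below)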
channel_id = stream.get('channel', {}).get('name') or channel_id.lower() - access_token = self._download_json( - '%s/api/channels/%s/access_token' % (self._API_BASE, channel_id), channel_id, + access_token = self._call_api( + 'api/channels/%s/access_token' % channel_id, channel_id, 'Downloading channel access token') query = { 'allow_source': 'true', + 'allow_audio_only': 'true', + 'allow_spectre': 'true', 'p': random.randint(1000000, 10000000), 'player': 'twitchweb', 'segment_preference': '4', @@ -419,7 +602,7 @@ } formats = self._extract_m3u8_formats( '%s/api/channel/hls/%s.m3u8?%s' - % (self._USHER_BASE, channel_id, compat_urllib_parse.urlencode(query)), + % (self._USHER_BASE, channel_id, compat_urllib_parse_urlencode(query)), channel_id, 'mp4') self._prefer_source(formats) @@ -454,3 +637,95 @@ 'formats': formats, 'is_live': True, } + + +class TwitchClipsIE(TwitchBaseIE): + IE_NAME = 'twitch:clips' + _VALID_URL = r'https?://(?:clips\.twitch\.tv/(?:[^/]+/)*|(?:www\.)?twitch\.tv/[^/]+/clip/)(?P<id>[^/?#&]+)' + + _TESTS = [{ + 'url': 'https://clips.twitch.tv/FaintLightGullWholeWheat', + 'md5': '761769e1eafce0ffebfb4089cb3847cd', + 'info_dict': { + 'id': '42850523', + 'ext': 'mp4', + 'title': 'EA Play 2016 Live from the Novo Theatre', + 'thumbnail': r're:^https?://.*\.jpg', + 'timestamp': 1465767393, + 'upload_date': '20160612', + 'creator': 'EA', + 'uploader': 'stereotype_', + 'uploader_id': '43566419', + }, + }, { + # multiple formats + 'url': 'https://clips.twitch.tv/rflegendary/UninterestedBeeDAESuppy', + 'only_matching': True, + }, { + 'url': 'https://www.twitch.tv/sergeynixon/clip/StormyThankfulSproutFutureMan', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + status = self._download_json( + 'https://clips.twitch.tv/api/v2/clips/%s/status' % video_id, + video_id) + + formats = [] + + for option in status['quality_options']: + if not isinstance(option, dict): + continue + source = url_or_none(option.get('source')) + if not source: + continue + formats.append({ + 'url': source, + 'format_id': option.get('quality'), + 'height': int_or_none(option.get('quality')), + 'fps': int_or_none(option.get('frame_rate')), + }) + + self._sort_formats(formats) + + info = { + 'formats': formats, + } + + clip = self._call_api( + 'kraken/clips/%s' % video_id, video_id, fatal=False, headers={ + 'Accept': 'application/vnd.twitchtv.v5+json', + }) + + if clip: + quality_key = qualities(('tiny', 'small', 'medium')) + thumbnails = [] + thumbnails_dict = clip.get('thumbnails') + if isinstance(thumbnails_dict, dict): + for thumbnail_id, thumbnail_url in thumbnails_dict.items(): + thumbnails.append({ + 'id': thumbnail_id, + 'url': thumbnail_url, + 'preference': quality_key(thumbnail_id), + }) + + info.update({ + 'id': clip.get('tracking_id') or video_id, + 'title': clip.get('title') or video_id, + 'duration': float_or_none(clip.get('duration')), + 'views': int_or_none(clip.get('views')), + 'timestamp': unified_timestamp(clip.get('created_at')), + 'thumbnails': thumbnails, + 'creator': try_get(clip, lambda x: x['broadcaster']['display_name'], compat_str), + 'uploader': try_get(clip, lambda x: x['curator']['display_name'], compat_str), + 'uploader_id': try_get(clip, lambda x: x['curator']['id'], compat_str), + }) + else: + info.update({ + 'title': video_id, + 'id': video_id, + }) + + return info diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/twitter.py youtube-dl-2019.05.20/youtube_dl/extractor/twitter.py --- 
youtube-dl-2016.02.22/youtube_dl/extractor/twitter.py 2016-02-22 10:57:19.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/twitter.py 2019-06-07 18:49:07.000000000 +0000 @@ -4,25 +4,46 @@ import re from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( + determine_ext, + dict_get, + ExtractorError, float_or_none, - xpath_text, - remove_end, int_or_none, - ExtractorError, - sanitized_Request, + remove_end, + try_get, + xpath_text, ) +from .periscope import PeriscopeIE + class TwitterBaseIE(InfoExtractor): - def _get_vmap_video_url(self, vmap_url, video_id): + def _extract_formats_from_vmap_url(self, vmap_url, video_id): vmap_data = self._download_xml(vmap_url, video_id) - return xpath_text(vmap_data, './/MediaFile').strip() + video_url = xpath_text(vmap_data, './/MediaFile').strip() + if determine_ext(video_url) == 'm3u8': + return self._extract_m3u8_formats( + video_url, video_id, ext='mp4', m3u8_id='hls', + entry_protocol='m3u8_native') + return [{ + 'url': video_url, + }] + + @staticmethod + def _search_dimensions_in_video_url(a_format, video_url): + m = re.search(r'/(?P<width>\d+)x(?P<height>\d+)/', video_url) + if m: + a_format.update({ + 'width': int(m.group('width')), + 'height': int(m.group('height')), + }) class TwitterCardIE(TwitterBaseIE): IE_NAME = 'twitter:card' - _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/cards/tfw/v1/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/(?P<path>cards/tfw/v1|videos(?:/tweet)?)/(?P<id>\d+)' _TESTS = [ { 'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889', @@ -30,10 +51,10 @@ 'info_dict': { 'id': '560070183650213889', 'ext': 'mp4', - 'title': 'TwitterCard', - 'thumbnail': 're:^https?://.*\.jpg$', + 'title': 'Twitter web player', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 30.033, - } + }, }, { 'url': 'https://twitter.com/i/cards/tfw/v1/623160978427936768', @@ -41,19 +62,18 @@ 'info_dict': { 'id': '623160978427936768', 'ext': 'mp4', - 'title': 'TwitterCard', - 'thumbnail': 're:^https?://.*\.jpg', - 'duration': 80.155, + 'title': 'Twitter web player', + 'thumbnail': r're:^https?://.*$', }, }, { 'url': 'https://twitter.com/i/cards/tfw/v1/654001591733886977', - 'md5': 'd4724ffe6d2437886d004fa5de1043b3', + 'md5': 'b6d9683dd3f48e340ded81c0e917ad46', 'info_dict': { 'id': 'dq4Oj5quskI', 'ext': 'mp4', 'title': 'Ubuntu 11.10 Overview', - 'description': 'Take a quick peek at what\'s new and improved in Ubuntu 11.10.\n\nOnce installed take a look at 10 Things to Do After Installing: http://www.omgubuntu.co.uk/2011/10/10-things-to-do-after-installing-ubuntu-11-10/', + 'description': 'md5:a831e97fa384863d6e26ce48d1c43376', 'upload_date': '20111013', 'uploader': 'OMG! 
Ubuntu!', 'uploader_id': 'omgubuntu', @@ -62,7 +82,7 @@ }, { 'url': 'https://twitter.com/i/cards/tfw/v1/665289828897005568', - 'md5': 'ab2745d0b0ce53319a534fccaa986439', + 'md5': '6dabeaca9e68cbb71c99c322a4b42a11', 'info_dict': { 'id': 'iBb2x00UVlv', 'ext': 'mp4', @@ -70,65 +90,188 @@ 'uploader_id': '1189339351084113920', 'uploader': 'ArsenalTerje', 'title': 'Vine by ArsenalTerje', + 'timestamp': 1447451307, }, 'add_ie': ['Vine'], - } + }, { + 'url': 'https://twitter.com/i/videos/tweet/705235433198714880', + 'md5': '884812a2adc8aaf6fe52b15ccbfa3b88', + 'info_dict': { + 'id': '705235433198714880', + 'ext': 'mp4', + 'title': 'Twitter web player', + 'thumbnail': r're:^https?://.*', + }, + }, { + 'url': 'https://twitter.com/i/videos/752274308186120192', + 'only_matching': True, + }, ] - def _real_extract(self, url): - video_id = self._match_id(url) + _API_BASE = 'https://api.twitter.com/1.1' - # Different formats served for different User-Agents - USER_AGENTS = [ - 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)', # mp4 - 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0', # webm - ] + def _parse_media_info(self, media_info, video_id): + formats = [] + for media_variant in media_info.get('variants', []): + media_url = media_variant['url'] + if media_url.endswith('.m3u8'): + formats.extend(self._extract_m3u8_formats(media_url, video_id, ext='mp4', m3u8_id='hls')) + elif media_url.endswith('.mpd'): + formats.extend(self._extract_mpd_formats(media_url, video_id, mpd_id='dash')) + else: + tbr = int_or_none(dict_get(media_variant, ('bitRate', 'bitrate')), scale=1000) + a_format = { + 'url': media_url, + 'format_id': 'http-%d' % tbr if tbr else 'http', + 'tbr': tbr, + } + # Reported bitRate may be zero + if not a_format['tbr']: + del a_format['tbr'] + + self._search_dimensions_in_video_url(a_format, media_url) + + formats.append(a_format) + return formats + + def _extract_mobile_formats(self, username, video_id): + webpage = self._download_webpage( + 'https://mobile.twitter.com/%s/status/%s' % (username, video_id), + video_id, 'Downloading mobile webpage', + headers={ + # A recent mobile UA is necessary for `gt` cookie + 'User-Agent': 'Mozilla/5.0 (Android 6.0.1; Mobile; rv:54.0) Gecko/54.0 Firefox/54.0', + }) + main_script_url = self._html_search_regex( + r'<script[^>]+src="([^"]+main\.[^"]+)"', webpage, 'main script URL') + main_script = self._download_webpage( + main_script_url, video_id, 'Downloading main script') + bearer_token = self._search_regex( + r'BEARER_TOKEN\s*:\s*"([^"]+)"', + main_script, 'bearer token') + # https://developer.twitter.com/en/docs/tweets/post-and-engage/api-reference/get-statuses-show-id + api_data = self._download_json( + '%s/statuses/show/%s.json' % (self._API_BASE, video_id), + video_id, 'Downloading API data', + headers={ + 'Authorization': 'Bearer ' + bearer_token, + }) + media_info = try_get(api_data, lambda o: o['extended_entities']['media'][0]['video_info']) or {} + return self._parse_media_info(media_info, video_id) + + def _real_extract(self, url): + path, video_id = re.search(self._VALID_URL, url).groups() config = None formats = [] - for user_agent in USER_AGENTS: - request = sanitized_Request(url) - request.add_header('User-Agent', user_agent) - webpage = self._download_webpage(request, video_id) + duration = None + + urls = [url] + if path.startswith('cards/'): + urls.append('https://twitter.com/i/videos/' + video_id) + + for u in urls: + webpage = self._download_webpage( + u, video_id, 
headers={'Referer': 'https://twitter.com/'}) iframe_url = self._html_search_regex( - r'<iframe[^>]+src="((?:https?:)?//(?:www.youtube.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"', + r'<iframe[^>]+src="((?:https?:)?//(?:www\.youtube\.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"', webpage, 'video iframe', default=None) if iframe_url: return self.url_result(iframe_url) config = self._parse_json(self._html_search_regex( - r'data-player-config="([^"]+)"', webpage, 'data player config'), + r'data-(?:player-)?config="([^"]+)"', webpage, + 'data player config', default='{}'), video_id) - if 'playlist' not in config: - if 'vmapUrl' in config: - formats.append({ - 'url': self._get_vmap_video_url(config['vmapUrl'], video_id), - }) - break # same video regardless of UA - continue - video_url = config['playlist'][0]['source'] + if config.get('source_type') == 'vine': + return self.url_result(config['player_url'], 'Vine') - f = { - 'url': video_url, + periscope_url = PeriscopeIE._extract_url(webpage) + if periscope_url: + return self.url_result(periscope_url, PeriscopeIE.ie_key()) + + video_url = config.get('video_url') or config.get('playlist', [{}])[0].get('source') + + if video_url: + if determine_ext(video_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats(video_url, video_id, ext='mp4', m3u8_id='hls')) + else: + f = { + 'url': video_url, + } + + self._search_dimensions_in_video_url(f, video_url) + + formats.append(f) + + vmap_url = config.get('vmapUrl') or config.get('vmap_url') + if vmap_url: + formats.extend( + self._extract_formats_from_vmap_url(vmap_url, video_id)) + + media_info = None + + for entity in config.get('status', {}).get('entities', []): + if 'mediaInfo' in entity: + media_info = entity['mediaInfo'] + + if media_info: + formats.extend(self._parse_media_info(media_info, video_id)) + duration = float_or_none(media_info.get('duration', {}).get('nanos'), scale=1e9) + + username = config.get('user', {}).get('screen_name') + if username: + formats.extend(self._extract_mobile_formats(username, video_id)) + + if formats: + title = self._search_regex(r'<title>([^<]+)</title>', webpage, 'title') + thumbnail = config.get('posterImageUrl') or config.get('image_src') + duration = float_or_none(config.get('duration'), scale=1000) or duration + break + + if not formats: + headers = { + 'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw', + 'Referer': url, + } + ct0 = self._get_cookies(url).get('ct0') + if ct0: + headers['csrf_token'] = ct0.value + guest_token = self._download_json( + '%s/guest/activate.json' % self._API_BASE, video_id, + 'Downloading guest token', data=b'', + headers=headers)['guest_token'] + headers['x-guest-token'] = guest_token + self._set_cookie('api.twitter.com', 'gt', guest_token) + config = self._download_json( + '%s/videos/tweet/config/%s.json' % (self._API_BASE, video_id), + video_id, headers=headers) + track = config['track'] + vmap_url = track.get('vmapUrl') + if vmap_url: + formats = self._extract_formats_from_vmap_url(vmap_url, video_id) + else: + playback_url = track['playbackUrl'] + if determine_ext(playback_url) == 'm3u8': + formats = self._extract_m3u8_formats( + playback_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls') + else: + formats = [{ + 'url': playback_url, + }] + title = 'Twitter web player' + thumbnail = config.get('posterImage') + duration = float_or_none(track.get('durationMs'), scale=1000) - m = re.search(r'/(?P<width>\d+)x(?P<height>\d+)/', 
video_url) - if m: - f.update({ - 'width': int(m.group('width')), - 'height': int(m.group('height')), - }) - formats.append(f) + self._remove_duplicate_formats(formats) self._sort_formats(formats) - thumbnail = config.get('posterImageUrl') - duration = float_or_none(config.get('duration')) - return { 'id': video_id, - 'title': 'TwitterCard', + 'title': title, 'thumbnail': thumbnail, 'duration': duration, 'formats': formats, @@ -137,21 +280,21 @@ class TwitterIE(InfoExtractor): IE_NAME = 'twitter' - _VALID_URL = r'https?://(?:www\.|m\.|mobile\.)?twitter\.com/(?P<user_id>[^/]+)/status/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.|m\.|mobile\.)?twitter\.com/(?:i/web|(?P<user_id>[^/]+))/status/(?P<id>\d+)' _TEMPLATE_URL = 'https://twitter.com/%s/status/%s' + _TEMPLATE_STATUSES_URL = 'https://twitter.com/statuses/%s' _TESTS = [{ 'url': 'https://twitter.com/freethenipple/status/643211948184596480', - # MD5 checksums are different in different places 'info_dict': { 'id': '643211948184596480', 'ext': 'mp4', 'title': 'FREE THE NIPPLE - FTN supporters on Hollywood Blvd today!', - 'thumbnail': 're:^https?://.*\.jpg', - 'duration': 12.922, + 'thumbnail': r're:^https?://.*\.jpg', 'description': 'FREE THE NIPPLE on Twitter: "FTN supporters on Hollywood Blvd today! http://t.co/c7jHH749xJ"', 'uploader': 'FREE THE NIPPLE', 'uploader_id': 'freethenipple', + 'duration': 12.922, }, }, { 'url': 'https://twitter.com/giphz/status/657991469417025536/photo/1', @@ -161,14 +304,14 @@ 'ext': 'mp4', 'title': 'Gifs - tu vai cai tu vai cai tu nao eh capaz disso tu vai cai', 'description': 'Gifs on Twitter: "tu vai cai tu vai cai tu nao eh capaz disso tu vai cai https://t.co/tM46VHFlO5"', - 'thumbnail': 're:^https?://.*\.png', + 'thumbnail': r're:^https?://.*\.png', 'uploader': 'Gifs', 'uploader_id': 'giphz', }, 'expected_warnings': ['height', 'width'], + 'skip': 'Account suspended', }, { 'url': 'https://twitter.com/starwars/status/665052190608723968', - 'md5': '39b7199856dee6cd4432e72c74bc69d4', 'info_dict': { 'id': '665052190608723968', 'ext': 'mp4', @@ -177,6 +320,114 @@ 'uploader_id': 'starwars', 'uploader': 'Star Wars', }, + }, { + 'url': 'https://twitter.com/BTNBrentYarina/status/705235433198714880', + 'info_dict': { + 'id': '705235433198714880', + 'ext': 'mp4', + 'title': 'Brent Yarina - Khalil Iverson\'s missed highlight dunk. And made highlight dunk. In one highlight.', + 'description': 'Brent Yarina on Twitter: "Khalil Iverson\'s missed highlight dunk. And made highlight dunk. 
In one highlight."', + 'uploader_id': 'BTNBrentYarina', + 'uploader': 'Brent Yarina', + }, + 'params': { + # The same video as https://twitter.com/i/videos/tweet/705235433198714880 + # Test case of TwitterCardIE + 'skip_download': True, + }, + }, { + 'url': 'https://twitter.com/jaydingeer/status/700207533655363584', + 'info_dict': { + 'id': '700207533655363584', + 'ext': 'mp4', + 'title': 'JG - BEAT PROD: @suhmeduh #Damndaniel', + 'description': 'JG on Twitter: "BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"', + 'thumbnail': r're:^https?://.*\.jpg', + 'uploader': 'JG', + 'uploader_id': 'jaydingeer', + 'duration': 30.0, + }, + }, { + 'url': 'https://twitter.com/Filmdrunk/status/713801302971588609', + 'md5': '89a15ed345d13b86e9a5a5e051fa308a', + 'info_dict': { + 'id': 'MIOxnrUteUd', + 'ext': 'mp4', + 'title': 'Vince Mancini - Vine of the day', + 'description': 'Vince Mancini on Twitter: "Vine of the day https://t.co/xmTvRdqxWf"', + 'uploader': 'Vince Mancini', + 'uploader_id': 'Filmdrunk', + 'timestamp': 1402826626, + 'upload_date': '20140615', + }, + 'add_ie': ['Vine'], + }, { + 'url': 'https://twitter.com/captainamerica/status/719944021058060289', + 'info_dict': { + 'id': '719944021058060289', + 'ext': 'mp4', + 'title': 'Captain America - @King0fNerd Are you sure you made the right choice? Find out in theaters.', + 'description': 'Captain America on Twitter: "@King0fNerd Are you sure you made the right choice? Find out in theaters. https://t.co/GpgYi9xMJI"', + 'uploader_id': 'captainamerica', + 'uploader': 'Captain America', + 'duration': 3.17, + }, + }, { + 'url': 'https://twitter.com/OPP_HSD/status/779210622571536384', + 'info_dict': { + 'id': '1zqKVVlkqLaKB', + 'ext': 'mp4', + 'title': 'Sgt Kerry Schmidt - LIVE on #Periscope: Road rage, mischief, assault, rollover and fire in one occurrence', + 'description': 'Sgt Kerry Schmidt on Twitter: "LIVE on #Periscope: Road rage, mischief, assault, rollover and fire in one occurrence https://t.co/EKrVgIXF3s"', + 'upload_date': '20160923', + 'uploader_id': 'OPP_HSD', + 'uploader': 'Sgt Kerry Schmidt', + 'timestamp': 1474613214, + }, + 'add_ie': ['Periscope'], + }, { + # has mp4 formats via mobile API + 'url': 'https://twitter.com/news_al3alm/status/852138619213144067', + 'info_dict': { + 'id': '852138619213144067', + 'ext': 'mp4', + 'title': 'عالم الأخبار - كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة', + 'description': 'عالم الأخبار on Twitter: "كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة https://t.co/xg6OhpyKfN"', + 'uploader': 'عالم الأخبار', + 'uploader_id': 'news_al3alm', + 'duration': 277.4, + }, + }, { + 'url': 'https://twitter.com/i/web/status/910031516746514432', + 'info_dict': { + 'id': '910031516746514432', + 'ext': 'mp4', + 'title': 'Préfet de Guadeloupe - [Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. Réfugiez-vous dans la pièce la + sûre.', + 'thumbnail': r're:^https?://.*\.jpg', + 'description': 'Préfet de Guadeloupe on Twitter: "[Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. Réfugiez-vous dans la pièce la + sûre. 
https://t.co/mwx01Rs4lo"', + 'uploader': 'Préfet de Guadeloupe', + 'uploader_id': 'Prefet971', + 'duration': 47.48, + }, + 'params': { + 'skip_download': True, # requires ffmpeg + }, + }, { + # card via api.twitter.com/1.1/videos/tweet/config + 'url': 'https://twitter.com/LisPower1/status/1001551623938805763', + 'info_dict': { + 'id': '1001551623938805763', + 'ext': 'mp4', + 'title': 're:.*?Shep is on a roll today.*?', + 'thumbnail': r're:^https?://.*\.jpg', + 'description': 'md5:63b036c228772523ae1924d5f8e5ed6b', + 'uploader': 'Lis Power', + 'uploader_id': 'LisPower1', + 'duration': 111.278, + }, + 'params': { + 'skip_download': True, # requires ffmpeg + }, }] def _real_extract(self, url): @@ -184,7 +435,15 @@ user_id = mobj.group('user_id') twid = mobj.group('id') - webpage = self._download_webpage(self._TEMPLATE_URL % (user_id, twid), twid) + webpage, urlh = self._download_webpage_handle( + self._TEMPLATE_STATUSES_URL % twid, twid) + + if 'twitter.com/account/suspended' in urlh.geturl(): + raise ExtractorError('Account suspended by Twitter.', expected=True) + + if user_id is None: + mobj = re.match(self._VALID_URL, urlh.geturl()) + user_id = mobj.group('user_id') username = remove_end(self._og_search_title(webpage), ' on Twitter') @@ -201,17 +460,6 @@ 'title': username + ' - ' + title, } - card_id = self._search_regex( - r'["\']/i/cards/tfw/v1/(\d+)', webpage, 'twitter card url', default=None) - if card_id: - card_url = 'https://twitter.com/i/cards/tfw/v1/' + card_id - info.update({ - '_type': 'url_transparent', - 'ie_key': 'TwitterCard', - 'url': card_url, - }) - return info - mobj = re.search(r'''(?x) <video[^>]+class="animated-gif"(?P<more_info>[^>]+)>\s* <source[^>]+video-src="(?P<url>[^"]+)" @@ -234,12 +482,30 @@ }) return info + twitter_card_url = None + if 'class="PlayableMedia' in webpage: + twitter_card_url = '%s//twitter.com/i/videos/tweet/%s' % (self.http_scheme(), twid) + else: + twitter_card_iframe_url = self._search_regex( + r'data-full-card-iframe-url=([\'"])(?P<url>(?:(?!\1).)+)\1', + webpage, 'Twitter card iframe URL', default=None, group='url') + if twitter_card_iframe_url: + twitter_card_url = compat_urlparse.urljoin(url, twitter_card_iframe_url) + + if twitter_card_url: + info.update({ + '_type': 'url_transparent', + 'ie_key': 'TwitterCard', + 'url': twitter_card_url, + }) + return info + raise ExtractorError('There\'s no video in this tweet.') class TwitterAmplifyIE(TwitterBaseIE): IE_NAME = 'twitter:amplify' - _VALID_URL = 'https?://amp\.twimg\.com/v/(?P<id>[0-9a-f\-]{36})' + _VALID_URL = r'https?://amp\.twimg\.com/v/(?P<id>[0-9a-f\-]{36})' _TEST = { 'url': 'https://amp.twimg.com/v/0ba0c3c7-0af3-4c0a-bed5-7efd1ffa2951', @@ -258,7 +524,7 @@ vmap_url = self._html_search_meta( 'twitter:amplify:vmap', webpage, 'vmap url') - video_url = self._get_vmap_video_url(vmap_url, video_id) + formats = self._extract_formats_from_vmap_url(vmap_url, video_id) thumbnails = [] thumbnail = self._html_search_meta( @@ -280,11 +546,10 @@ }) video_w, video_h = _find_dimension('player') - formats = [{ - 'url': video_url, + formats[0].update({ 'width': video_w, 'height': video_h, - }] + }) return { 'id': video_id, diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/ubu.py youtube-dl-2019.05.20/youtube_dl/extractor/ubu.py --- youtube-dl-2016.02.22/youtube_dl/extractor/ubu.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/ubu.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,57 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - 
int_or_none, - qualities, -) - - -class UbuIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?ubu\.com/film/(?P<id>[\da-z_-]+)\.html' - _TEST = { - 'url': 'http://ubu.com/film/her_noise.html', - 'md5': '138d5652618bf0f03878978db9bef1ee', - 'info_dict': { - 'id': 'her_noise', - 'ext': 'm4v', - 'title': 'Her Noise - The Making Of (2007)', - 'duration': 3600, - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - title = self._html_search_regex( - r'<title>.+?Film &amp; Video: ([^<]+)</title>', webpage, 'title') - - duration = int_or_none(self._html_search_regex( - r'Duration: (\d+) minutes', webpage, 'duration', fatal=False), - invscale=60) - - formats = [] - FORMAT_REGEXES = [ - ('sq', r"'flashvars'\s*,\s*'file=([^']+)'"), - ('hq', r'href="(http://ubumexico\.centro\.org\.mx/video/[^"]+)"'), - ] - preference = qualities([fid for fid, _ in FORMAT_REGEXES]) - for format_id, format_regex in FORMAT_REGEXES: - m = re.search(format_regex, webpage) - if m: - formats.append({ - 'url': m.group(1), - 'format_id': format_id, - 'preference': preference(format_id), - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'duration': duration, - 'formats': formats, - } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/udemy.py youtube-dl-2019.05.20/youtube_dl/extractor/udemy.py --- youtube-dl-2016.02.22/youtube_dl/extractor/udemy.py 2016-01-09 00:15:59.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/udemy.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,23 +1,42 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import ( compat_HTTPError, - compat_urllib_parse, + compat_kwargs, + compat_str, compat_urllib_request, + compat_urlparse, ) from ..utils import ( + determine_ext, + extract_attributes, ExtractorError, float_or_none, int_or_none, + js_to_json, sanitized_Request, + try_get, unescapeHTML, + url_or_none, + urlencode_postdata, ) class UdemyIE(InfoExtractor): IE_NAME = 'udemy' - _VALID_URL = r'https?://www\.udemy\.com/(?:[^#]+#/lecture/|lecture/view/?\?lectureId=)(?P<id>\d+)' + _VALID_URL = r'''(?x) + https?:// + (?:[^/]+\.)?udemy\.com/ + (?: + [^#]+\#/lecture/| + lecture/view/?\?lectureId=| + [^/]+/learn/v4/t/lecture/ + ) + (?P<id>\d+) + ''' _LOGIN_URL = 'https://www.udemy.com/join/login-popup/?displayType=ajax&showSkipButton=1' _ORIGIN_URL = 'https://www.udemy.com' _NETRC_MACHINE = 'udemy' @@ -33,36 +52,69 @@ 'duration': 579.29, }, 'skip': 'Requires udemy account credentials', + }, { + # new URL schema + 'url': 'https://www.udemy.com/electric-bass-right-from-the-start/learn/v4/t/lecture/4580906', + 'only_matching': True, + }, { + # no url in outputs format entry + 'url': 'https://www.udemy.com/learn-web-development-complete-step-by-step-guide-to-success/learn/v4/t/lecture/4125812', + 'only_matching': True, + }, { + # only outputs rendition + 'url': 'https://www.udemy.com/how-you-can-help-your-local-community-5-amazing-examples/learn/v4/t/lecture/3225750?start=0', + 'only_matching': True, + }, { + 'url': 'https://wipro.udemy.com/java-tutorial/#/lecture/172757', + 'only_matching': True, + }] - def _enroll_course(self, webpage, course_id): + def _extract_course_info(self, webpage, video_id): + course = self._parse_json( + unescapeHTML(self._search_regex( + r'ng-init=["\'].*\bcourse=({.+?})[;"\']', + webpage, 'course', default='{}')), + video_id, fatal=False) or {} + course_id = course.get('id') or self._search_regex( + [ + r'data-course-id=["\'](\d+)', + 
r'"courseId"\s*:\s*(\d+)' + ], webpage, 'course id') + return course_id, course.get('title') + + def _enroll_course(self, base_url, webpage, course_id): + def combine_url(base_url, url): + return compat_urlparse.urljoin(base_url, url) if not url.startswith('http') else url + checkout_url = unescapeHTML(self._search_regex( - r'href=(["\'])(?Phttps?://(?:www\.)?udemy\.com/payment/checkout/.+?)\1', + r'href=(["\'])(?P(?:https?://(?:www\.)?udemy\.com)?/(?:payment|cart)/checkout/.+?)\1', webpage, 'checkout url', group='url', default=None)) if checkout_url: raise ExtractorError( 'Course %s is not free. You have to pay for it before you can download. ' - 'Use this URL to confirm purchase: %s' % (course_id, checkout_url), expected=True) + 'Use this URL to confirm purchase: %s' + % (course_id, combine_url(base_url, checkout_url)), + expected=True) enroll_url = unescapeHTML(self._search_regex( - r'href=(["\'])(?Phttps?://(?:www\.)?udemy\.com/course/subscribe/.+?)\1', + r'href=(["\'])(?P(?:https?://(?:www\.)?udemy\.com)?/course/subscribe/.+?)\1', webpage, 'enroll url', group='url', default=None)) if enroll_url: - webpage = self._download_webpage(enroll_url, course_id, 'Enrolling in the course') + webpage = self._download_webpage( + combine_url(base_url, enroll_url), + course_id, 'Enrolling in the course', + headers={'Referer': base_url}) if '>You have enrolled in' in webpage: self.to_screen('%s: Successfully enrolled in the course' % course_id) def _download_lecture(self, course_id, lecture_id): return self._download_json( - 'https://www.udemy.com/api-2.0/users/me/subscribed-courses/%s/lectures/%s?%s' % ( - course_id, lecture_id, compat_urllib_parse.urlencode({ - 'video_only': '', - 'auto_play': '', - 'fields[lecture]': 'title,description,asset', - 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,data', - 'instructorPreviewMode': 'False', - })), - lecture_id, 'Downloading lecture JSON') + 'https://www.udemy.com/api-2.0/users/me/subscribed-courses/%s/lectures/%s?' + % (course_id, lecture_id), + lecture_id, 'Downloading lecture JSON', query={ + 'fields[lecture]': 'title,description,view_html,asset', + 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,stream_urls,captions,data', + }) def _handle_error(self, response): if not isinstance(response, dict): @@ -75,7 +127,26 @@ error_str += ' - %s' % error_data.get('formErrors') raise ExtractorError(error_str, expected=True) - def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata'): + def _download_webpage_handle(self, *args, **kwargs): + headers = kwargs.get('headers', {}).copy() + headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36' + kwargs['headers'] = headers + ret = super(UdemyIE, self)._download_webpage_handle( + *args, **compat_kwargs(kwargs)) + if not ret: + return ret + webpage, _ = ret + if any(p in webpage for p in ( + '>Please verify you are a human', + 'Access to this page has been denied because we believe you are using automation tools to browse the website', + '"_pxCaptcha"')): + raise ExtractorError( + 'Udemy asks you to solve a CAPTCHA. 
Login with browser, ' + 'solve CAPTCHA, then export cookies and pass cookie file to ' + 'youtube-dl with --cookies.', expected=True) + return ret + + def _download_json(self, url_or_request, *args, **kwargs): headers = { 'X-Udemy-Snail-Case': 'true', 'X-Requested-With': 'XMLHttpRequest', @@ -93,7 +164,7 @@ else: url_or_request = sanitized_Request(url_or_request, headers=headers) - response = super(UdemyIE, self)._download_json(url_or_request, video_id, note) + response = super(UdemyIE, self)._download_json(url_or_request, *args, **kwargs) self._handle_error(response) return response @@ -101,7 +172,7 @@ self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return @@ -109,7 +180,9 @@ self._LOGIN_URL, None, 'Downloading login popup') def is_logged(webpage): - return any(p in webpage for p in ['href="https://www.udemy.com/user/logout/', '>Logout<']) + return any(re.search(p, webpage) for p in ( + r'href=["\'](?:https://www\.udemy\.com)?/user/logout/', + r'>Logout<')) # already logged in if is_logged(login_popup): @@ -118,17 +191,17 @@ login_form = self._form_hidden_inputs('login-form', login_popup) login_form.update({ - 'email': username.encode('utf-8'), - 'password': password.encode('utf-8'), + 'email': username, + 'password': password, }) - request = sanitized_Request( - self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8')) - request.add_header('Referer', self._ORIGIN_URL) - request.add_header('Origin', self._ORIGIN_URL) - response = self._download_webpage( - request, None, 'Logging in as %s' % username) + self._LOGIN_URL, None, 'Logging in', + data=urlencode_postdata(login_form), + headers={ + 'Referer': self._ORIGIN_URL, + 'Origin': self._ORIGIN_URL, + }) if not is_logged(response): error = self._html_search_regex( @@ -143,15 +216,14 @@ webpage = self._download_webpage(url, lecture_id) - course_id = self._search_regex( - r'data-course-id=["\'](\d+)', webpage, 'course id') + course_id, _ = self._extract_course_info(webpage, lecture_id) try: lecture = self._download_lecture(course_id, lecture_id) except ExtractorError as e: # Error could possibly mean we are not enrolled in the course if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - self._enroll_course(webpage, course_id) + self._enroll_course(url, webpage, course_id) lecture = self._download_lecture(course_id, lecture_id) else: raise @@ -161,56 +233,179 @@ asset = lecture['asset'] - asset_type = asset.get('assetType') or asset.get('asset_type') + asset_type = asset.get('asset_type') or asset.get('assetType') if asset_type != 'Video': raise ExtractorError( 'Lecture %s is not a video' % lecture_id, expected=True) - stream_url = asset.get('streamUrl') or asset.get('stream_url') + stream_url = asset.get('stream_url') or asset.get('streamUrl') if stream_url: youtube_url = self._search_regex( r'(https?://www\.youtube\.com/watch\?v=.*)', stream_url, 'youtube URL', default=None) if youtube_url: return self.url_result(youtube_url, 'Youtube') - video_id = asset['id'] - thumbnail = asset.get('thumbnailUrl') or asset.get('thumbnail_url') + video_id = compat_str(asset['id']) + thumbnail = asset.get('thumbnail_url') or asset.get('thumbnailUrl') duration = float_or_none(asset.get('data', {}).get('duration')) - outputs = asset.get('data', {}).get('outputs', {}) + + subtitles = {} + automatic_captions = {} formats = [] - for format_ in asset.get('download_urls', {}).get('Video', []): - video_url = format_.get('file') - if not 
video_url: - continue - format_id = format_.get('label') - f = { - 'url': format_['file'], - 'height': int_or_none(format_id), + + def extract_output_format(src, f_id): + return { + 'url': src.get('url'), + 'format_id': '%sp' % (src.get('height') or f_id), + 'width': int_or_none(src.get('width')), + 'height': int_or_none(src.get('height')), + 'vbr': int_or_none(src.get('video_bitrate_in_kbps')), + 'vcodec': src.get('video_codec'), + 'fps': int_or_none(src.get('frame_rate')), + 'abr': int_or_none(src.get('audio_bitrate_in_kbps')), + 'acodec': src.get('audio_codec'), + 'asr': int_or_none(src.get('audio_sample_rate')), + 'tbr': int_or_none(src.get('total_bitrate_in_kbps')), + 'filesize': int_or_none(src.get('file_size_in_bytes')), } - if format_id: - # Some videos contain additional metadata (e.g. - # https://www.udemy.com/ios9-swift/learn/#/lecture/3383208) - output = outputs.get(format_id) - if isinstance(output, dict): - f.update({ - 'format_id': '%sp' % (output.get('label') or format_id), - 'width': int_or_none(output.get('width')), - 'height': int_or_none(output.get('height')), - 'vbr': int_or_none(output.get('video_bitrate_in_kbps')), - 'vcodec': output.get('video_codec'), - 'fps': int_or_none(output.get('frame_rate')), - 'abr': int_or_none(output.get('audio_bitrate_in_kbps')), - 'acodec': output.get('audio_codec'), - 'asr': int_or_none(output.get('audio_sample_rate')), - 'tbr': int_or_none(output.get('total_bitrate_in_kbps')), - 'filesize': int_or_none(output.get('file_size_in_bytes')), - }) + + outputs = asset.get('data', {}).get('outputs') + if not isinstance(outputs, dict): + outputs = {} + + def add_output_format_meta(f, key): + output = outputs.get(key) + if isinstance(output, dict): + output_format = extract_output_format(output, key) + output_format.update(f) + return output_format + return f + + def extract_formats(source_list): + if not isinstance(source_list, list): + return + for source in source_list: + video_url = url_or_none(source.get('file') or source.get('src')) + if not video_url: + continue + if source.get('type') == 'application/x-mpegURL' or determine_ext(video_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + continue + format_id = source.get('label') + f = { + 'url': video_url, + 'format_id': '%sp' % format_id, + 'height': int_or_none(format_id), + } + if format_id: + # Some videos contain additional metadata (e.g. 
+ # https://www.udemy.com/ios9-swift/learn/#/lecture/3383208) + f = add_output_format_meta(f, format_id) + formats.append(f) + + def extract_subtitles(track_list): + if not isinstance(track_list, list): + return + for track in track_list: + if not isinstance(track, dict): + continue + if track.get('kind') != 'captions': + continue + src = url_or_none(track.get('src')) + if not src: + continue + lang = track.get('language') or track.get( + 'srclang') or track.get('label') + sub_dict = automatic_captions if track.get( + 'autogenerated') is True else subtitles + sub_dict.setdefault(lang, []).append({ + 'url': src, + }) + + for url_kind in ('download', 'stream'): + urls = asset.get('%s_urls' % url_kind) + if isinstance(urls, dict): + extract_formats(urls.get('Video')) + + captions = asset.get('captions') + if isinstance(captions, list): + for cc in captions: + if not isinstance(cc, dict): + continue + cc_url = url_or_none(cc.get('url')) + if not cc_url: + continue + lang = try_get(cc, lambda x: x['locale']['locale'], compat_str) + sub_dict = (automatic_captions if cc.get('source') == 'auto' + else subtitles) + sub_dict.setdefault(lang or 'en', []).append({ + 'url': cc_url, + }) + + view_html = lecture.get('view_html') + if view_html: + view_html_urls = set() + for source in re.findall(r'<source[^>]+>', view_html): + attributes = extract_attributes(source) + src = attributes.get('src') + if not src: + continue + res = attributes.get('data-res') + height = int_or_none(res) + if src in view_html_urls: + continue + view_html_urls.add(src) + if attributes.get('type') == 'application/x-mpegURL' or determine_ext(src) == 'm3u8': + m3u8_formats = self._extract_m3u8_formats( + src, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) + for f in m3u8_formats: + m = re.search(r'/hls_(?P<height>\d{3,4})_(?P<tbr>\d{2,})/', f['url']) + if m: + if not f.get('height'): + f['height'] = int(m.group('height')) + if not f.get('tbr'): + f['tbr'] = int(m.group('tbr')) + formats.extend(m3u8_formats) else: - f['format_id'] = '%sp' % format_id - formats.append(f) + formats.append(add_output_format_meta({ + 'url': src, + 'format_id': '%dp' % height if height else None, + 'height': height, + }, res)) + + # react rendition since 2017.04.15 (see + # https://github.com/ytdl-org/youtube-dl/issues/12744) + data = self._parse_json( + self._search_regex( + r'videojs-setup-data=(["\'])(?P<data>{.+?})\1', view_html, + 'setup data', default='{}', group='data'), video_id, + transform_source=unescapeHTML, fatal=False) + if data and isinstance(data, dict): + extract_formats(data.get('sources')) + if not duration: + duration = int_or_none(data.get('duration')) + extract_subtitles(data.get('tracks')) + + if not subtitles and not automatic_captions: + text_tracks = self._parse_json( + self._search_regex( + r'text-tracks=(["\'])(?P<data>\[.+?\])\1', view_html, + 'text tracks', default='{}', group='data'), video_id, + transform_source=lambda s: js_to_json(unescapeHTML(s)), + fatal=False) + extract_subtitles(text_tracks) + + if not formats and outputs: + for format_id, output in outputs.items(): + f = extract_output_format(output, format_id) + if f.get('url'): + formats.append(f) - self._sort_formats(formats) + self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id')) return { 'id': video_id, @@ -218,14 +413,22 @@ 'description': description, 'thumbnail': thumbnail, 'duration': duration, - 'formats': formats + 'formats': formats, + 'subtitles': subtitles, + 'automatic_captions': automatic_captions, } class 
UdemyCourseIE(UdemyIE): IE_NAME = 'udemy:course' - _VALID_URL = r'https?://www\.udemy\.com/(?P<id>[\da-z-]+)' - _TESTS = [] + _VALID_URL = r'https?://(?:[^/]+\.)?udemy\.com/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.udemy.com/java-tutorial/', + 'only_matching': True, + }, { + 'url': 'https://wipro.udemy.com/java-tutorial/', + 'only_matching': True, + }] @classmethod def suitable(cls, url): @@ -236,29 +439,34 @@ webpage = self._download_webpage(url, course_path) - response = self._download_json( - 'https://www.udemy.com/api-1.1/courses/%s' % course_path, - course_path, 'Downloading course JSON') - - course_id = response['id'] - course_title = response.get('title') + course_id, title = self._extract_course_info(webpage, course_path) - self._enroll_course(webpage, course_id) + self._enroll_course(url, webpage, course_id) response = self._download_json( - 'https://www.udemy.com/api-1.1/courses/%s/curriculum' % course_id, - course_id, 'Downloading course curriculum') + 'https://www.udemy.com/api-2.0/courses/%s/cached-subscriber-curriculum-items' % course_id, + course_id, 'Downloading course curriculum', query={ + 'fields[chapter]': 'title,object_index', + 'fields[lecture]': 'title,asset', + 'page_size': '1000', + }) entries = [] - chapter, chapter_number = None, None - for asset in response: - asset_type = asset.get('assetType') or asset.get('asset_type') - if asset_type == 'Video': - asset_id = asset.get('id') - if asset_id: + chapter, chapter_number = [None] * 2 + for entry in response['results']: + clazz = entry.get('_class') + if clazz == 'lecture': + asset = entry.get('asset') + if isinstance(asset, dict): + asset_type = asset.get('asset_type') or asset.get('assetType') + if asset_type != 'Video': + continue + lecture_id = entry.get('id') + if lecture_id: entry = { '_type': 'url_transparent', - 'url': 'https://www.udemy.com/%s/#/lecture/%s' % (course_path, asset['id']), + 'url': 'https://www.udemy.com/%s/learn/v4/t/lecture/%s' % (course_path, entry['id']), + 'title': entry.get('title'), 'ie_key': UdemyIE.ie_key(), } if chapter_number: @@ -266,8 +474,8 @@ if chapter: entry['chapter'] = chapter entries.append(entry) - elif asset.get('type') == 'chapter': - chapter_number = asset.get('index') or asset.get('object_index') - chapter = asset.get('title') + elif clazz == 'chapter': + chapter_number = entry.get('object_index') + chapter = entry.get('title') - return self.playlist_result(entries, course_id, course_title) + return self.playlist_result(entries, course_id, title) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/udn.py youtube-dl-2019.05.20/youtube_dl/extractor/udn.py --- youtube-dl-2016.02.22/youtube_dl/extractor/udn.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/udn.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,11 +1,13 @@ # coding: utf-8 from __future__ import unicode_literals -import json +import re + from .common import InfoExtractor from ..utils import ( + determine_ext, + int_or_none, js_to_json, - ExtractorError, ) from ..compat import compat_urlparse @@ -16,13 +18,17 @@ _VALID_URL = r'https?:' + _PROTOCOL_RELATIVE_VALID_URL _TESTS = [{ 'url': 'http://video.udn.com/embed/news/300040', - 'md5': 'de06b4c90b042c128395a88f0384817e', 'info_dict': { 'id': '300040', 'ext': 'mp4', 'title': '生物老師男變女 全校挺"做自己"', - 'thumbnail': 're:^https?://.*\.jpg$', - } + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'expected_warnings': ['Failed to parse JSON Expecting value'], }, { 'url': 
'https://video.udn.com/embed/news/300040', 'only_matching': True, @@ -37,40 +43,60 @@ page = self._download_webpage(url, video_id) - options = json.loads(js_to_json(self._html_search_regex( - r'var options\s*=\s*([^;]+);', page, 'video urls dictionary'))) - - video_urls = options['video'] + options_str = self._html_search_regex( + r'var\s+options\s*=\s*([^;]+);', page, 'options') + trans_options_str = js_to_json(options_str) + options = self._parse_json(trans_options_str, 'options', fatal=False) or {} + if options: + video_urls = options['video'] + title = options['title'] + poster = options.get('poster') + else: + video_urls = self._parse_json(self._html_search_regex( + r'"video"\s*:\s*({.+?})\s*,', trans_options_str, 'video urls'), 'video urls') + title = self._html_search_regex( + r"title\s*:\s*'(.+?)'\s*,", options_str, 'title') + poster = self._html_search_regex( + r"poster\s*:\s*'(.+?)'\s*,", options_str, 'poster', default=None) if video_urls.get('youtube'): return self.url_result(video_urls.get('youtube'), 'Youtube') - try: - del video_urls['youtube'] - except KeyError: - pass + formats = [] + for video_type, api_url in video_urls.items(): + if not api_url: + continue - formats = [{ - 'url': self._download_webpage( + video_url = self._download_webpage( compat_urlparse.urljoin(url, api_url), video_id, - 'retrieve url for %s video' % video_type), - 'format_id': video_type, - 'preference': 0 if video_type == 'mp4' else -1, - } for video_type, api_url in video_urls.items() if api_url] + note='retrieve url for %s video' % video_type) - if not formats: - raise ExtractorError('No videos found', expected=True) + ext = determine_ext(video_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, ext='mp4', m3u8_id='hls')) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + video_url, video_id, f4m_id='hds')) + else: + mobj = re.search(r'_(?P<height>\d+)p_(?P<tbr>\d+)\.mp4', video_url) + a_format = { + 'url': video_url, + # video_type may be 'mp4', which confuses YoutubeDL + 'format_id': 'http-' + video_type, + } + if mobj: + a_format.update({ + 'height': int_or_none(mobj.group('height')), + 'tbr': int_or_none(mobj.group('tbr')), + }) + formats.append(a_format) self._sort_formats(formats) - thumbnail = None - - if options.get('gallery') and len(options['gallery']): - thumbnail = options['gallery'][0].get('original') - return { 'id': video_id, 'formats': formats, - 'title': options['title'], - 'thumbnail': thumbnail + 'title': title, + 'thumbnail': poster, } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/ufctv.py youtube-dl-2019.05.20/youtube_dl/extractor/ufctv.py --- youtube-dl-2016.02.22/youtube_dl/extractor/ufctv.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/ufctv.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,73 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + parse_duration, + parse_iso8601, + urlencode_postdata, +) + + +class UFCTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ufc\.tv/video/(?P<id>[^/]+)' + _NETRC_MACHINE = 'ufctv' + _TEST = { + 'url': 'https://www.ufc.tv/video/ufc-219-countdown-full-episode', + 'info_dict': { + 'id': '34167', + 'ext': 'mp4', + 'title': 'UFC 219 Countdown: Full Episode', + 'description': 'md5:26d4e8bf4665ae5878842d7050c3c646', + 'timestamp': 1513962360, + 'upload_date': '20171222', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + } + + def 
_real_initialize(self): + username, password = self._get_login_info() + if username is None: + return + + code = self._download_json( + 'https://www.ufc.tv/secure/authenticate', + None, 'Logging in', data=urlencode_postdata({ + 'username': username, + 'password': password, + 'format': 'json', + })).get('code') + if code and code != 'loginsuccess': + raise ExtractorError(code, expected=True) + + def _real_extract(self, url): + display_id = self._match_id(url) + video_data = self._download_json(url, display_id, query={ + 'format': 'json', + }) + video_id = str(video_data['id']) + title = video_data['name'] + m3u8_url = self._download_json( + 'https://www.ufc.tv/service/publishpoint', video_id, query={ + 'type': 'video', + 'format': 'json', + 'id': video_id, + }, headers={ + 'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0_1 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A402 Safari/604.1', + })['path'] + m3u8_url = m3u8_url.replace('_iphone.', '.') + formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4') + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('description'), + 'duration': parse_duration(video_data.get('runtime')), + 'timestamp': parse_iso8601(video_data.get('releaseDate')), + 'formats': formats, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/uktvplay.py youtube-dl-2019.05.20/youtube_dl/extractor/uktvplay.py --- youtube-dl-2016.02.22/youtube_dl/extractor/uktvplay.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/uktvplay.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,33 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class UKTVPlayIE(InfoExtractor): + _VALID_URL = r'https?://uktvplay\.uktv\.co\.uk/.+?\?.*?\bvideo=(?P<id>\d+)' + _TEST = { + 'url': 'https://uktvplay.uktv.co.uk/shows/world-at-war/c/200/watch-online/?video=2117008346001', + 'md5': '', + 'info_dict': { + 'id': '2117008346001', + 'ext': 'mp4', + 'title': 'Pincers', + 'description': 'Pincers', + 'uploader_id': '1242911124001', + 'upload_date': '20130124', + 'timestamp': 1359049267, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'expected_warnings': ['Failed to download MPD manifest'] + } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1242911124001/H1xnMOqP_default/index.html?videoId=%s' + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result( + self.BRIGHTCOVE_URL_TEMPLATE % video_id, + 'BrightcoveNew', video_id) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/umg.py youtube-dl-2019.05.20/youtube_dl/extractor/umg.py --- youtube-dl-2016.02.22/youtube_dl/extractor/umg.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/umg.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,103 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_filesize, + parse_iso8601, +) + + +class UMGDeIE(InfoExtractor): + IE_NAME = 'umg:de' + IE_DESC = 'Universal Music Deutschland' + _VALID_URL = r'https?://(?:www\.)?universal-music\.de/[^/]+/videos/[^/?#]+-(?P<id>\d+)' + _TEST = { + 'url': 'https://www.universal-music.de/sido/videos/jedes-wort-ist-gold-wert-457803', + 'md5': 'ebd90f48c80dcc82f77251eb1902634f', + 'info_dict': { + 'id': '457803', + 'ext': 'mp4', + 'title': 'Jedes Wort ist Gold wert', + 'timestamp': 1513591800, + 'upload_date': '20171218', + 
} + } + + def _real_extract(self, url): + video_id = self._match_id(url) + video_data = self._download_json( + 'https://api.universal-music.de/graphql', + video_id, query={ + 'query': '''{ + universalMusic(channel:16) { + video(id:%s) { + headline + formats { + formatId + url + type + width + height + mimeType + fileSize + } + duration + createdDate + } + } +}''' % video_id})['data']['universalMusic']['video'] + + title = video_data['headline'] + hls_url_template = 'http://mediadelivery.universal-music-services.de/vod/mp4:autofill/storage/' + '/'.join(list(video_id)) + '/content/%s/file/playlist.m3u8' + + thumbnails = [] + formats = [] + + def add_m3u8_format(format_id): + m3u8_formats = self._extract_m3u8_formats( + hls_url_template % format_id, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal='False') + if m3u8_formats and m3u8_formats[0].get('height'): + formats.extend(m3u8_formats) + + for f in video_data.get('formats', []): + f_url = f.get('url') + mime_type = f.get('mimeType') + if not f_url or mime_type == 'application/mxf': + continue + fmt = { + 'url': f_url, + 'width': int_or_none(f.get('width')), + 'height': int_or_none(f.get('height')), + 'filesize': parse_filesize(f.get('fileSize')), + } + f_type = f.get('type') + if f_type == 'Image': + thumbnails.append(fmt) + elif f_type == 'Video': + format_id = f.get('formatId') + if format_id: + fmt['format_id'] = format_id + if mime_type == 'video/mp4': + add_m3u8_format(format_id) + urlh = self._request_webpage(f_url, video_id, fatal=False) + if urlh: + first_byte = urlh.read(1) + if first_byte not in (b'F', b'\x00'): + continue + formats.append(fmt) + if not formats: + for format_id in (867, 836, 940): + add_m3u8_format(format_id) + self._sort_formats(formats, ('width', 'height', 'filesize', 'tbr')) + + return { + 'id': video_id, + 'title': title, + 'duration': int_or_none(video_data.get('duration')), + 'timestamp': parse_iso8601(video_data.get('createdDate'), ' '), + 'thumbnails': thumbnails, + 'formats': formats, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/unistra.py youtube-dl-2019.05.20/youtube_dl/extractor/unistra.py --- youtube-dl-2016.02.22/youtube_dl/extractor/unistra.py 2016-01-15 18:42:53.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/unistra.py 2019-06-07 18:49:07.000000000 +0000 @@ -7,7 +7,7 @@ class UnistraIE(InfoExtractor): - _VALID_URL = r'http://utv\.unistra\.fr/(?:index|video)\.php\?id_video\=(?P<id>\d+)' + _VALID_URL = r'https?://utv\.unistra\.fr/(?:index|video)\.php\?id_video\=(?P<id>\d+)' _TESTS = [ { @@ -49,6 +49,7 @@ 'format_id': format_id, 'quality': quality(format_id) }) + self._sort_formats(formats) title = self._html_search_regex( r'<title>UTV - (.*?)</', webpage, 'title') diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/unity.py youtube-dl-2019.05.20/youtube_dl/extractor/unity.py --- youtube-dl-2016.02.22/youtube_dl/extractor/unity.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/unity.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,32 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from .youtube import YoutubeIE + + +class UnityIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?unity3d\.com/learn/tutorials/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://unity3d.com/learn/tutorials/topics/animation/animate-anything-mecanim', + 'info_dict': { + 'id': 'jWuNtik0C8E', + 'ext': 'mp4', + 'title': 'Live Training 22nd September 2014 - Animate Anything', + 'description': 
'md5:e54913114bd45a554c56cdde7669636e', + 'duration': 2893, + 'uploader': 'Unity', + 'uploader_id': 'Unity3D', + 'upload_date': '20140926', + } + }, { + 'url': 'https://unity3d.com/learn/tutorials/projects/2d-ufo-tutorial/following-player-camera?playlist=25844', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + youtube_id = self._search_regex( + r'data-video-id="([_0-9a-zA-Z-]+)"', + webpage, 'youtube ID') + return self.url_result(youtube_id, ie=YoutubeIE.ie_key(), video_id=video_id) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/uol.py youtube-dl-2019.05.20/youtube_dl/extractor/uol.py --- youtube-dl-2016.02.22/youtube_dl/extractor/uol.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/uol.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,159 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_html, + int_or_none, + parse_duration, + update_url_query, + str_or_none, +) + + +class UOLIE(InfoExtractor): + IE_NAME = 'uol.com.br' + _VALID_URL = r'https?://(?:.+?\.)?uol\.com\.br/.*?(?:(?:mediaId|v)=|view/(?:[a-z0-9]+/)?|video(?:=|/(?:\d{4}/\d{2}/\d{2}/)?))(?P<id>\d+|[\w-]+-[A-Z0-9]+)' + _TESTS = [{ + 'url': 'http://player.mais.uol.com.br/player_video_v3.swf?mediaId=15951931', + 'md5': '25291da27dc45e0afb5718a8603d3816', + 'info_dict': { + 'id': '15951931', + 'ext': 'mp4', + 'title': 'Miss simpatia é encontrada morta', + 'description': 'md5:3f8c11a0c0556d66daf7e5b45ef823b2', + } + }, { + 'url': 'http://tvuol.uol.com.br/video/incendio-destroi-uma-das-maiores-casas-noturnas-de-londres-04024E9A3268D4C95326', + 'md5': 'e41a2fb7b7398a3a46b6af37b15c00c9', + 'info_dict': { + 'id': '15954259', + 'ext': 'mp4', + 'title': 'Incêndio destrói uma das maiores casas noturnas de Londres', + 'description': 'Em Londres, um incêndio destruiu uma das maiores boates da cidade. 
Não há informações sobre vítimas.', + } + }, { + 'url': 'http://mais.uol.com.br/static/uolplayer/index.html?mediaId=15951931', + 'only_matching': True, + }, { + 'url': 'http://mais.uol.com.br/view/15954259', + 'only_matching': True, + }, { + 'url': 'http://noticias.band.uol.com.br/brasilurgente/video/2016/08/05/15951931/miss-simpatia-e-encontrada-morta.html', + 'only_matching': True, + }, { + 'url': 'http://videos.band.uol.com.br/programa.asp?e=noticias&pr=brasil-urgente&v=15951931&t=Policia-desmonte-base-do-PCC-na-Cracolandia', + 'only_matching': True, + }, { + 'url': 'http://mais.uol.com.br/view/cphaa0gl2x8r/incendio-destroi-uma-das-maiores-casas-noturnas-de-londres-04024E9A3268D4C95326', + 'only_matching': True, + }, { + 'url': 'http://noticias.uol.com.br//videos/assistir.htm?video=rafaela-silva-inspira-criancas-no-judo-04024D983968D4C95326', + 'only_matching': True, + }, { + 'url': 'http://mais.uol.com.br/view/e0qbgxid79uv/15275470', + 'only_matching': True, + }] + + _FORMATS = { + '2': { + 'width': 640, + 'height': 360, + }, + '5': { + 'width': 1280, + 'height': 720, + }, + '6': { + 'width': 426, + 'height': 240, + }, + '7': { + 'width': 1920, + 'height': 1080, + }, + '8': { + 'width': 192, + 'height': 144, + }, + '9': { + 'width': 568, + 'height': 320, + }, + '11': { + 'width': 640, + 'height': 360, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + media_id = None + + if video_id.isdigit(): + media_id = video_id + + if not media_id: + embed_page = self._download_webpage( + 'https://jsuol.com.br/c/tv/uol/embed/?params=[embed,%s]' % video_id, + video_id, 'Downloading embed page', fatal=False) + if embed_page: + media_id = self._search_regex( + (r'uol\.com\.br/(\d+)', r'mediaId=(\d+)'), + embed_page, 'media id', default=None) + + if not media_id: + webpage = self._download_webpage(url, video_id) + media_id = self._search_regex(r'mediaId=(\d+)', webpage, 'media id') + + video_data = self._download_json( + 'http://mais.uol.com.br/apiuol/v3/player/getMedia/%s.json' % media_id, + media_id)['item'] + title = video_data['title'] + + query = { + 'ver': video_data.get('numRevision', 2), + 'r': 'http://mais.uol.com.br', + } + for k in ('token', 'sign'): + v = video_data.get(k) + if v: + query[k] = v + + formats = [] + for f in video_data.get('formats', []): + f_url = f.get('url') or f.get('secureUrl') + if not f_url: + continue + f_url = update_url_query(f_url, query) + format_id = str_or_none(f.get('id')) + if format_id == '10': + formats.extend(self._extract_m3u8_formats( + f_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + continue + fmt = { + 'format_id': format_id, + 'url': f_url, + 'source_preference': 1, + } + fmt.update(self._FORMATS.get(format_id, {})) + formats.append(fmt) + self._sort_formats(formats, ('height', 'width', 'source_preference', 'tbr', 'ext')) + + tags = [] + for tag in video_data.get('tags', []): + tag_description = tag.get('description') + if not tag_description: + continue + tags.append(tag_description) + + return { + 'id': media_id, + 'title': title, + 'description': clean_html(video_data.get('desMedia')), + 'thumbnail': video_data.get('thumbnail'), + 'duration': int_or_none(video_data.get('durationSeconds')) or parse_duration(video_data.get('duration')), + 'tags': tags, + 'formats': formats, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/uplynk.py youtube-dl-2019.05.20/youtube_dl/extractor/uplynk.py --- youtube-dl-2016.02.22/youtube_dl/extractor/uplynk.py 1970-01-01 00:00:00.000000000 +0000 +++ 
youtube-dl-2019.05.20/youtube_dl/extractor/uplynk.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,70 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + float_or_none, + ExtractorError, +) + + +class UplynkIE(InfoExtractor): + IE_NAME = 'uplynk' + _VALID_URL = r'https?://.*?\.uplynk\.com/(?P<path>ext/[0-9a-f]{32}/(?P<external_id>[^/?&]+)|(?P<id>[0-9a-f]{32}))\.(?:m3u8|json)(?:.*?\bpbs=(?P<session_id>[^&]+))?' + _TEST = { + 'url': 'http://content.uplynk.com/e89eaf2ce9054aa89d92ddb2d817a52e.m3u8', + 'info_dict': { + 'id': 'e89eaf2ce9054aa89d92ddb2d817a52e', + 'ext': 'mp4', + 'title': '030816-kgo-530pm-solar-eclipse-vid_web.mp4', + 'uploader_id': '4413701bf5a1488db55b767f8ae9d4fa', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _extract_uplynk_info(self, uplynk_content_url): + path, external_id, video_id, session_id = re.match(UplynkIE._VALID_URL, uplynk_content_url).groups() + display_id = video_id or external_id + formats = self._extract_m3u8_formats( + 'http://content.uplynk.com/%s.m3u8' % path, + display_id, 'mp4', 'm3u8_native') + if session_id: + for f in formats: + f['extra_param_to_segment_url'] = 'pbs=' + session_id + self._sort_formats(formats) + asset = self._download_json('http://content.uplynk.com/player/assetinfo/%s.json' % path, display_id) + if asset.get('error') == 1: + raise ExtractorError('% said: %s' % (self.IE_NAME, asset['msg']), expected=True) + + return { + 'id': asset['asset'], + 'title': asset['desc'], + 'thumbnail': asset.get('default_poster_url'), + 'duration': float_or_none(asset.get('duration')), + 'uploader_id': asset.get('owner'), + 'formats': formats, + } + + def _real_extract(self, url): + return self._extract_uplynk_info(url) + + +class UplynkPreplayIE(UplynkIE): + IE_NAME = 'uplynk:preplay' + _VALID_URL = r'https?://.*?\.uplynk\.com/preplay2?/(?P<path>ext/[0-9a-f]{32}/(?P<external_id>[^/?&]+)|(?P<id>[0-9a-f]{32}))\.json' + _TEST = None + + def _real_extract(self, url): + path, external_id, video_id = re.match(self._VALID_URL, url).groups() + display_id = video_id or external_id + preplay = self._download_json(url, display_id) + content_url = 'http://content.uplynk.com/%s.m3u8' % path + session_id = preplay.get('sid') + if session_id: + content_url += '?pbs=' + session_id + return self._extract_uplynk_info(content_url) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/urort.py youtube-dl-2019.05.20/youtube_dl/extractor/urort.py --- youtube-dl-2016.02.22/youtube_dl/extractor/urort.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/urort.py 2019-06-07 18:49:07.000000000 +0000 @@ -21,7 +21,7 @@ 'id': '33124-24', 'ext': 'mp3', 'title': 'The Bomb', - 'thumbnail': 're:^https?://.+\.jpg', + 'thumbnail': r're:^https?://.+\.jpg', 'uploader': 'Gerilja', 'uploader_id': 'Gerilja', 'upload_date': '20100323', diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/urplay.py youtube-dl-2019.05.20/youtube_dl/extractor/urplay.py --- youtube-dl-2016.02.22/youtube_dl/extractor/urplay.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/urplay.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,71 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import unified_timestamp + + +class URPlayIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ur(?:play|skola)\.se/(?:program|Produkter)/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 
'https://urplay.se/program/203704-ur-samtiden-livet-universum-och-rymdens-markliga-musik-om-vetenskap-kritiskt-tankande-och-motstand', + 'md5': 'ff5b0c89928f8083c74bbd5099c9292d', + 'info_dict': { + 'id': '203704', + 'ext': 'mp4', + 'title': 'UR Samtiden - Livet, universum och rymdens märkliga musik : Om vetenskap, kritiskt tänkande och motstånd', + 'description': 'md5:5344508a52aa78c1ced6c1b8b9e44e9a', + 'timestamp': 1513512768, + 'upload_date': '20171217', + }, + }, { + 'url': 'https://urskola.se/Produkter/190031-Tripp-Trapp-Trad-Sovkudde', + 'info_dict': { + 'id': '190031', + 'ext': 'mp4', + 'title': 'Tripp, Trapp, Träd : Sovkudde', + 'description': 'md5:b86bffdae04a7e9379d1d7e5947df1d1', + 'timestamp': 1440093600, + 'upload_date': '20150820', + }, + }, { + 'url': 'http://urskola.se/Produkter/155794-Smasagor-meankieli-Grodan-i-vida-varlden', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + urplayer_data = self._parse_json(self._search_regex( + r'urPlayer\.init\(({.+?})\);', webpage, 'urplayer data'), video_id) + host = self._download_json('http://streaming-loadbalancer.ur.se/loadbalancer.json', video_id)['redirect'] + + formats = [] + for quality_attr, quality, preference in (('', 'sd', 0), ('_hd', 'hd', 1)): + file_http = urplayer_data.get('file_http' + quality_attr) or urplayer_data.get('file_http_sub' + quality_attr) + if file_http: + formats.extend(self._extract_wowza_formats( + 'http://%s/%splaylist.m3u8' % (host, file_http), video_id, skip_protocols=['rtmp', 'rtsp'])) + self._sort_formats(formats) + + subtitles = {} + for subtitle in urplayer_data.get('subtitles', []): + subtitle_url = subtitle.get('file') + kind = subtitle.get('kind') + if not subtitle_url or (kind and kind != 'captions'): + continue + subtitles.setdefault(subtitle.get('label', 'Svenska'), []).append({ + 'url': subtitle_url, + }) + + return { + 'id': video_id, + 'title': urplayer_data['title'], + 'description': self._og_search_description(webpage), + 'thumbnail': urplayer_data.get('image'), + 'timestamp': unified_timestamp(self._html_search_meta(('uploadDate', 'schema:uploadDate'), webpage, 'timestamp')), + 'series': urplayer_data.get('series_title'), + 'subtitles': subtitles, + 'formats': formats, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/usanetwork.py youtube-dl-2019.05.20/youtube_dl/extractor/usanetwork.py --- youtube-dl-2016.02.22/youtube_dl/extractor/usanetwork.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/usanetwork.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,76 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .adobepass import AdobePassIE +from ..utils import ( + extract_attributes, + smuggle_url, + update_url_query, +) + + +class USANetworkIE(AdobePassIE): + _VALID_URL = r'https?://(?:www\.)?usanetwork\.com/(?:[^/]+/videos|movies)/(?P<id>[^/?#]+)' + _TEST = { + 'url': 'http://www.usanetwork.com/mrrobot/videos/hpe-cybersecurity', + 'md5': '33c0d2ba381571b414024440d08d57fd', + 'info_dict': { + 'id': '3086229', + 'ext': 'mp4', + 'title': 'HPE Cybersecurity', + 'description': 'The more we digitize our world, the more vulnerable we are.', + 'upload_date': '20160818', + 'timestamp': 1471535460, + 'uploader': 'NBCU-USA', + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + player_params = extract_attributes(self._search_regex( + 
r'(<div[^>]+data-usa-tve-player-container[^>]*>)', webpage, 'player params')) + video_id = player_params['data-mpx-guid'] + title = player_params['data-episode-title'] + + account_pid, path = re.search( + r'data-src="(?:https?)?//player\.theplatform\.com/p/([^/]+)/.*?/(media/guid/\d+/\d+)', + webpage).groups() + + query = { + 'mbr': 'true', + } + if player_params.get('data-is-full-episode') == '1': + query['manifest'] = 'm3u' + + if player_params.get('data-entitlement') == 'auth': + adobe_pass = {} + drupal_settings = self._search_regex( + r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', + webpage, 'drupal settings', fatal=False) + if drupal_settings: + drupal_settings = self._parse_json(drupal_settings, video_id, fatal=False) + if drupal_settings: + adobe_pass = drupal_settings.get('adobePass', {}) + resource = self._get_mvpd_resource( + adobe_pass.get('adobePassResourceId', 'usa'), + title, video_id, player_params.get('data-episode-rating', 'TV-14')) + query['auth'] = self._extract_mvpd_auth( + url, video_id, adobe_pass.get('adobePassRequestorId', 'usa'), resource) + + info = self._search_json_ld(webpage, video_id, default={}) + info.update({ + '_type': 'url_transparent', + 'url': smuggle_url(update_url_query( + 'http://link.theplatform.com/s/%s/%s' % (account_pid, path), + query), {'force_smil_url': True}), + 'id': video_id, + 'title': title, + 'series': player_params.get('data-show-title'), + 'episode': title, + 'ie_key': 'ThePlatform', + }) + return info diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/usatoday.py youtube-dl-2019.05.20/youtube_dl/extractor/usatoday.py --- youtube-dl-2016.02.22/youtube_dl/extractor/usatoday.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/usatoday.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,63 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + get_element_by_attribute, + parse_duration, + try_get, + update_url_query, +) +from ..compat import compat_str + + +class USATodayIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?usatoday\.com/(?:[^/]+/)*(?P<id>[^?/#]+)' + _TESTS = [{ + # Brightcove Partner ID = 29906170001 + 'url': 'http://www.usatoday.com/media/cinematic/video/81729424/us-france-warn-syrian-regime-ahead-of-new-peace-talks/', + 'md5': '033587d2529dc3411a1ab3644c3b8827', + 'info_dict': { + 'id': '4799374959001', + 'ext': 'mp4', + 'title': 'US, France warn Syrian regime ahead of new peace talks', + 'timestamp': 1457891045, + 'description': 'md5:7e50464fdf2126b0f533748d3c78d58f', + 'uploader_id': '29906170001', + 'upload_date': '20160313', + } + }, { + # ui-video-data[asset_metadata][items][brightcoveaccount] = 28911775001 + 'url': 'https://www.usatoday.com/story/tech/science/2018/08/21/yellowstone-supervolcano-eruption-stop-worrying-its-blow/973633002/', + 'info_dict': { + 'id': '5824495846001', + 'ext': 'mp4', + 'title': 'Yellowstone more likely to crack rather than explode', + 'timestamp': 1534790612, + 'description': 'md5:3715e7927639a4f16b474e9391687c62', + 'uploader_id': '28911775001', + 'upload_date': '20180820', + } + }] + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(update_url_query(url, {'ajax': 'true'}), display_id) + ui_video_data = get_element_by_attribute('class', 'ui-video-data', webpage) + if not ui_video_data: + raise 
ExtractorError('no video on the webpage', expected=True) + video_data = self._parse_json(ui_video_data, display_id) + item = try_get(video_data, lambda x: x['asset_metadata']['items'], dict) or {} + + return { + '_type': 'url_transparent', + 'url': self.BRIGHTCOVE_URL_TEMPLATE % (item.get('brightcoveaccount', '29906170001'), item.get('brightcoveid') or video_data['brightcove_id']), + 'id': compat_str(video_data['id']), + 'title': video_data['title'], + 'thumbnail': video_data.get('thumbnail'), + 'description': video_data.get('description'), + 'duration': parse_duration(video_data.get('length')), + 'ie_key': 'BrightcoveNew', + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/ustream.py youtube-dl-2019.05.20/youtube_dl/extractor/ustream.py --- youtube-dl-2016.02.22/youtube_dl/extractor/ustream.py 2016-01-23 10:08:46.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/ustream.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,20 +1,25 @@ from __future__ import unicode_literals +import random import re from .common import InfoExtractor from ..compat import ( + compat_str, compat_urlparse, ) from ..utils import ( + encode_data_uri, ExtractorError, int_or_none, float_or_none, + mimetype2ext, + str_or_none, ) class UstreamIE(InfoExtractor): - _VALID_URL = r'https?://www\.ustream\.tv/(?P<type>recorded|embed|embed/recorded)/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?ustream\.tv/(?P<type>recorded|embed|embed/recorded)/(?P<id>\d+)' IE_NAME = 'ustream' _TESTS = [{ 'url': 'http://www.ustream.tv/recorded/20274954', @@ -41,13 +46,126 @@ 'uploader': 'sportscanadatv', }, 'skip': 'This Pro Broadcaster has chosen to remove this video from the ustream.tv site.', + }, { + 'url': 'http://www.ustream.tv/embed/10299409', + 'info_dict': { + 'id': '10299409', + }, + 'playlist_count': 3, + }, { + 'url': 'http://www.ustream.tv/recorded/91343263', + 'info_dict': { + 'id': '91343263', + 'ext': 'mp4', + 'title': 'GitHub Universe - General Session - Day 1', + 'upload_date': '20160914', + 'description': 'GitHub Universe - General Session - Day 1', + 'timestamp': 1473872730, + 'uploader': 'wa0dnskeqkr', + 'uploader_id': '38977840', + }, + 'params': { + 'skip_download': True, # m3u8 download + }, }] + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage) + if mobj is not None: + return mobj.group('url') + + def _get_stream_info(self, url, video_id, app_id_ver, extra_note=None): + def num_to_hex(n): + return hex(n)[2:] + + rnd = random.randrange + + if not extra_note: + extra_note = '' + + conn_info = self._download_json( + 'http://r%d-1-%s-recorded-lp-live.ums.ustream.tv/1/ustream' % (rnd(1e8), video_id), + video_id, note='Downloading connection info' + extra_note, + query={ + 'type': 'viewer', + 'appId': app_id_ver[0], + 'appVersion': app_id_ver[1], + 'rsid': '%s:%s' % (num_to_hex(rnd(1e8)), num_to_hex(rnd(1e8))), + 'rpin': '_rpin.%d' % rnd(1e15), + 'referrer': url, + 'media': video_id, + 'application': 'recorded', + }) + host = conn_info[0]['args'][0]['host'] + connection_id = conn_info[0]['args'][0]['connectionId'] + + return self._download_json( + 'http://%s/1/ustream?connectionId=%s' % (host, connection_id), + video_id, note='Downloading stream info' + extra_note) + + def _get_streams(self, url, video_id, app_id_ver): + # Sometimes the return dict does not have 'stream' + for trial_count in range(3): + stream_info = self._get_stream_info( + url, video_id, app_id_ver, + extra_note=' (try %d)' % (trial_count + 1) 
if trial_count > 0 else '') + if 'stream' in stream_info[0]['args'][0]: + return stream_info[0]['args'][0]['stream'] + return [] + + def _parse_segmented_mp4(self, dash_stream_info): + def resolve_dash_template(template, idx, chunk_hash): + return template.replace('%', compat_str(idx), 1).replace('%', chunk_hash) + + formats = [] + for stream in dash_stream_info['streams']: + # Use only one provider to avoid too many formats + provider = dash_stream_info['providers'][0] + fragments = [{ + 'url': resolve_dash_template( + provider['url'] + stream['initUrl'], 0, dash_stream_info['hashes']['0']) + }] + for idx in range(dash_stream_info['videoLength'] // dash_stream_info['chunkTime']): + fragments.append({ + 'url': resolve_dash_template( + provider['url'] + stream['segmentUrl'], idx, + dash_stream_info['hashes'][compat_str(idx // 10 * 10)]) + }) + content_type = stream['contentType'] + kind = content_type.split('/')[0] + f = { + 'format_id': '-'.join(filter(None, [ + 'dash', kind, str_or_none(stream.get('bitrate'))])), + 'protocol': 'http_dash_segments', + # TODO: generate a MPD doc for external players? + 'url': encode_data_uri(b'<MPD/>', 'text/xml'), + 'ext': mimetype2ext(content_type), + 'height': stream.get('height'), + 'width': stream.get('width'), + 'fragments': fragments, + } + if kind == 'video': + f.update({ + 'vcodec': stream.get('codec'), + 'acodec': 'none', + 'vbr': stream.get('bitrate'), + }) + else: + f.update({ + 'vcodec': 'none', + 'acodec': stream.get('codec'), + 'abr': stream.get('bitrate'), + }) + formats.append(f) + return formats + def _real_extract(self, url): m = re.match(self._VALID_URL, url) video_id = m.group('id') - # some sites use this embed format (see: https://github.com/rg3/youtube-dl/issues/2990) + # some sites use this embed format (see: https://github.com/ytdl-org/youtube-dl/issues/2990) if m.group('type') == 'embed/recorded': video_id = m.group('id') desktop_url = 'http://www.ustream.tv/recorded/' + video_id @@ -55,10 +173,12 @@ if m.group('type') == 'embed': video_id = m.group('id') webpage = self._download_webpage(url, video_id) - desktop_video_id = self._html_search_regex( - r'ContentVideoIds=\["([^"]*?)"\]', webpage, 'desktop_video_id') - desktop_url = 'http://www.ustream.tv/recorded/' + desktop_video_id - return self.url_result(desktop_url, 'Ustream') + content_video_ids = self._parse_json(self._search_regex( + r'ustream\.vars\.offAirContentVideoIds=([^;]+);', webpage, + 'content video IDs'), video_id) + return self.playlist_result( + map(lambda u: self.url_result('http://www.ustream.tv/recorded/' + u, 'Ustream'), content_video_ids), + video_id) params = self._download_json( 'https://api.ustream.tv/videos/%s.json' % video_id, video_id) @@ -78,7 +198,22 @@ 'url': video_url, 'ext': format_id, 'filesize': filesize, - } for format_id, video_url in video['media_urls'].items()] + } for format_id, video_url in video['media_urls'].items() if video_url] + + if not formats: + hls_streams = self._get_streams(url, video_id, app_id_ver=(11, 2)) + if hls_streams: + # m3u8_native leads to intermittent ContentTooShortError + formats.extend(self._extract_m3u8_formats( + hls_streams[0]['url'], video_id, ext='mp4', m3u8_id='hls')) + + ''' + # DASH streams handling is incomplete as 'url' is missing + dash_streams = self._get_streams(url, video_id, app_id_ver=(3, 1)) + if dash_streams: + formats.extend(self._parse_segmented_mp4(dash_streams)) + ''' + self._sort_formats(formats) description = video.get('description') @@ -109,7 +244,7 @@ class UstreamChannelIE(InfoExtractor): 
- _VALID_URL = r'https?://www\.ustream\.tv/channel/(?P<slug>.+)' + _VALID_URL = r'https?://(?:www\.)?ustream\.tv/channel/(?P<slug>.+)' IE_NAME = 'ustream:channel' _TEST = { 'url': 'http://www.ustream.tv/channel/channeljapan', diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/ustudio.py youtube-dl-2019.05.20/youtube_dl/extractor/ustudio.py --- youtube-dl-2016.02.22/youtube_dl/extractor/ustudio.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/ustudio.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,125 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + unified_strdate, + unescapeHTML, +) + + +class UstudioIE(InfoExtractor): + IE_NAME = 'ustudio' + _VALID_URL = r'https?://(?:(?:www|v1)\.)?ustudio\.com/video/(?P<id>[^/]+)/(?P<display_id>[^/?#&]+)' + _TEST = { + 'url': 'http://ustudio.com/video/Uxu2my9bgSph/san_francisco_golden_gate_bridge', + 'md5': '58bbfca62125378742df01fc2abbdef6', + 'info_dict': { + 'id': 'Uxu2my9bgSph', + 'display_id': 'san_francisco_golden_gate_bridge', + 'ext': 'mp4', + 'title': 'San Francisco: Golden Gate Bridge', + 'description': 'md5:23925500697f2c6d4830e387ba51a9be', + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20111107', + 'uploader': 'Tony Farley', + } + } + + def _real_extract(self, url): + video_id, display_id = re.match(self._VALID_URL, url).groups() + + config = self._download_xml( + 'http://v1.ustudio.com/embed/%s/ustudio/config.xml' % video_id, + display_id) + + def extract(kind): + return [{ + 'url': unescapeHTML(item.attrib['url']), + 'width': int_or_none(item.get('width')), + 'height': int_or_none(item.get('height')), + } for item in config.findall('./qualities/quality/%s' % kind) if item.get('url')] + + formats = extract('video') + self._sort_formats(formats) + + webpage = self._download_webpage(url, display_id) + + title = self._og_search_title(webpage) + upload_date = unified_strdate(self._search_regex( + r'(?s)Uploaded by\s*.+?\s*on\s*<span>([^<]+)</span>', + webpage, 'upload date', fatal=False)) + uploader = self._search_regex( + r'Uploaded by\s*<a[^>]*>([^<]+)<', + webpage, 'uploader', fatal=False) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': self._og_search_description(webpage), + 'thumbnails': extract('image'), + 'upload_date': upload_date, + 'uploader': uploader, + 'formats': formats, + } + + +class UstudioEmbedIE(InfoExtractor): + IE_NAME = 'ustudio:embed' + _VALID_URL = r'https?://(?:(?:app|embed)\.)?ustudio\.com/embed/(?P<uid>[^/]+)/(?P<id>[^/]+)' + _TEST = { + 'url': 'http://app.ustudio.com/embed/DeN7VdYRDKhP/Uw7G1kMCe65T', + 'md5': '47c0be52a09b23a7f40de9469cec58f4', + 'info_dict': { + 'id': 'Uw7G1kMCe65T', + 'ext': 'mp4', + 'title': '5 Things IT Should Know About Video', + 'description': 'md5:93d32650884b500115e158c5677d25ad', + 'uploader_id': 'DeN7VdYRDKhP', + } + } + + def _real_extract(self, url): + uploader_id, video_id = re.match(self._VALID_URL, url).groups() + video_data = self._download_json( + 'http://app.ustudio.com/embed/%s/%s/config.json' % (uploader_id, video_id), + video_id)['videos'][0] + title = video_data['name'] + + formats = [] + for ext, qualities in video_data.get('transcodes', {}).items(): + for quality in qualities: + quality_url = quality.get('url') + if not quality_url: + continue + height = int_or_none(quality.get('height')) + formats.append({ + 'format_id': '%s-%dp' % (ext, height) if height else ext, + 'url': quality_url, + 
'width': int_or_none(quality.get('width')), + 'height': height, + }) + self._sort_formats(formats) + + thumbnails = [] + for image in video_data.get('images', []): + image_url = image.get('url') + if not image_url: + continue + thumbnails.append({ + 'url': image_url, + }) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('description'), + 'duration': int_or_none(video_data.get('duration')), + 'uploader_id': uploader_id, + 'tags': video_data.get('keywords'), + 'thumbnails': thumbnails, + 'formats': formats, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/varzesh3.py youtube-dl-2019.05.20/youtube_dl/extractor/varzesh3.py --- youtube-dl-2016.02.22/youtube_dl/extractor/varzesh3.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/varzesh3.py 2019-06-07 18:49:07.000000000 +0000 @@ -2,11 +2,19 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_urlparse, + compat_parse_qs, +) +from ..utils import ( + clean_html, + remove_start, +) class Varzesh3IE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?video\.varzesh3\.com/(?:[^/]+/)+(?P<id>[^/]+)/?' - _TEST = { + _TESTS = [{ 'url': 'http://video.varzesh3.com/germany/bundesliga/5-%D9%88%D8%A7%DA%A9%D9%86%D8%B4-%D8%A8%D8%B1%D8%AA%D8%B1-%D8%AF%D8%B1%D9%88%D8%A7%D8%B2%D9%87%E2%80%8C%D8%A8%D8%A7%D9%86%D8%A7%D9%86%D8%9B%D9%87%D9%81%D8%AA%D9%87-26-%D8%A8%D9%88%D9%86%D8%AF%D8%B3/', 'md5': '2a933874cb7dce4366075281eb49e855', 'info_dict': { @@ -14,9 +22,20 @@ 'ext': 'mp4', 'title': '۵ واکنش برتر دروازه‌بانان؛هفته ۲۶ بوندسلیگا', 'description': 'فصل ۲۰۱۵-۲۰۱۴', - 'thumbnail': 're:^https?://.*\.jpg$', - } - } + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'skip': 'HTTP 404 Error', + }, { + 'url': 'http://video.varzesh3.com/video/112785/%D8%AF%D9%84%D9%87-%D8%B9%D9%84%DB%8C%D8%9B-%D8%B3%D8%AA%D8%A7%D8%B1%D9%87-%D9%86%D9%88%D8%B8%D9%87%D9%88%D8%B1-%D9%84%DB%8C%DA%AF-%D8%A8%D8%B1%D8%AA%D8%B1-%D8%AC%D8%B2%DB%8C%D8%B1%D9%87', + 'md5': '841b7cd3afbc76e61708d94e53a4a4e7', + 'info_dict': { + 'id': '112785', + 'ext': 'mp4', + 'title': 'دله علی؛ ستاره نوظهور لیگ برتر جزیره', + 'description': 'فوتبال 120', + }, + 'expected_warnings': ['description'], + }] def _real_extract(self, url): display_id = self._match_id(url) @@ -26,15 +45,30 @@ video_url = self._search_regex( r'<source[^>]+src="([^"]+)"', webpage, 'video url') - title = self._og_search_title(webpage) + title = remove_start(self._html_search_regex( + r'<title>([^<]+)', webpage, 'title'), 'ویدیو ورزش 3 | ') + description = self._html_search_regex( r'(?s)
<div class="matn">(.+?)</div>', - webpage, 'description', fatal=False) - thumbnail = self._og_search_thumbnail(webpage) + webpage, 'description', default=None) + if description is None: + description = clean_html(self._html_search_meta('description', webpage)) + + thumbnail = self._og_search_thumbnail(webpage, default=None) + if thumbnail is None: + fb_sharer_url = self._search_regex( + r'<a[^>]+href="(https?://www\.facebook\.com/sharer/sharer\.php?[^"]+)"', + webpage, 'facebook sharer URL', fatal=False) + sharer_params = compat_parse_qs(compat_urllib_parse_urlparse(fb_sharer_url).query) + thumbnail = sharer_params.get('p[images][0]', [None])[0] video_id = self._search_regex( r"<link[^>]+rel='(?:canonical|shortlink)'[^>]+href='/\?p=([^']+)'", - webpage, display_id, default=display_id) + webpage, display_id, default=None) + if video_id is None: + video_id = self._search_regex( + r'var\s+VideoId\s*=\s*(\d+);', webpage, 'video id', + default=display_id) return { 'url': video_url,
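When the Open Graph tags are missing, the new varzesh3 code above recovers the thumbnail from the page's Facebook sharer link. A rough standalone sketch of that fallback, using the Python 3 stdlib in place of the compat wrappers (the function name and sample URL are invented for illustration):

    from urllib.parse import parse_qs, urlparse

    def thumbnail_from_sharer(fb_sharer_url):
        # parse_qs maps each query key to a list of values; the sharer
        # link carries the preview image under the 'p[images][0]' key.
        params = parse_qs(urlparse(fb_sharer_url).query)
        return params.get('p[images][0]', [None])[0]

    print(thumbnail_from_sharer(
        'https://www.facebook.com/sharer/sharer.php'
        '?p[images][0]=http%3A%2F%2Fexample.com%2Fthumb.jpg'))
    # -> http://example.com/thumb.jpg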
diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/vbox7.py youtube-dl-2019.05.20/youtube_dl/extractor/vbox7.py --- youtube-dl-2016.02.22/youtube_dl/extractor/vbox7.py 2016-02-14 22:26:17.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/vbox7.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,20 +1,43 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse, - compat_urlparse, -) -from ..utils import ( - ExtractorError, - sanitized_Request, -) +from ..utils import ExtractorError class Vbox7IE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?vbox7\.com/play:(?P<id>[^/]+)' - _TEST = { + _VALID_URL = r'''(?x) + https?:// + (?:[^/]+\.)?vbox7\.com/ + (?: + play:| + (?: + emb/external\.php| + player/ext\.swf + )\?.*?\bvid= + ) + (?P<id>[\da-fA-F]+) + ''' + _GEO_COUNTRIES = ['BG'] + _TESTS = [{ + 'url': 'http://vbox7.com/play:0946fff23c', + 'md5': 'a60f9ab3a3a2f013ef9a967d5f7be5bf', + 'info_dict': { + 'id': '0946fff23c', + 'ext': 'mp4', + 'title': 'Борисов: Притеснен съм за бъдещето на България', + 'description': 'По думите му е опасно страната ни да бъде обявена за "сигурна"', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1470982814, + 'upload_date': '20160812', + 'uploader': 'zdraveibulgaria', + }, + 'params': { + 'proxy': '127.0.0.1:8118', + }, + }, { 'url': 'http://vbox7.com/play:249bb972c2', 'md5': '99f65c0c9ef9b682b97313e052734c3f', 'info_dict': { @@ -22,43 +45,61 @@ 'ext': 'mp4', 'title': 'Смях! Чудо - чист за секунди - Скрита камера', - } + 'skip': 'georestricted', + }, { + 'url': 'http://vbox7.com/emb/external.php?vid=a240d20f9c&autoplay=1', + 'only_matching': True, + }, { + 'url': 'http://i49.vbox7.com/player/ext.swf?vid=0946fff23c&autoplay=1', + 'only_matching': True, + }] + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+src=(?P<q>["\'])(?P<url>(?:https?:)?//vbox7\.com/emb/external\.php.+?)(?P=q)', + webpage) + if mobj: + return mobj.group('url')
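The new _extract_url helper is what lets the generic extractor hand embedded players over to Vbox7IE. How the regex behaves on typical embed markup (the sample HTML is invented, modelled on the test URLs above):

    import re

    EMBED_RE = r'<iframe[^>]+src=(?P<q>["\'])(?P<url>(?:https?:)?//vbox7\.com/emb/external\.php.+?)(?P=q)'
    page = '<iframe src="//vbox7.com/emb/external.php?vid=a240d20f9c&autoplay=1"></iframe>'
    m = re.search(EMBED_RE, page)
    print(m.group('url'))
    # -> //vbox7.com/emb/external.php?vid=a240d20f9c&autoplay=1

Protocol-relative src values are returned as-is; the back-reference (?P=q) makes the pattern tolerate either quote style around the attribute.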
 def _real_extract(self, url): video_id = self._match_id(url) - # need to get the page 3 times for the correct jsSecretToken cookie - # which is necessary for the correct title - def get_session_id(): - redirect_page = self._download_webpage(url, video_id) - session_id_url = self._search_regex( - r'var\s*url\s*=\s*\'([^\']+)\';', redirect_page, - 'session id url') - self._download_webpage( - compat_urlparse.urljoin(url, session_id_url), video_id, - 'Getting session id') - - get_session_id() - get_session_id() - - webpage = self._download_webpage(url, video_id, - 'Downloading redirect page') - - title = self._html_search_regex(r'<title>(.*)</title>', - webpage, 'title').split('/')[0].strip() - - info_url = 'http://vbox7.com/play/magare.do' - data = compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id}) - info_request = sanitized_Request(info_url, data) - info_request.add_header('Content-Type', 'application/x-www-form-urlencoded') - info_response = self._download_webpage(info_request, video_id, 'Downloading info webpage') - if info_response is None: - raise ExtractorError('Unable to extract the media url') - (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&')) + response = self._download_json( + 'https://www.vbox7.com/ajax/video/nextvideo.php?vid=%s' % video_id, + video_id) + + if 'error' in response: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, response['error']), expected=True) + + video = response['options'] + + title = video['title'] + video_url = video['src'] + + if '/na.mp4' in video_url: + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + + uploader = video.get('uploader') + + webpage = self._download_webpage( + 'http://vbox7.com/play:%s' % video_id, video_id, fatal=None) + + info = {} + + if webpage: + info = self._search_json_ld( + webpage.replace('"/*@context"', '"@context"'), video_id, + fatal=False) - return { + info.update({ 'id': video_id, - 'url': final_url, 'title': title, - 'thumbnail': thumbnail_url, - } + 'url': video_url, + 'uploader': uploader, + 'thumbnail': self._proto_relative_url( + info.get('thumbnail') or self._og_search_thumbnail(webpage), + 'http:'), + }) + return info diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/veehd.py youtube-dl-2019.05.20/youtube_dl/extractor/veehd.py --- youtube-dl-2016.02.22/youtube_dl/extractor/veehd.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/veehd.py 2019-06-07 18:49:07.000000000 +0000 @@ -54,7 +54,7 @@ video_id = self._match_id(url) # VeeHD seems to send garbage on the first request.
- # See https://github.com/rg3/youtube-dl/issues/2102 + # See https://github.com/ytdl-org/youtube-dl/issues/2102 self._download_webpage(url, video_id, 'Requesting webpage') webpage = self._download_webpage(url, video_id) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/veoh.py youtube-dl-2019.05.20/youtube_dl/extractor/veoh.py --- youtube-dl-2016.02.22/youtube_dl/extractor/veoh.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/veoh.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,80 +1,57 @@ from __future__ import unicode_literals -import re -import json - from .common import InfoExtractor from ..utils import ( int_or_none, - ExtractorError, - sanitized_Request, + parse_duration, + qualities, ) class VeohIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?veoh\.com/(?:watch|iphone/#_Watch)/(?P(?:v|yapi-)[\da-zA-Z]+)' + _VALID_URL = r'https?://(?:www\.)?veoh\.com/(?:watch|embed|iphone/#_Watch)/(?P(?:v|e|yapi-)[\da-zA-Z]+)' - _TESTS = [ - { - 'url': 'http://www.veoh.com/watch/v56314296nk7Zdmz3', - 'md5': '620e68e6a3cff80086df3348426c9ca3', - 'info_dict': { - 'id': '56314296', - 'ext': 'mp4', - 'title': 'Straight Backs Are Stronger', - 'uploader': 'LUMOback', - 'description': 'At LUMOback, we believe straight backs are stronger. The LUMOback Posture & Movement Sensor: It gently vibrates when you slouch, inspiring improved posture and mobility. Use the app to track your data and improve your posture over time. ', - }, + _TESTS = [{ + 'url': 'http://www.veoh.com/watch/v56314296nk7Zdmz3', + 'md5': '9e7ecc0fd8bbee7a69fe38953aeebd30', + 'info_dict': { + 'id': 'v56314296nk7Zdmz3', + 'ext': 'mp4', + 'title': 'Straight Backs Are Stronger', + 'uploader': 'LUMOback', + 'description': 'At LUMOback, we believe straight backs are stronger. The LUMOback Posture & Movement Sensor: It gently vibrates when you slouch, inspiring improved posture and mobility. Use the app to track your data and improve your posture over time. 
', }, - { - 'url': 'http://www.veoh.com/watch/v27701988pbTc4wzN?h1=Chile+workers+cover+up+to+avoid+skin+damage', - 'md5': '4a6ff84b87d536a6a71e6aa6c0ad07fa', - 'info_dict': { - 'id': '27701988', - 'ext': 'mp4', - 'title': 'Chile workers cover up to avoid skin damage', - 'description': 'md5:2bd151625a60a32822873efc246ba20d', - 'uploader': 'afp-news', - 'duration': 123, - }, + }, { + 'url': 'http://www.veoh.com/embed/v56314296nk7Zdmz3', + 'only_matching': True, + }, { + 'url': 'http://www.veoh.com/watch/v27701988pbTc4wzN?h1=Chile+workers+cover+up+to+avoid+skin+damage', + 'md5': '4a6ff84b87d536a6a71e6aa6c0ad07fa', + 'info_dict': { + 'id': '27701988', + 'ext': 'mp4', + 'title': 'Chile workers cover up to avoid skin damage', + 'description': 'md5:2bd151625a60a32822873efc246ba20d', + 'uploader': 'afp-news', + 'duration': 123, }, - { - 'url': 'http://www.veoh.com/watch/v69525809F6Nc4frX', - 'md5': '4fde7b9e33577bab2f2f8f260e30e979', - 'note': 'Embedded ooyala video', - 'info_dict': { - 'id': '69525809', - 'ext': 'mp4', - 'title': 'Doctors Alter Plan For Preteen\'s Weight Loss Surgery', - 'description': 'md5:f5a11c51f8fb51d2315bca0937526891', - 'uploader': 'newsy-videos', - }, - 'skip': 'This video has been deleted.', + 'skip': 'This video has been deleted.', + }, { + 'url': 'http://www.veoh.com/watch/v69525809F6Nc4frX', + 'md5': '4fde7b9e33577bab2f2f8f260e30e979', + 'note': 'Embedded ooyala video', + 'info_dict': { + 'id': '69525809', + 'ext': 'mp4', + 'title': 'Doctors Alter Plan For Preteen\'s Weight Loss Surgery', + 'description': 'md5:f5a11c51f8fb51d2315bca0937526891', + 'uploader': 'newsy-videos', }, - ] - - def _extract_formats(self, source): - formats = [] - link = source.get('aowPermalink') - if link: - formats.append({ - 'url': link, - 'ext': 'mp4', - 'format_id': 'aow', - }) - link = source.get('fullPreviewHashLowPath') - if link: - formats.append({ - 'url': link, - 'format_id': 'low', - }) - link = source.get('fullPreviewHashHighPath') - if link: - formats.append({ - 'url': link, - 'format_id': 'high', - }) - return formats + 'skip': 'This video has been deleted.', + }, { + 'url': 'http://www.veoh.com/watch/e152215AJxZktGS', + 'only_matching': True, + }] def _extract_video(self, source): return { @@ -90,38 +67,37 @@ } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - if video_id.startswith('v'): - rsp = self._download_xml( - r'http://www.veoh.com/api/findByPermalink?permalink=%s' % video_id, video_id, 'Downloading video XML') - stat = rsp.get('stat') - if stat == 'ok': - return self._extract_video(rsp.find('./videoList/video')) - elif stat == 'fail': - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, rsp.find('./errorList/error').get('errorMessage')), expected=True) - - webpage = self._download_webpage(url, video_id) - age_limit = 0 - if 'class="adultwarning-container"' in webpage: - self.report_age_confirmation() - age_limit = 18 - request = sanitized_Request(url) - request.add_header('Cookie', 'confirmedAdult=true') - webpage = self._download_webpage(request, video_id) - - m_youtube = re.search(r'http://www\.youtube\.com/v/(.*?)(\&|"|\?)', webpage) - if m_youtube is not None: - youtube_id = m_youtube.group(1) - self.to_screen('%s: detected Youtube video.' 
% video_id) - return self.url_result(youtube_id, 'Youtube') + video_id = self._match_id(url) + video = self._download_json( + 'https://www.veoh.com/watch/getVideo/' + video_id, + video_id)['video'] title = video['title'] - info = json.loads( - self._search_regex(r'videoDetailsJSON = \'({.*?})\';', webpage, 'info').replace('\\\'', '\'')) - - video = self._extract_video(info) - video['age_limit'] = age_limit + thumbnail_url = None + q = qualities(['HQ', 'Regular']) + formats = [] + for f_id, f_url in video.get('src', {}).items(): + if not f_url: + continue + if f_id == 'poster': + thumbnail_url = f_url + else: + formats.append({ + 'format_id': f_id, + 'quality': q(f_id), + 'url': f_url, + }) + self._sort_formats(formats) - return video + return { + 'id': video_id, + 'title': title, + 'description': video.get('description'), + 'thumbnail': thumbnail_url, + 'uploader': video.get('author', {}).get('nickname'), + 'duration': int_or_none(video.get('lengthBySec')) or parse_duration(video.get('length')), + 'view_count': int_or_none(video.get('views')), + 'formats': formats, + 'average_rating': int_or_none(video.get('rating')), + 'comment_count': int_or_none(video.get('numOfComments')), + }
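The qualities() helper imported at the top of the new veoh.py turns an ordered list of format ids into a preference function; the helper in youtube_dl/utils.py behaves like this sketch:

    def qualities(quality_ids):
        # The further right an id sits in quality_ids, the higher it
        # ranks; ids not in the list get -1 and sort below everything.
        def q(qid):
            try:
                return quality_ids.index(qid)
            except ValueError:
                return -1
        return q

    q = qualities(['HQ', 'Regular'])
    assert q('Regular') > q('HQ') > q('unknown')

So with the list used here, a 'Regular' source ends up ranked above 'HQ' when the formats are sorted on their 'quality' key.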
diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/vessel.py youtube-dl-2019.05.20/youtube_dl/extractor/vessel.py --- youtube-dl-2016.02.22/youtube_dl/extractor/vessel.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/vessel.py 2019-06-07 18:49:07.000000000 +0000 @@ -2,6 +2,7 @@ from __future__ import unicode_literals import json +import re from .common import InfoExtractor from ..utils import ( @@ -12,23 +13,38 @@ class VesselIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vessel\.com/videos/(?P<id>[0-9a-zA-Z]+)' + _VALID_URL = r'https?://(?:www\.)?vessel\.com/(?:videos|embed)/(?P<id>[0-9a-zA-Z-_]+)' _API_URL_TEMPLATE = 'https://www.vessel.com/api/view/items/%s' _LOGIN_URL = 'https://www.vessel.com/api/account/login' _NETRC_MACHINE = 'vessel' - _TEST = { + _TESTS = [{ 'url': 'https://www.vessel.com/videos/HDN7G5UMs', 'md5': '455cdf8beb71c6dd797fd2f3818d05c4', 'info_dict': { 'id': 'HDN7G5UMs', 'ext': 'mp4', 'title': 'Nvidia GeForce GTX Titan X - The Best Video Card on the Market?', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20150317', 'description': 'Did Nvidia pull out all the stops on the Titan X, or does its performance leave something to be desired?', 'timestamp': int, }, - } + }, { + 'url': 'https://www.vessel.com/embed/G4U7gUJ6a?w=615&h=346', + 'only_matching': True, + }, { + 'url': 'https://www.vessel.com/videos/F01_dsLj1', + 'only_matching': True, + }, { + 'url': 'https://www.vessel.com/videos/RRX-sir-J', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [url for _, url in re.findall( + r'<iframe[^>]+src=(["\'])((?:https?:)?//(?:www\.)?vessel\.com/embed/[0-9a-zA-Z-_]+.*?)\1', + webpage)] @staticmethod def make_json_request(url, data): @@ -59,7 +75,7 @@ 'Access to this content is restricted. (%s said: %s)' % (self.IE_NAME, err_code), expected=True) def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return self.report_login() @@ -98,16 +114,24 @@ formats = [] for f in video_asset.get('sources', []): - if f['name'] == 'hls-index': + location = f.get('location') + if not location: + continue + name = f.get('name') + if name == 'hls-index': formats.extend(self._extract_m3u8_formats( - f['location'], video_id, ext='mp4', m3u8_id='m3u8')) + location, video_id, ext='mp4', + entry_protocol='m3u8_native', m3u8_id='m3u8', fatal=False)) + elif name == 'dash-index': + formats.extend(self._extract_mpd_formats( + location, video_id, mpd_id='dash', fatal=False)) else: formats.append({ - 'format_id': f['name'], + 'format_id': name, 'tbr': f.get('bitrate'), 'height': f.get('height'), 'width': f.get('width'), - 'url': f['location'], + 'url': location, }) self._sort_formats(formats) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/vesti.py youtube-dl-2019.05.20/youtube_dl/extractor/vesti.py --- youtube-dl-2016.02.22/youtube_dl/extractor/vesti.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/vesti.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re @@ -10,7 +10,7 @@ class VestiIE(InfoExtractor): IE_DESC = 'Вести.Ru' - _VALID_URL = r'http://(?:.+?\.)?vesti\.ru/(?P<id>.+)' + _VALID_URL = r'https?://(?:.+?\.)?vesti\.ru/(?P<id>.+)' _TESTS = [ { diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/vevo.py youtube-dl-2019.05.20/youtube_dl/extractor/vevo.py --- youtube-dl-2016.02.22/youtube_dl/extractor/vevo.py 2016-02-09 11:57:41.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/vevo.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,24 +1,37 @@ from __future__ import unicode_literals import re +import json from .common import InfoExtractor -from ..compat import compat_etree_fromstring +from ..compat import ( + compat_str, + compat_urlparse, + compat_HTTPError, +) from ..utils import ( ExtractorError, int_or_none, - sanitized_Request, parse_iso8601, ) -class VevoIE(InfoExtractor): +class VevoBaseIE(InfoExtractor): + def _extract_json(self, webpage, video_id): + return self._parse_json( + self._search_regex( + r'window\.__INITIAL_STORE__\s*=\s*({.+?});\s*</script>', + webpage, 'initial store'), + video_id) + + +class VevoIE(VevoBaseIE): ''' Accepts urls from vevo.com or in the format 'vevo:{id}' (currently used by MTVIE and MySpaceIE) ''' _VALID_URL = r'''(?x) - (?:https?://www\.vevo\.com/watch/(?:[^/]+/(?:[^/]+/)?)?| + (?:https?://(?:www\.)?vevo\.com/watch/(?!playlist|genre)(?:[^/]+/(?:[^/]+/)?)?| https?://cache\.vevo\.com/m/html/embed\.html\?video=| https?://videoplayer\.vevo\.com/embed/embedded\?videoId=| vevo:) @@ -30,11 +43,15 @@ 'info_dict': { 'id': 'GB1101300280', 'ext': 'mp4', - 'title': 'Somebody to Die For', + 'title': 'Hurts - Somebody to Die For', + 'timestamp': 1372057200, 'upload_date': '20130624', 'uploader': 'Hurts', - 'timestamp': 1372057200, + 'track': 'Somebody to Die For', + 'artist': 'Hurts', + 'genre': 'Pop', }, + 'expected_warnings': ['Unable to download SMIL file', 'Unable to download info'], }, { 'note': 'v3 SMIL format', 'url': 'http://www.vevo.com/watch/cassadee-pope/i-wish-i-could-break-your-heart/USUV71302923', @@ -42,23 +59,31 @@ 'info_dict': { 'id': 'USUV71302923', 'ext': 'mp4', - 'title': 'I Wish I Could Break Your Heart', + 'title': 'Cassadee Pope - I Wish I Could Break
Your Heart', + 'timestamp': 1392796919, 'upload_date': '20140219', 'uploader': 'Cassadee Pope', - 'timestamp': 1392796919, + 'track': 'I Wish I Could Break Your Heart', + 'artist': 'Cassadee Pope', + 'genre': 'Country', }, + 'expected_warnings': ['Unable to download SMIL file', 'Unable to download info'], }, { 'note': 'Age-limited video', 'url': 'https://www.vevo.com/watch/justin-timberlake/tunnel-vision-explicit/USRV81300282', 'info_dict': { 'id': 'USRV81300282', 'ext': 'mp4', - 'title': 'Tunnel Vision (Explicit)', - 'upload_date': '20130703', + 'title': 'Justin Timberlake - Tunnel Vision (Explicit)', 'age_limit': 18, - 'uploader': 'Justin Timberlake', 'timestamp': 1372888800, + 'upload_date': '20130703', + 'uploader': 'Justin Timberlake', + 'track': 'Tunnel Vision (Explicit)', + 'artist': 'Justin Timberlake', + 'genre': 'Pop', }, + 'expected_warnings': ['Unable to download SMIL file', 'Unable to download info'], }, { 'note': 'No video_info', 'url': 'http://www.vevo.com/watch/k-camp-1/Till-I-Die/USUV71503000', @@ -66,28 +91,60 @@ 'info_dict': { 'id': 'USUV71503000', 'ext': 'mp4', - 'title': 'Till I Die', - 'upload_date': '20151207', + 'title': 'K Camp ft. T.I. - Till I Die', 'age_limit': 18, - 'uploader': 'K Camp', 'timestamp': 1449468000, + 'upload_date': '20151207', + 'uploader': 'K Camp', + 'track': 'Till I Die', + 'artist': 'K Camp', + 'genre': 'Hip-Hop', + }, + 'expected_warnings': ['Unable to download SMIL file', 'Unable to download info'], + }, { + 'note': 'Featured test', + 'url': 'https://www.vevo.com/watch/lemaitre/Wait/USUV71402190', + 'md5': 'd28675e5e8805035d949dc5cf161071d', + 'info_dict': { + 'id': 'USUV71402190', + 'ext': 'mp4', + 'title': 'Lemaitre ft. LoLo - Wait', + 'age_limit': 0, + 'timestamp': 1413432000, + 'upload_date': '20141016', + 'uploader': 'Lemaitre', + 'track': 'Wait', + 'artist': 'Lemaitre', + 'genre': 'Electronic', }, + 'expected_warnings': ['Unable to download SMIL file', 'Unable to download info'], + }, { + 'note': 'Only available via webpage', + 'url': 'http://www.vevo.com/watch/GBUV71600656', + 'md5': '67e79210613865b66a47c33baa5e37fe', + 'info_dict': { + 'id': 'GBUV71600656', + 'ext': 'mp4', + 'title': 'ABC - Viva Love', + 'age_limit': 0, + 'timestamp': 1461830400, + 'upload_date': '20160428', + 'uploader': 'ABC', + 'track': 'Viva Love', + 'artist': 'ABC', + 'genre': 'Pop', + }, + 'expected_warnings': ['Failed to download video versions info'], + }, { + # no genres available + 'url': 'http://www.vevo.com/watch/INS171400764', + 'only_matching': True, + }, { + # Another case available only via the webpage; using streams/streamsV3 formats + # Geo-restricted to Netherlands/Germany + 'url': 'http://www.vevo.com/watch/boostee/pop-corn-clip-officiel/FR1A91600909', + 'only_matching': True, }] - _SMIL_BASE_URL = 'http://smil.lvl3.vevo.com' - _SOURCE_TYPES = { - 0: 'youtube', - 1: 'brightcove', - 2: 'http', - 3: 'hls_ios', - 4: 'hls', - 5: 'smil', # http - 7: 'f4m_cc', - 8: 'f4m_ak', - 9: 'f4m_l3', - 10: 'ism', - 13: 'smil', # rtmp - 18: 'dash', - } _VERSIONS = { 0: 'youtube', # only in AuthenticateVideo videoVersions 1: 'level3', @@ -96,189 +153,130 @@ 4: 'amazon', } - def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): - formats = [] - els = smil.findall('.//{http://www.w3.org/2001/SMIL20/Language}video') - for el in els: - src = el.attrib['src'] - m = re.match(r'''(?xi) - (?P[a-z0-9]+): - (?P - [/a-z0-9]+ # The directory and main part of the URL - _(?P[0-9]+)k - _(?P[0-9]+)x(?P[0-9]+) - 
_(?P[a-z0-9]+) - _(?P[0-9]+) - _(?P[a-z0-9]+) - _(?P[0-9]+) - \.[a-z0-9]+ # File extension - )''', src) - if not m: - continue - - format_url = self._SMIL_BASE_URL + m.group('path') - formats.append({ - 'url': format_url, - 'format_id': 'smil_' + m.group('tbr'), - 'vcodec': m.group('vcodec'), - 'acodec': m.group('acodec'), - 'tbr': int(m.group('tbr')), - 'vbr': int(m.group('vbr')), - 'abr': int(m.group('abr')), - 'ext': m.group('ext'), - 'width': int(m.group('width')), - 'height': int(m.group('height')), - }) - return formats - def _initialize_api(self, video_id): - req = sanitized_Request( - 'http://www.vevo.com/auth', data=b'') webpage = self._download_webpage( - req, None, + 'https://accounts.vevo.com/token', None, note='Retrieving oauth token', - errnote='Unable to retrieve oauth token') + errnote='Unable to retrieve oauth token', + data=json.dumps({ + 'client_id': 'SPupX1tvqFEopQ1YS6SS', + 'grant_type': 'urn:vevo:params:oauth:grant-type:anonymous', + }).encode('utf-8'), + headers={ + 'Content-Type': 'application/json', + }) - if 'THIS PAGE IS CURRENTLY UNAVAILABLE IN YOUR REGION' in webpage: - raise ExtractorError( - '%s said: This page is currently unavailable in your region.' % self.IE_NAME, expected=True) + if re.search(r'(?i)THIS PAGE IS CURRENTLY UNAVAILABLE IN YOUR REGION', webpage): + self.raise_geo_restricted( + '%s said: This page is currently unavailable in your region' % self.IE_NAME) auth_info = self._parse_json(webpage, video_id) - self._api_url_template = self.http_scheme() + '//apiv2.vevo.com/%s?token=' + auth_info['access_token'] + self._api_url_template = self.http_scheme() + '//apiv2.vevo.com/%s?token=' + auth_info['legacy_token'] - def _call_api(self, path, video_id, note, errnote, fatal=True): - return self._download_json(self._api_url_template % path, video_id, note, errnote) + def _call_api(self, path, *args, **kwargs): + try: + data = self._download_json(self._api_url_template % path, *args, **kwargs) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError): + errors = self._parse_json(e.cause.read().decode(), None)['errors'] + error_message = ', '.join([error['message'] for error in errors]) + raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message), expected=True) + raise + return data def _real_extract(self, url): video_id = self._match_id(url) - json_url = 'http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id - response = self._download_json( - json_url, video_id, 'Downloading video info', 'Unable to download info') - video_info = response.get('video') or {} - video_versions = video_info.get('videoVersions') + self._initialize_api(video_id) + + video_info = self._call_api( + 'video/%s' % video_id, video_id, 'Downloading api video info', + 'Failed to download video info') + + video_versions = self._call_api( + 'video/%s/streams' % video_id, video_id, + 'Downloading video versions info', + 'Failed to download video versions info', + fatal=False) + + # Some videos are only available via webpage (e.g. 
+ # https://github.com/ytdl-org/youtube-dl/issues/9366) + if not video_versions: + webpage = self._download_webpage(url, video_id) + json_data = self._extract_json(webpage, video_id) + if 'streams' in json_data.get('default', {}): + video_versions = json_data['default']['streams'][video_id][0] + else: + video_versions = [ + value + for key, value in json_data['apollo']['data'].items() + if key.startswith('%s.streams' % video_id)] + uploader = None - timestamp = None - view_count = None + artist = None + featured_artist = None + artists = video_info.get('artists') + for curr_artist in artists: + if curr_artist.get('role') == 'Featured': + featured_artist = curr_artist['name'] + else: + artist = uploader = curr_artist['name'] + formats = [] + for video_version in video_versions: + version = self._VERSIONS.get(video_version.get('version'), 'generic') + version_url = video_version.get('url') + if not version_url: + continue - if not video_info: - if response.get('statusCode') != 909: - ytid = response.get('errorInfo', {}).get('ytid') - if ytid: - self.report_warning( - 'Video is geoblocked, trying with the YouTube video %s' % ytid) - return self.url_result(ytid, 'Youtube', ytid) - - if 'statusMessage' in response: - raise ExtractorError('%s said: %s' % ( - self.IE_NAME, response['statusMessage']), expected=True) - raise ExtractorError('Unable to extract videos') - - self._initialize_api(video_id) - video_info = self._call_api( - 'video/%s' % video_id, video_id, 'Downloading api video info', - 'Failed to download video info') - - video_versions = self._call_api( - 'video/%s/streams' % video_id, video_id, - 'Downloading video versions info', - 'Failed to download video versions info') - - timestamp = parse_iso8601(video_info.get('releaseDate')) - artists = video_info.get('artists') - if artists: - uploader = artists[0]['name'] - view_count = int_or_none(video_info.get('views', {}).get('total')) - - for video_version in video_versions: - version = self._VERSIONS.get(video_version['version']) - version_url = video_version.get('url') - if not version_url: + if '.ism' in version_url: + continue + elif '.mpd' in version_url: + formats.extend(self._extract_mpd_formats( + version_url, video_id, mpd_id='dash-%s' % version, + note='Downloading %s MPD information' % version, + errnote='Failed to download %s MPD information' % version, + fatal=False)) + elif '.m3u8' in version_url: + formats.extend(self._extract_m3u8_formats( + version_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls-%s' % version, + note='Downloading %s m3u8 information' % version, + errnote='Failed to download %s m3u8 information' % version, + fatal=False)) + else: + m = re.search(r'''(?xi) + _(?P[0-9]+)x(?P[0-9]+) + _(?P[a-z0-9]+) + _(?P[0-9]+) + _(?P[a-z0-9]+) + _(?P[0-9]+) + \.(?P[a-z0-9]+)''', version_url) + if not m: continue - if '.ism' in version_url: - continue - elif '.mpd' in version_url: - formats.extend(self._extract_mpd_formats( - version_url, video_id, mpd_id='dash-%s' % version, - note='Downloading %s MPD information' % version, - errnote='Failed to download %s MPD information' % version, - fatal=False)) - elif '.m3u8' in version_url: - formats.extend(self._extract_m3u8_formats( - version_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls-%s' % version, - note='Downloading %s m3u8 information' % version, - errnote='Failed to download %s m3u8 information' % version, - fatal=False)) - else: - m = re.search(r'''(?xi) - _(?P[0-9]+)x(?P[0-9]+) - _(?P[a-z0-9]+) - _(?P[0-9]+) - _(?P[a-z0-9]+) - _(?P[0-9]+) - 
\.(?P[a-z0-9]+)''', version_url) - if not m: - continue - - formats.append({ - 'url': version_url, - 'format_id': 'http-%s-%s' % (version, video_version['quality']), - 'vcodec': m.group('vcodec'), - 'acodec': m.group('acodec'), - 'vbr': int(m.group('vbr')), - 'abr': int(m.group('abr')), - 'ext': m.group('ext'), - 'width': int(m.group('width')), - 'height': int(m.group('height')), - }) - else: - timestamp = int_or_none(self._search_regex( - r'/Date\((\d+)\)/', - video_info['releaseDate'], 'release date', fatal=False), - scale=1000) - artists = video_info.get('mainArtists') - if artists: - uploader = artists[0]['artistName'] - - smil_parsed = False - for video_version in video_info['videoVersions']: - version = self._VERSIONS.get(video_version['version']) - if version == 'youtube': - continue - else: - source_type = self._SOURCE_TYPES.get(video_version['sourceType']) - renditions = compat_etree_fromstring(video_version['data']) - if source_type == 'http': - for rend in renditions.findall('rendition'): - attr = rend.attrib - formats.append({ - 'url': attr['url'], - 'format_id': 'http-%s-%s' % (version, attr['name']), - 'height': int_or_none(attr.get('frameheight')), - 'width': int_or_none(attr.get('frameWidth')), - 'tbr': int_or_none(attr.get('totalBitrate')), - 'vbr': int_or_none(attr.get('videoBitrate')), - 'abr': int_or_none(attr.get('audioBitrate')), - 'vcodec': attr.get('videoCodec'), - 'acodec': attr.get('audioCodec'), - }) - elif source_type == 'hls': - formats.extend(self._extract_m3u8_formats( - renditions.find('rendition').attrib['url'], video_id, - 'mp4', 'm3u8_native', m3u8_id='hls-%s' % version, - note='Downloading %s m3u8 information' % version, - errnote='Failed to download %s m3u8 information' % version, - fatal=False)) - elif source_type == 'smil' and version == 'level3' and not smil_parsed: - formats.extend(self._extract_smil_formats( - renditions.find('rendition').attrib['url'], video_id, False)) - smil_parsed = True + formats.append({ + 'url': version_url, + 'format_id': 'http-%s-%s' % (version, video_version['quality']), + 'vcodec': m.group('vcodec'), + 'acodec': m.group('acodec'), + 'vbr': int(m.group('vbr')), + 'abr': int(m.group('abr')), + 'ext': m.group('ext'), + 'width': int(m.group('width')), + 'height': int(m.group('height')), + }) self._sort_formats(formats) - title = video_info['title'] + track = video_info['title'] + if featured_artist: + artist = '%s ft. 
%s' % (artist, featured_artist) + title = '%s - %s' % (artist, track) if artist else track + + genres = video_info.get('genres') + genre = ( + genres[0] if genres and isinstance(genres, list) + and isinstance(genres[0], compat_str) else None) is_explicit = video_info.get('isExplicit') if is_explicit is True: @@ -288,16 +286,85 @@ else: age_limit = None - duration = video_info.get('duration') - return { 'id': video_id, 'title': title, 'formats': formats, 'thumbnail': video_info.get('imageUrl') or video_info.get('thumbnailUrl'), - 'timestamp': timestamp, + 'timestamp': parse_iso8601(video_info.get('releaseDate')), 'uploader': uploader, - 'duration': duration, - 'view_count': view_count, + 'duration': int_or_none(video_info.get('duration')), + 'view_count': int_or_none(video_info.get('views', {}).get('total')), 'age_limit': age_limit, + 'track': track, + 'artist': uploader, + 'genre': genre, } + + +class VevoPlaylistIE(VevoBaseIE): + _VALID_URL = r'https?://(?:www\.)?vevo\.com/watch/(?P<kind>playlist|genre)/(?P<id>[^/?#&]+)' + + _TESTS = [{ + 'url': 'http://www.vevo.com/watch/playlist/dadbf4e7-b99f-4184-9670-6f0e547b6a29', + 'info_dict': { + 'id': 'dadbf4e7-b99f-4184-9670-6f0e547b6a29', + 'title': 'Best-Of: Birdman', + }, + 'playlist_count': 10, + }, { + 'url': 'http://www.vevo.com/watch/genre/rock', + 'info_dict': { + 'id': 'rock', + 'title': 'Rock', + }, + 'playlist_count': 20, + }, { + 'url': 'http://www.vevo.com/watch/playlist/dadbf4e7-b99f-4184-9670-6f0e547b6a29?index=0', + 'md5': '32dcdfddddf9ec6917fc88ca26d36282', + 'info_dict': { + 'id': 'USCMV1100073', + 'ext': 'mp4', + 'title': 'Birdman - Y.U. MAD', + 'timestamp': 1323417600, + 'upload_date': '20111209', + 'uploader': 'Birdman', + 'track': 'Y.U. MAD', + 'artist': 'Birdman', + 'genre': 'Rap/Hip-Hop', + }, + 'expected_warnings': ['Unable to download SMIL file'], + }, { + 'url': 'http://www.vevo.com/watch/genre/rock?index=0', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + playlist_id = mobj.group('id') + playlist_kind = mobj.group('kind') + + webpage = self._download_webpage(url, playlist_id) + + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + index = qs.get('index', [None])[0] + + if index: + video_id = self._search_regex( + r'<meta[^>]+content=(["\'])vevo://video/(?P<id>.+?)\1[^>]*>', + webpage, 'video id', default=None, group='id') + if video_id: + return self.url_result('vevo:%s' % video_id, VevoIE.ie_key()) + + playlists = self._extract_json(webpage, playlist_id)['default']['%ss' % playlist_kind] + + playlist = (list(playlists.values())[0] + if playlist_kind == 'playlist' else playlists[playlist_id]) + + entries = [ + self.url_result('vevo:%s' % src, VevoIE.ie_key()) + for src in playlist['isrcs']] + + return self.playlist_result( + entries, playlist.get('playlistId') or playlist_id, + playlist.get('name'), playlist.get('description')) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/vgtv.py youtube-dl-2019.05.20/youtube_dl/extractor/vgtv.py --- youtube-dl-2016.02.22/youtube_dl/extractor/vgtv.py 2016-01-31 11:57:01.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/vgtv.py 2019-06-07 18:49:07.000000000 +0000 @@ -8,11 +8,13 @@ from ..utils import ( ExtractorError, float_or_none, + try_get, ) class VGTVIE(XstreamIE): IE_DESC = 'VGTV, BTTV, FTV, Aftenposten and Aftonbladet' + _GEO_BYPASS = False _HOST_TO_APPNAME = { 'vgtv.no': 'vgtv', @@ -20,6 +22,9 @@ 'aftenbladet.no/tv': 'satv', 'fvn.no/fvntv': 'fvntv', 'aftenposten.no/webtv': 'aptv', + 'ap.vgtv.no/webtv':
'aptv', + 'tv.aftonbladet.se/abtv': 'abtv', + 'www.aftonbladet.se/tv': 'abtv', } _APP_NAME_TO_VENDOR = { @@ -28,6 +33,7 @@ 'satv': 'sa', 'fvntv': 'fvn', 'aptv': 'ap', + 'abtv': 'ab', } _VALID_URL = r'''(?x) @@ -35,10 +41,11 @@ (?P %s ) - / + /? (?: - \#!/(?:video|live)/| - embed?.*id= + (?:\#!/)?(?:video|live)/| + embed?.*id=| + a(?:rticles)?/ )| (?P %s @@ -56,7 +63,7 @@ 'ext': 'mp4', 'title': 'Hevnen er søt: Episode 10 - Abu', 'description': 'md5:e25e4badb5f544b04341e14abdc72234', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'duration': 648.000, 'timestamp': 1404626400, 'upload_date': '20140706', @@ -71,7 +78,7 @@ 'ext': 'flv', 'title': 'OPPTAK: VGTV følger EM-kvalifiseringen', 'description': 'md5:3772d9c0dc2dff92a886b60039a7d4d3', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'duration': 9103.0, 'timestamp': 1410113864, 'upload_date': '20140907', @@ -91,7 +98,7 @@ 'ext': 'mp4', 'title': 'V75 fra Solvalla 30.05.15', 'description': 'md5:b3743425765355855f88e096acc93231', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'duration': 25966, 'timestamp': 1432975582, 'upload_date': '20150530', @@ -107,19 +114,48 @@ 'md5': 'fd828cd29774a729bf4d4425fe192972', 'info_dict': { 'id': '21039', - 'ext': 'mov', + 'ext': 'mp4', 'title': 'TRAILER: «SWEATSHOP» - I can´t take any more', 'description': 'md5:21891f2b0dd7ec2f78d84a50e54f8238', 'duration': 66, 'timestamp': 1417002452, 'upload_date': '20141126', 'view_count': int, - } + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'http://www.bt.no/tv/#!/video/100250/norling-dette-er-forskjellen-paa-1-divisjon-og-eliteserien', 'only_matching': True, }, + { + 'url': 'http://ap.vgtv.no/webtv#!/video/111084/de-nye-bysyklene-lettere-bedre-gir-stoerre-hjul-og-feste-til-mobil', + 'only_matching': True, + }, + { + # geoblocked + 'url': 'http://www.vgtv.no/#!/video/127205/inside-the-mind-of-favela-funk', + 'only_matching': True, + }, + { + 'url': 'http://tv.aftonbladet.se/abtv/articles/36015', + 'only_matching': True, + }, + { + 'url': 'https://www.aftonbladet.se/tv/a/36015', + 'only_matching': True, + }, + { + 'url': 'abtv:140026', + 'only_matching': True, + }, + { + 'url': 'http://www.vgtv.no/video/84196/hevnen-er-soet-episode-10-abu', + 'only_matching': True, + }, ] def _real_extract(self, url): @@ -144,18 +180,18 @@ if len(video_id) == 5: if appname == 'bttv': info = self._extract_video_info('btno', video_id) - elif appname == 'aptv': - info = self._extract_video_info('ap', video_id) streams = data['streamUrls'] stream_type = data.get('streamType') - + is_live = stream_type == 'live' formats = [] hls_url = streams.get('hls') if hls_url: formats.extend(self._extract_m3u8_formats( - hls_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + hls_url, video_id, 'mp4', + entry_protocol='m3u8' if is_live else 'm3u8_native', + m3u8_id='hls', fatal=False)) hds_url = streams.get('hds') if hds_url: @@ -176,7 +212,7 @@ format_info = { 'url': mp4_url, } - mobj = re.search('(\d+)_(\d+)_(\d+)', mp4_url) + mobj = re.search(r'(\d+)_(\d+)_(\d+)', mp4_url) if mobj: tbr = int(mobj.group(3)) format_info.update({ @@ -189,17 +225,24 @@ info['formats'].extend(formats) + if not info['formats']: + properties = try_get( + data, lambda x: x['streamConfiguration']['properties'], list) + if properties and 'geoblocked' in properties: + raise self.raise_geo_restricted( + countries=[host.rpartition('.')[-1].partition('/')[0].upper()]) + self._sort_formats(info['formats']) 
info.update({ 'id': video_id, - 'title': self._live_title(data['title']) if stream_type == 'live' else data['title'], + 'title': self._live_title(data['title']) if is_live else data['title'], 'description': data['description'], 'thumbnail': data['images']['main'] + '?t[]=900x506q80', 'timestamp': data['published'], 'duration': float_or_none(data['duration'], 1000), 'view_count': data['displays'], - 'is_live': True if stream_type == 'live' else False, + 'is_live': is_live, }) return info @@ -207,7 +250,7 @@ class BTArticleIE(InfoExtractor): IE_NAME = 'bt:article' IE_DESC = 'Bergens Tidende Articles' - _VALID_URL = 'http://(?:www\.)?bt\.no/(?:[^/]+/)+(?P[^/]+)-\d+\.html' + _VALID_URL = r'https?://(?:www\.)?bt\.no/(?:[^/]+/)+(?P[^/]+)-\d+\.html' _TEST = { 'url': 'http://www.bt.no/nyheter/lokalt/Kjemper-for-internatet-1788214.html', 'md5': '2acbe8ad129b3469d5ae51b1158878df', @@ -216,7 +259,7 @@ 'ext': 'mp4', 'title': 'Alrekstad internat', 'description': 'md5:dc81a9056c874fedb62fc48a300dac58', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'duration': 191, 'timestamp': 1289991323, 'upload_date': '20101117', @@ -234,7 +277,7 @@ class BTVestlendingenIE(InfoExtractor): IE_NAME = 'bt:vestlendingen' IE_DESC = 'Bergens Tidende - Vestlendingen' - _VALID_URL = 'http://(?:www\.)?bt\.no/spesial/vestlendingen/#!/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?bt\.no/spesial/vestlendingen/#!/(?P\d+)' _TESTS = [{ 'url': 'http://www.bt.no/spesial/vestlendingen/#!/86588', 'md5': 'd7d17e3337dc80de6d3a540aefbe441b', diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/vh1.py youtube-dl-2019.05.20/youtube_dl/extractor/vh1.py --- youtube-dl-2016.02.22/youtube_dl/extractor/vh1.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/vh1.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,127 +1,41 @@ +# coding: utf-8 from __future__ import unicode_literals -from .mtv import MTVIE +from .mtv import MTVServicesInfoExtractor -import re -from ..utils import fix_xml_ampersands - -class VH1IE(MTVIE): +class VH1IE(MTVServicesInfoExtractor): IE_NAME = 'vh1.com' - _FEED_URL = 'http://www.vh1.com/player/embed/AS3/fullepisode/rss/' + _FEED_URL = 'http://www.vh1.com/feeds/mrss/' _TESTS = [{ - 'url': 'http://www.vh1.com/video/metal-evolution/full-episodes/progressive-metal/1678612/playlist.jhtml', - 'playlist': [ - { - 'md5': '7827a7505f59633983165bbd2c119b52', - 'info_dict': { - 'id': '731565', - 'ext': 'mp4', - 'title': 'Metal Evolution: Ep. 11 Act 1', - 'description': 'Many rock academics have proclaimed that the truly progressive musicianship of the last 20 years has been found right here in the world of heavy metal, rather than obvious locales such as jazz, fusion or progressive rock. It stands to reason then, that much of this jaw-dropping virtuosity occurs within what\'s known as progressive metal, a genre that takes root with the likes of Rush in the \'70s, Queensryche and Fates Warning in the \'80s, and Dream Theater in the \'90s. Since then, the genre has exploded with creativity, spawning mind-bending, genre-defying acts such as Tool, Mastodon, Coheed And Cambria, Porcupine Tree, Meshuggah, A Perfect Circle and Opeth. Episode 12 looks at the extreme musicianship of these bands, as well as their often extreme literary prowess and conceptual strength, the end result being a rich level of respect and attention such challenging acts have brought upon the world of heavy metal, from a critical community usually dismissive of the form.' 
- } - }, - { - 'md5': '34fb4b7321c546b54deda2102a61821f', - 'info_dict': { - 'id': '731567', - 'ext': 'mp4', - 'title': 'Metal Evolution: Ep. 11 Act 2', - 'description': 'Many rock academics have proclaimed that the truly progressive musicianship of the last 20 years has been found right here in the world of heavy metal, rather than obvious locales such as jazz, fusion or progressive rock. It stands to reason then, that much of this jaw-dropping virtuosity occurs within what\'s known as progressive metal, a genre that takes root with the likes of Rush in the \'70s, Queensryche and Fates Warning in the \'80s, and Dream Theater in the \'90s. Since then, the genre has exploded with creativity, spawning mind-bending, genre-defying acts such as Tool, Mastodon, Coheed And Cambria, Porcupine Tree, Meshuggah, A Perfect Circle and Opeth. Episode 11 looks at the extreme musicianship of these bands, as well as their often extreme literary prowess and conceptual strength, the end result being a rich level of respect and attention such challenging acts have brought upon the world of heavy metal, from a critical community usually dismissive of the form.' - } - }, - { - 'md5': '813f38dba4c1b8647196135ebbf7e048', - 'info_dict': { - 'id': '731568', - 'ext': 'mp4', - 'title': 'Metal Evolution: Ep. 11 Act 3', - 'description': 'Many rock academics have proclaimed that the truly progressive musicianship of the last 20 years has been found right here in the world of heavy metal, rather than obvious locales such as jazz, fusion or progressive rock. It stands to reason then, that much of this jaw-dropping virtuosity occurs within what\'s known as progressive metal, a genre that takes root with the likes of Rush in the \'70s, Queensryche and Fates Warning in the \'80s, and Dream Theater in the \'90s. Since then, the genre has exploded with creativity, spawning mind-bending, genre-defying acts such as Tool, Mastodon, Coheed And Cambria, Porcupine Tree, Meshuggah, A Perfect Circle and Opeth. Episode 11 looks at the extreme musicianship of these bands, as well as their often extreme literary prowess and conceptual strength, the end result being a rich level of respect and attention such challenging acts have brought upon the world of heavy metal, from a critical community usually dismissive of the form.' - } - }, - { - 'md5': '51adb72439dfaed11c799115d76e497f', - 'info_dict': { - 'id': '731569', - 'ext': 'mp4', - 'title': 'Metal Evolution: Ep. 11 Act 4', - 'description': 'Many rock academics have proclaimed that the truly progressive musicianship of the last 20 years has been found right here in the world of heavy metal, rather than obvious locales such as jazz, fusion or progressive rock. It stands to reason then, that much of this jaw-dropping virtuosity occurs within what\'s known as progressive metal, a genre that takes root with the likes of Rush in the \'70s, Queensryche and Fates Warning in the \'80s, and Dream Theater in the \'90s. Since then, the genre has exploded with creativity, spawning mind-bending, genre-defying acts such as Tool, Mastodon, Coheed And Cambria, Porcupine Tree, Meshuggah, A Perfect Circle and Opeth. Episode 11 looks at the extreme musicianship of these bands, as well as their often extreme literary prowess and conceptual strength, the end result being a rich level of respect and attention such challenging acts have brought upon the world of heavy metal, from a critical community usually dismissive of the form.' 
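# The per-act RSS playlist tests above are dropped together with the old
# per-episode feed URL; the rewritten extractor below resolves one triforce
# mgid per page and hands it to the MTVServices helpers. A sketch of how that
# mgid is consumed, based on the base class behaviour (the mgid value is
# illustrative, and the uri parameter is URL-encoded in practice):
mgid = 'mgid:arc:video:vh1.com:0a50c2d2-a86b-4141-9565-911c7e2d0b92'
feed_url = 'http://www.vh1.com/feeds/mrss/?uri=' + mgid  # the new _FEED_URL plus a uri query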
- } - }, - { - 'md5': '93d554aaf79320703b73a95288c76a6e', - 'info_dict': { - 'id': '731570', - 'ext': 'mp4', - 'title': 'Metal Evolution: Ep. 11 Act 5', - 'description': 'Many rock academics have proclaimed that the truly progressive musicianship of the last 20 years has been found right here in the world of heavy metal, rather than obvious locales such as jazz, fusion or progressive rock. It stands to reason then, that much of this jaw-dropping virtuosity occurs within what\'s known as progressive metal, a genre that takes root with the likes of Rush in the \'70s, Queensryche and Fates Warning in the \'80s, and Dream Theater in the \'90s. Since then, the genre has exploded with creativity, spawning mind-bending, genre-defying acts such as Tool, Mastodon, Coheed And Cambria, Porcupine Tree, Meshuggah, A Perfect Circle and Opeth. Episode 11 looks at the extreme musicianship of these bands, as well as their often extreme literary prowess and conceptual strength, the end result being a rich level of respect and attention such challenging acts have brought upon the world of heavy metal, from a critical community usually dismissive of the form.' - } - } - ], - 'skip': 'Blocked outside the US', - }, { - # Clip - 'url': 'http://www.vh1.com/video/misc/706675/metal-evolution-episode-1-pre-metal-show-clip.jhtml#id=1674118', - 'md5': '7d67cf6d9cdc6b4f3d3ac97a55403844', + 'url': 'http://www.vh1.com/episodes/0umwpq/hip-hop-squares-kent-jones-vs-nick-young-season-1-ep-120', 'info_dict': { - 'id': '706675', - 'ext': 'mp4', - 'title': 'Metal Evolution: Episode 1 Pre-Metal Show Clip', - 'description': 'The greatest documentary ever made about Heavy Metal begins as our host Sam Dunn travels the globe to seek out the origins and influences that helped create Heavy Metal. Sam speaks to legends like Kirk Hammett, Alice Cooper, Slash, Bill Ward, Geezer Butler, Tom Morello, Ace Frehley, Lemmy Kilmister, Dave Davies, and many many more. This episode is the prologue for the 11 hour series, and Sam goes back to the very beginning to reveal how Heavy Metal was created.' + 'title': 'Kent Jones vs. Nick Young', + 'description': 'Come to Play. Stay to Party. With Mike Epps, TIP, O’Shea Jackson Jr., T-Pain, Tisha Campbell-Martin and more.', }, - 'skip': 'Blocked outside the US', + 'playlist_mincount': 4, }, { - # Short link - 'url': 'http://www.vh1.com/video/play.jhtml?id=1678353', - 'md5': '853192b87ad978732b67dd8e549b266a', + # Clip + 'url': 'http://www.vh1.com/video-clips/t74mif/scared-famous-scared-famous-extended-preview', 'info_dict': { - 'id': '730355', + 'id': '0a50c2d2-a86b-4141-9565-911c7e2d0b92', 'ext': 'mp4', - 'title': 'Metal Evolution: Episode 11 Progressive Metal Sneak', - 'description': 'In Metal Evolution\'s finale sneak, Sam sits with Michael Giles of King Crimson and gets feedback from Metallica guitarist Kirk Hammett on why the group was influential.' + 'title': 'Scared Famous|October 9, 2017|1|NO-EPISODE#|Scared Famous + Extended Preview', + 'description': 'md5:eff5551a274c473a29463de40f7b09da', + 'upload_date': '20171009', + 'timestamp': 1507574700, }, - 'skip': 'Blocked outside the US', - }, { - 'url': 'http://www.vh1.com/video/macklemore-ryan-lewis/900535/cant-hold-us-ft-ray-dalton.jhtml', - 'md5': 'b1bcb5b4380c9d7f544065589432dee7', - 'info_dict': { - 'id': '900535', - 'ext': 'mp4', - 'title': 'Macklemore & Ryan Lewis - "Can\'t Hold Us ft. 
Ray Dalton"', - 'description': 'The Heist' + 'params': { + # m3u8 download + 'skip_download': True, }, - 'skip': 'Blocked outside the US', }] - _VALID_URL = r'''(?x) - https?://www\.vh1\.com/video/ - (?: - .+?/full-episodes/.+?/(?P[^/]+)/playlist\.jhtml - | - (?: - play.jhtml\?id=| - misc/.+?/.+?\.jhtml\#id= - ) - (?P[0-9]+)$ - | - [^/]+/(?P[0-9]+)/[^/]+? - ) - ''' + _VALID_URL = r'https?://(?:www\.)?vh1\.com/(?:video-clips|episodes)/(?P[^/?#.]+)' def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - if mobj.group('music_id'): - id_field = 'vid' - video_id = mobj.group('music_id') - else: - video_id = mobj.group('playlist_id') or mobj.group('video_id') - id_field = 'id' - doc_url = '%s?%s=%s' % (self._FEED_URL, id_field, video_id) - - idoc = self._download_xml( - doc_url, video_id, - 'Downloading info', transform_source=fix_xml_ampersands) - return self.playlist_result( - [self._get_video_info(item) for item in idoc.findall('.//item')], - playlist_id=video_id, - ) + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + mgid = self._extract_triforce_mgid(webpage) + videos_info = self._get_videos_info(mgid) + return videos_info diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/vice.py youtube-dl-2019.05.20/youtube_dl/extractor/vice.py --- youtube-dl-2016.02.22/youtube_dl/extractor/vice.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/vice.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,40 +1,337 @@ +# coding: utf-8 from __future__ import unicode_literals +import re +import time +import hashlib +import json +import random + +from .adobepass import AdobePassIE +from .youtube import YoutubeIE from .common import InfoExtractor -from .ooyala import OoyalaIE -from ..utils import ExtractorError +from ..compat import ( + compat_HTTPError, + compat_str, +) +from ..utils import ( + ExtractorError, + int_or_none, + parse_age_limit, + str_or_none, + try_get, +) -class ViceIE(InfoExtractor): - _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)+(?P.+)' +class ViceIE(AdobePassIE): + IE_NAME = 'vice' + _VALID_URL = r'https?://(?:(?:video|vms)\.vice|(?:www\.)?viceland)\.com/(?P[^/]+)/(?:video/[^/]+|embed)/(?P[\da-f]+)' + _TESTS = [{ + 'url': 'https://video.vice.com/en_us/video/pet-cremator/58c69e38a55424f1227dc3f7', + 'info_dict': { + 'id': '5e647f0125e145c9aef2069412c0cbde', + 'ext': 'mp4', + 'title': '10 Questions You Always Wanted To Ask: Pet Cremator', + 'description': 'md5:fe856caacf61fe0e74fab15ce2b07ca5', + 'uploader': 'vice', + 'uploader_id': '57a204088cb727dec794c67b', + 'timestamp': 1489664942, + 'upload_date': '20170316', + 'age_limit': 14, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': ['UplynkPreplay'], + }, { + # geo restricted to US + 'url': 'https://video.vice.com/en_us/video/the-signal-from-tolva/5816510690b70e6c5fd39a56', + 'info_dict': { + 'id': '930c0ad1f47141cc955087eecaddb0e2', + 'ext': 'mp4', + 'uploader': 'waypoint', + 'title': 'The Signal From Tölva', + 'description': 'md5:3927e3c79f9e8094606a2b3c5b5e55d5', + 'uploader_id': '57f7d621e05ca860fa9ccaf9', + 'timestamp': 1477941983, + 'upload_date': '20161031', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': ['UplynkPreplay'], + }, { + 'url': 'https://video.vice.com/alps/video/ulfs-wien-beruchtigste-grafitti-crew-part-1/581b12b60a0e1f4c0fb6ea2f', + 'info_dict': { + 'id': '581b12b60a0e1f4c0fb6ea2f', + 'ext': 'mp4', + 'title': 'ULFs - Wien berüchtigste Grafitti Crew - Part 1', + 
'description': '<p>Zwischen Hinterzimmer-Tattoos und U-Bahnschächten erzählen uns die Ulfs, wie es ist, "süchtig nach Sachbeschädigung" zu sein.</p>
    ', + 'uploader': 'VICE', + 'uploader_id': '57a204088cb727dec794c67b', + 'timestamp': 1485368119, + 'upload_date': '20170125', + 'age_limit': 14, + }, + 'params': { + # AES-encrypted m3u8 + 'skip_download': True, + 'proxy': '127.0.0.1:8118', + }, + 'add_ie': ['UplynkPreplay'], + }, { + 'url': 'https://video.vice.com/en_us/video/pizza-show-trailer/56d8c9a54d286ed92f7f30e4', + 'only_matching': True, + }, { + 'url': 'https://video.vice.com/en_us/embed/57f41d3556a0a80f54726060', + 'only_matching': True, + }, { + 'url': 'https://vms.vice.com/en_us/video/preplay/58c69e38a55424f1227dc3f7', + 'only_matching': True, + }, { + 'url': 'https://www.viceland.com/en_us/video/thursday-march-1-2018/5a8f2d7ff1cdb332dd446ec1', + 'only_matching': True, + }] - _TESTS = [ - { - 'url': 'http://www.vice.com/Fringes/cowboy-capitalists-part-1', - 'info_dict': { - 'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp', - 'ext': 'mp4', - 'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov', - 'duration': 725.983, - }, - 'params': { - # Requires ffmpeg (m3u8 manifest) - 'skip_download': True, - }, - }, { - 'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab', - 'only_matching': True, - } - ] + @staticmethod + def _extract_urls(webpage): + return re.findall( + r']+\bsrc=["\']((?:https?:)?//video\.vice\.com/[^/]+/embed/[\da-f]+)', + webpage) + + @staticmethod + def _extract_url(webpage): + urls = ViceIE._extract_urls(webpage) + return urls[0] if urls else None def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + locale, video_id = re.match(self._VALID_URL, url).groups() + + webpage = self._download_webpage( + 'https://video.vice.com/%s/embed/%s' % (locale, video_id), + video_id) + + video = self._parse_json( + self._search_regex( + r'PREFETCH_DATA\s*=\s*({.+?})\s*;\s*\n', webpage, + 'app state'), video_id)['video'] + video_id = video.get('vms_id') or video.get('id') or video_id + title = video['title'] + is_locked = video.get('locked') + rating = video.get('rating') + thumbnail = video.get('thumbnail_url') + duration = int_or_none(video.get('duration')) + series = try_get( + video, lambda x: x['episode']['season']['show']['title'], + compat_str) + episode_number = try_get( + video, lambda x: x['episode']['episode_number']) + season_number = try_get( + video, lambda x: x['episode']['season']['season_number']) + uploader = None + + query = {} + if is_locked: + resource = self._get_mvpd_resource( + 'VICELAND', title, video_id, rating) + query['tvetoken'] = self._extract_mvpd_auth( + url, video_id, 'VICELAND', resource) + + # signature generation algorithm is reverse engineered from signatureGenerator in + # webpack:///../shared/~/vice-player/dist/js/vice-player.js in + # https://www.viceland.com/assets/common/js/web.vendor.bundle.js + # new JS is located here https://vice-web-statics-cdn.vice.com/vice-player/player-embed.js + exp = int(time.time()) + 1440 + + query.update({ + 'exp': exp, + 'sign': hashlib.sha512(('%s:GET:%d' % (video_id, exp)).encode()).hexdigest(), + '_ad_blocked': None, + '_ad_unit': '', + '_debug': '', + 'platform': 'desktop', + 'rn': random.randint(10000, 100000), + 'fbprebidtoken': '', + }) + try: - embed_code = self._search_regex( - r'embedCode=([^&\'"]+)', webpage, - 'ooyala embed code') - ooyala_url = OoyalaIE._url_for_embed_code(embed_code) - except ExtractorError: - raise ExtractorError('The page doesn\'t contain a video', expected=True) - return self.url_result(ooyala_url, ie='Ooyala') + preplay = 
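# A standalone sketch of the preplay signing built above (the algorithm the
# patch notes as reverse engineered from vice-player.js); any VMS video id
# works, e.g. '58c69e38a55424f1227dc3f7' from the tests:
import hashlib
import time

def sign_preplay_query(video_id):
    exp = int(time.time()) + 1440  # expiry offset used in the patch
    return {
        'exp': exp,
        'sign': hashlib.sha512(('%s:GET:%d' % (video_id, exp)).encode()).hexdigest(),
    }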
self._download_json( + 'https://vms.vice.com/%s/video/preplay/%s' % (locale, video_id), + video_id, query=query) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 401): + error = json.loads(e.cause.read().decode()) + error_message = error.get('error_description') or error['details'] + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, error_message), expected=True) + raise + + video_data = preplay['video'] + base = video_data['base'] + uplynk_preplay_url = preplay['preplayURL'] + episode = video_data.get('episode', {}) + channel = video_data.get('channel', {}) + + subtitles = {} + cc_url = preplay.get('ccURL') + if cc_url: + subtitles['en'] = [{ + 'url': cc_url, + }] + + return { + '_type': 'url_transparent', + 'url': uplynk_preplay_url, + 'id': video_id, + 'title': title, + 'description': base.get('body') or base.get('display_body'), + 'thumbnail': thumbnail, + 'duration': int_or_none(video_data.get('video_duration')) or duration, + 'timestamp': int_or_none(video_data.get('created_at'), 1000), + 'age_limit': parse_age_limit(video_data.get('video_rating')), + 'series': video_data.get('show_title') or series, + 'episode_number': int_or_none(episode.get('episode_number') or episode_number), + 'episode_id': str_or_none(episode.get('id') or video_data.get('episode_id')), + 'season_number': int_or_none(season_number), + 'season_id': str_or_none(episode.get('season_id')), + 'uploader': channel.get('base', {}).get('title') or channel.get('name') or uploader, + 'uploader_id': str_or_none(channel.get('id')), + 'subtitles': subtitles, + 'ie_key': 'UplynkPreplay', + } + + +class ViceShowIE(InfoExtractor): + IE_NAME = 'vice:show' + _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?show/(?P[^/?#&]+)' + + _TEST = { + 'url': 'https://munchies.vice.com/en/show/fuck-thats-delicious-2', + 'info_dict': { + 'id': 'fuck-thats-delicious-2', + 'title': "Fuck, That's Delicious", + 'description': 'Follow the culinary adventures of rapper Action Bronson during his ongoing world tour.', + }, + 'playlist_count': 17, + } + + def _real_extract(self, url): + show_id = self._match_id(url) + webpage = self._download_webpage(url, show_id) + + entries = [ + self.url_result(video_url, ViceIE.ie_key()) + for video_url, _ in re.findall( + r']+class="article-title"[^>]+data-id="\d+"[^>]*>\s*]+href="(%s.*?)"' + % ViceIE._VALID_URL, webpage)] + + title = self._search_regex( + r'(.+?)', webpage, 'title', default=None) + if title: + title = re.sub(r'(.+)\s*\|\s*.+$', r'\1', title).strip() + description = self._html_search_meta( + 'description', webpage, 'description') + + return self.playlist_result(entries, show_id, title, description) + + +class ViceArticleIE(InfoExtractor): + IE_NAME = 'vice:article' + _VALID_URL = r'https://www\.vice\.com/[^/]+/article/(?P[^?#]+)' + + _TESTS = [{ + 'url': 'https://www.vice.com/en_us/article/on-set-with-the-woman-making-mormon-porn-in-utah', + 'info_dict': { + 'id': '41eae2a47b174a1398357cec55f1f6fc', + 'ext': 'mp4', + 'title': 'Mormon War on Porn ', + 'description': 'md5:6394a8398506581d0346b9ab89093fef', + 'uploader': 'vice', + 'uploader_id': '57a204088cb727dec794c67b', + 'timestamp': 1491883129, + 'upload_date': '20170411', + 'age_limit': 17, + }, + 'params': { + # AES-encrypted m3u8 + 'skip_download': True, + }, + 'add_ie': ['UplynkPreplay'], + }, { + 'url': 'https://www.vice.com/en_us/article/how-to-hack-a-car', + 'md5': '7fe8ebc4fa3323efafc127b82bd821d9', + 'info_dict': { + 'id': '3jstaBeXgAs', + 'ext': 'mp4', + 'title': 'How to 
Hack a Car: Phreaked Out (Episode 2)', + 'description': 'md5:ee95453f7ff495db8efe14ae8bf56f30', + 'uploader': 'Motherboard', + 'uploader_id': 'MotherboardTV', + 'upload_date': '20140529', + }, + 'add_ie': ['Youtube'], + }, { + 'url': 'https://www.vice.com/en_us/article/znm9dx/karley-sciortino-slutever-reloaded', + 'md5': 'a7ecf64ee4fa19b916c16f4b56184ae2', + 'info_dict': { + 'id': 'e2ed435eb67e43efb66e6ef9a6930a88', + 'ext': 'mp4', + 'title': "Making The World's First Male Sex Doll", + 'description': 'md5:916078ef0e032d76343116208b6cc2c4', + 'uploader': 'vice', + 'uploader_id': '57a204088cb727dec794c67b', + 'timestamp': 1476919911, + 'upload_date': '20161019', + 'age_limit': 17, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [ViceIE.ie_key()], + }, { + 'url': 'https://www.vice.com/en_us/article/cowboy-capitalists-part-1', + 'only_matching': True, + }, { + 'url': 'https://www.vice.com/ru/article/big-night-out-ibiza-clive-martin-229', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + prefetch_data = self._parse_json(self._search_regex( + r'__APP_STATE\s*=\s*({.+?})(?:\s*\|\|\s*{}\s*)?;\s*\n', + webpage, 'app state'), display_id)['pageData'] + body = prefetch_data['body'] + + def _url_res(video_url, ie_key): + return { + '_type': 'url_transparent', + 'url': video_url, + 'display_id': display_id, + 'ie_key': ie_key, + } + + vice_url = ViceIE._extract_url(webpage) + if vice_url: + return _url_res(vice_url, ViceIE.ie_key()) + + embed_code = self._search_regex( + r'embedCode=([^&\'"]+)', body, + 'ooyala embed code', default=None) + if embed_code: + return _url_res('ooyala:%s' % embed_code, 'Ooyala') + + youtube_url = YoutubeIE._extract_url(body) + if youtube_url: + return _url_res(youtube_url, YoutubeIE.ie_key()) + + video_url = self._html_search_regex( + r'data-video-url="([^"]+)"', + prefetch_data['embed_code'], 'video URL') + + return _url_res(video_url, ViceIE.ie_key()) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/vidbit.py youtube-dl-2019.05.20/youtube_dl/extractor/vidbit.py --- youtube-dl-2016.02.22/youtube_dl/extractor/vidbit.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/vidbit.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,84 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + int_or_none, + js_to_json, + remove_end, + unified_strdate, +) + + +class VidbitIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vidbit\.co/(?:watch|embed)\?.*?\bv=(?P[\da-zA-Z]+)' + _TESTS = [{ + 'url': 'http://www.vidbit.co/watch?v=jkL2yDOEq2', + 'md5': '1a34b7f14defe3b8fafca9796892924d', + 'info_dict': { + 'id': 'jkL2yDOEq2', + 'ext': 'mp4', + 'title': 'Intro to VidBit', + 'description': 'md5:5e0d6142eec00b766cbf114bfd3d16b7', + 'thumbnail': r're:https?://.*\.jpg$', + 'upload_date': '20160618', + 'view_count': int, + 'comment_count': int, + } + }, { + 'url': 'http://www.vidbit.co/embed?v=jkL2yDOEq2&auto=0&water=0', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + compat_urlparse.urljoin(url, '/watch?v=%s' % video_id), video_id) + + video_url, title = [None] * 2 + + config = self._parse_json(self._search_regex( + r'(?s)\.setup\(({.+?})\);', webpage, 'setup', default='{}'), + video_id, transform_source=js_to_json) + if config: + if config.get('file'): + video_url 
= compat_urlparse.urljoin(url, config['file']) + title = config.get('title') + + if not video_url: + video_url = compat_urlparse.urljoin(url, self._search_regex( + r'file\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'video URL', group='url')) + + if not title: + title = remove_end( + self._html_search_regex( + (r'
<h1>(.+?)</h1>
    ', r'(.+?)'), + webpage, 'title', default=None) or self._og_search_title(webpage), + ' - VidBit') + + description = self._html_search_meta( + ('description', 'og:description', 'twitter:description'), + webpage, 'description') + + upload_date = unified_strdate(self._html_search_meta( + 'datePublished', webpage, 'upload date')) + + view_count = int_or_none(self._search_regex( + r'(\d+) views', + webpage, 'view count', fatal=False)) + comment_count = int_or_none(self._search_regex( + r'id=["\']cmt_num["\'][^>]*>\((\d+)\)', + webpage, 'comment count', fatal=False)) + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'description': description, + 'thumbnail': self._og_search_thumbnail(webpage), + 'upload_date': upload_date, + 'view_count': view_count, + 'comment_count': comment_count, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/viddler.py youtube-dl-2019.05.20/youtube_dl/extractor/viddler.py --- youtube-dl-2016.02.22/youtube_dl/extractor/viddler.py 2016-02-09 19:12:26.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/viddler.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,19 +1,16 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse, - compat_urlparse, -) from ..utils import ( float_or_none, int_or_none, - sanitized_Request, ) class ViddlerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?viddler\.com/(?:v|embed|player)/(?P[a-z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?viddler\.com/(?:v|embed|player)/(?P[a-z0-9]+)(?:.+?\bsecret=(\d+))?' _TESTS = [{ 'url': 'http://www.viddler.com/v/43903784', 'md5': '9eee21161d2c7f5b39690c3e325fab2f', @@ -26,7 +23,7 @@ 'timestamp': 1335371429, 'upload_date': '20120425', 'duration': 100.89, - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'view_count': int, 'comment_count': int, 'categories': ['video content', 'high quality video', 'video made easy', 'how to produce video with limited resources', 'viddler'], @@ -78,23 +75,18 @@ }] def _real_extract(self, url): - video_id = self._match_id(url) + video_id, secret = re.match(self._VALID_URL, url).groups() query = { 'video_id': video_id, 'key': 'v0vhrt7bg2xq1vyxhkct', } - - qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - secret = qs.get('secret', [None])[0] if secret: query['secret'] = secret - headers = {'Referer': 'http://static.cdn-ec.viddler.com/js/arpeggio/v2/embed.html'} - request = sanitized_Request( - 'http://api.viddler.com/api/v2/viddler.videos.getPlaybackDetails.json?%s' - % compat_urllib_parse.urlencode(query), None, headers) - data = self._download_json(request, video_id)['video'] + data = self._download_json( + 'http://api.viddler.com/api/v2/viddler.videos.getPlaybackDetails.json', + video_id, headers={'Referer': url}, query=query)['video'] formats = [] for filed in data['files']: diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/videa.py youtube-dl-2019.05.20/youtube_dl/extractor/videa.py --- youtube-dl-2016.02.22/youtube_dl/extractor/videa.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/videa.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,106 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + mimetype2ext, + parse_codecs, + xpath_element, + xpath_text, +) + + +class VideaIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + videa(?:kid)?\.hu/ + (?: + 
videok/(?:[^/]+/)*[^?#&]+-| + player\?.*?\bv=| + player/v/ + ) + (?P[^?#&]+) + ''' + _TESTS = [{ + 'url': 'http://videa.hu/videok/allatok/az-orult-kigyasz-285-kigyot-kigyo-8YfIAjxwWGwT8HVQ', + 'md5': '97a7af41faeaffd9f1fc864a7c7e7603', + 'info_dict': { + 'id': '8YfIAjxwWGwT8HVQ', + 'ext': 'mp4', + 'title': 'Az őrült kígyász 285 kígyót enged szabadon', + 'thumbnail': r're:^https?://.*', + 'duration': 21, + }, + }, { + 'url': 'http://videa.hu/videok/origo/jarmuvek/supercars-elozes-jAHDWfWSJH5XuFhH', + 'only_matching': True, + }, { + 'url': 'http://videa.hu/player?v=8YfIAjxwWGwT8HVQ', + 'only_matching': True, + }, { + 'url': 'http://videa.hu/player/v/8YfIAjxwWGwT8HVQ?autoplay=1', + 'only_matching': True, + }, { + 'url': 'https://videakid.hu/videok/origo/jarmuvek/supercars-elozes-jAHDWfWSJH5XuFhH', + 'only_matching': True, + }, { + 'url': 'https://videakid.hu/player?v=8YfIAjxwWGwT8HVQ', + 'only_matching': True, + }, { + 'url': 'https://videakid.hu/player/v/8YfIAjxwWGwT8HVQ?autoplay=1', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [url for _, url in re.findall( + r']+src=(["\'])(?P(?:https?:)?//videa\.hu/player\?.*?\bv=.+?)\1', + webpage)] + + def _real_extract(self, url): + video_id = self._match_id(url) + + info = self._download_xml( + 'http://videa.hu/videaplayer_get_xml.php', video_id, + query={'v': video_id}) + + video = xpath_element(info, './/video', 'video', fatal=True) + sources = xpath_element(info, './/video_sources', 'sources', fatal=True) + + title = xpath_text(video, './title', fatal=True) + + formats = [] + for source in sources.findall('./video_source'): + source_url = source.text + if not source_url: + continue + f = parse_codecs(source.get('codecs')) + f.update({ + 'url': source_url, + 'ext': mimetype2ext(source.get('mimetype')) or 'mp4', + 'format_id': source.get('name'), + 'width': int_or_none(source.get('width')), + 'height': int_or_none(source.get('height')), + }) + formats.append(f) + self._sort_formats(formats) + + thumbnail = xpath_text(video, './poster_src') + duration = int_or_none(xpath_text(video, './duration')) + + age_limit = None + is_adult = xpath_text(video, './is_adult_content', default=None) + if is_adult: + age_limit = 18 if is_adult == '1' else 0 + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'age_limit': age_limit, + 'formats': formats, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/videodetective.py youtube-dl-2019.05.20/youtube_dl/extractor/videodetective.py --- youtube-dl-2016.02.22/youtube_dl/extractor/videodetective.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/videodetective.py 2019-06-07 18:49:07.000000000 +0000 @@ -6,7 +6,7 @@ class VideoDetectiveIE(InfoExtractor): - _VALID_URL = r'https?://www\.videodetective\.com/[^/]+/[^/]+/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?videodetective\.com/[^/]+/[^/]+/(?P\d+)' _TEST = { 'url': 'http://www.videodetective.com/movies/kick-ass-2/194487', @@ -14,8 +14,11 @@ 'id': '194487', 'ext': 'mp4', 'title': 'KICK-ASS 2', - 'description': 'md5:65ba37ad619165afac7d432eaded6013', - 'duration': 138, + 'description': 'md5:c189d5b7280400630a1d3dd17eaa8d8a', + }, + 'params': { + # m3u8 download + 'skip_download': True, }, } @@ -24,4 +27,4 @@ webpage = self._download_webpage(url, video_id) og_video = self._og_search_video_url(webpage) query = compat_urlparse.urlparse(og_video).query - return self.url_result(InternetVideoArchiveIE._build_url(query), 
ie=InternetVideoArchiveIE.ie_key()) + return self.url_result(InternetVideoArchiveIE._build_json_url(query), ie=InternetVideoArchiveIE.ie_key()) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/videomega.py youtube-dl-2019.05.20/youtube_dl/extractor/videomega.py --- youtube-dl-2016.02.22/youtube_dl/extractor/videomega.py 2016-01-14 09:35:12.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/videomega.py 2019-06-07 18:47:18.000000000 +0000 @@ -4,11 +4,13 @@ import re from .common import InfoExtractor -from ..utils import sanitized_Request +from ..utils import ( + decode_packed_codes, + sanitized_Request, +) class VideoMegaIE(InfoExtractor): - _WORKING = False _VALID_URL = r'(?:videomega:|https?://(?:www\.)?videomega\.tv/(?:(?:view|iframe|cdn)\.php)?\?ref=)(?P[A-Za-z0-9]+)' _TESTS = [{ 'url': 'http://videomega.tv/cdn.php?ref=AOSQBJYKIDDIKYJBQSOA', @@ -17,7 +19,7 @@ 'id': 'AOSQBJYKIDDIKYJBQSOA', 'ext': 'mp4', 'title': '1254207', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', } }, { 'url': 'http://videomega.tv/cdn.php?ref=AOSQBJYKIDDIKYJBQSOA&width=1070&height=600', @@ -42,8 +44,10 @@ r'(?:^[Vv]ideo[Mm]ega\.tv\s-\s*|\s*-\svideomega\.tv$)', '', title) thumbnail = self._search_regex( r']+?poster="([^"]+)"', webpage, 'thumbnail', fatal=False) + + real_codes = decode_packed_codes(webpage) video_url = self._search_regex( - r']+?src="([^"]+)"', webpage, 'video URL') + r'"src"\s*,\s*"([^"]+)"', real_codes, 'video URL') return { 'id': video_id, diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/videomore.py youtube-dl-2019.05.20/youtube_dl/extractor/videomore.py --- youtube-dl-2016.02.22/youtube_dl/extractor/videomore.py 2016-02-09 11:57:41.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/videomore.py 2019-06-07 18:49:07.000000000 +0000 @@ -4,51 +4,62 @@ import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( int_or_none, - parse_age_limit, - parse_iso8601, + orderedSet, + parse_duration, + str_or_none, + unified_strdate, + url_or_none, + xpath_element, xpath_text, ) class VideomoreIE(InfoExtractor): IE_NAME = 'videomore' - _VALID_URL = r'videomore:(?P\d+)$|https?://videomore\.ru/(?:(?:embed|[^/]+/[^/]+)/|[^/]+\?.*\btrack_id=)(?P\d+)(?:[/?#&]|\.(?:xml|json)|$)' + _VALID_URL = r'''(?x) + videomore:(?P\d+)$| + https?://(?:player\.)?videomore\.ru/ + (?: + (?: + embed| + [^/]+/[^/]+ + )/| + [^/]*\?.*?\btrack_id= + ) + (?P\d+) + (?:[/?#&]|\.(?:xml|json)|$) + ''' _TESTS = [{ 'url': 'http://videomore.ru/kino_v_detalayah/5_sezon/367617', - 'md5': '70875fbf57a1cd004709920381587185', + 'md5': '44455a346edc0d509ac5b5a5b531dc35', 'info_dict': { 'id': '367617', 'ext': 'flv', - 'title': 'В гостях Алексей Чумаков и Юлия Ковальчук', - 'description': 'В гостях – лучшие романтические комедии года, «Выживший» Иньярриту и «Стив Джобс» Дэнни Бойла.', + 'title': 'Кино в деталях 5 сезон В гостях Алексей Чумаков и Юлия Ковальчук', 'series': 'Кино в деталях', 'episode': 'В гостях Алексей Чумаков и Юлия Ковальчук', - 'episode_number': None, - 'season': 'Сезон 2015', - 'season_number': 5, - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'duration': 2910, - 'age_limit': 16, 'view_count': int, + 'comment_count': int, + 'age_limit': 16, }, }, { 'url': 'http://videomore.ru/embed/259974', 'info_dict': { 'id': '259974', 'ext': 'flv', - 'title': '80 серия', - 'description': '«Медведей» ждет решающий матч. Макеев выясняет отношения со Стрельцовым. 
Парни узнают подробности прошлого Макеева.', + 'title': 'Молодежка 2 сезон 40 серия', 'series': 'Молодежка', - 'episode': '80 серия', - 'episode_number': 40, - 'season': '2 сезон', - 'season_number': 2, - 'thumbnail': 're:^https?://.*\.jpg', + 'episode': '40 серия', + 'thumbnail': r're:^https?://.*\.jpg', 'duration': 2809, - 'age_limit': 16, 'view_count': int, + 'comment_count': int, + 'age_limit': 16, }, 'params': { 'skip_download': True, @@ -58,14 +69,9 @@ 'info_dict': { 'id': '341073', 'ext': 'flv', - 'title': 'Команда проиграла из-за Бакина?', - 'description': 'Молодежка 3 сезон скоро', - 'series': 'Молодежка', + 'title': 'Промо Команда проиграла из-за Бакина?', 'episode': 'Команда проиграла из-за Бакина?', - 'episode_number': None, - 'season': 'Промо', - 'season_number': 99, - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'duration': 29, 'age_limit': 16, 'view_count': int, @@ -91,13 +97,21 @@ }, { 'url': 'videomore:367617', 'only_matching': True, + }, { + 'url': 'https://player.videomore.ru/?partner_id=97&track_id=736234&autoplay=0&userToken=', + 'only_matching': True, }] @staticmethod def _extract_url(webpage): mobj = re.search( - r']+data=(["\'])https?://videomore.ru/player\.swf\?.*config=(?Phttps?://videomore\.ru/(?:[^/]+/)+\d+\.xml).*\1', + r']+data=(["\'])https?://videomore\.ru/player\.swf\?.*config=(?Phttps?://videomore\.ru/(?:[^/]+/)+\d+\.xml).*\1', webpage) + if not mobj: + mobj = re.search( + r']+src=([\'"])(?Phttps?://videomore\.ru/embed/\d+)', + webpage) + if mobj: return mobj.group('url') @@ -109,42 +123,33 @@ 'http://videomore.ru/video/tracks/%s.xml' % video_id, video_id, 'Downloading video XML') - video_url = xpath_text(video, './/video_url', 'video url', fatal=True) + item = xpath_element(video, './/playlist/item', fatal=True) + + title = xpath_text( + item, ('./title', './episode_name'), 'title', fatal=True) + + video_url = xpath_text(item, './video_url', 'video url', fatal=True) formats = self._extract_f4m_formats(video_url, video_id, f4m_id='hds') + self._sort_formats(formats) + + thumbnail = xpath_text(item, './thumbnail_url') + duration = int_or_none(xpath_text(item, './duration')) + view_count = int_or_none(xpath_text(item, './views')) + comment_count = int_or_none(xpath_text(item, './count_comments')) + age_limit = int_or_none(xpath_text(item, './min_age')) - data = self._download_json( - 'http://videomore.ru/video/tracks/%s.json' % video_id, - video_id, 'Downloading video JSON') - - title = data.get('title') or data['project_title'] - description = data.get('description') or data.get('description_raw') - timestamp = parse_iso8601(data.get('published_at')) - duration = int_or_none(data.get('duration')) - view_count = int_or_none(data.get('views')) - age_limit = parse_age_limit(data.get('min_age')) - thumbnails = [{ - 'url': thumbnail, - } for thumbnail in data.get('big_thumbnail_urls', [])] - - series = data.get('project_title') - episode = data.get('title') - episode_number = int_or_none(data.get('episode_of_season') or None) - season = data.get('season_title') - season_number = int_or_none(data.get('season_pos') or None) + series = xpath_text(item, './project_name') + episode = xpath_text(item, './episode_name') return { 'id': video_id, 'title': title, - 'description': description, 'series': series, 'episode': episode, - 'episode_number': episode_number, - 'season': season, - 'season_number': season_number, - 'thumbnails': thumbnails, - 'timestamp': timestamp, + 'thumbnail': thumbnail, 'duration': duration, 'view_count': 
view_count, + 'comment_count': comment_count, 'age_limit': age_limit, 'formats': formats, } @@ -152,7 +157,7 @@ class VideomoreVideoIE(InfoExtractor): IE_NAME = 'videomore:video' - _VALID_URL = r'https?://videomore\.ru/(?:(?:[^/]+/){2})?(?P[^/?#&]+)[/?#&]*$' + _VALID_URL = r'https?://videomore\.ru/(?:(?:[^/]+/){2})?(?P[^/?#&]+)(?:/*|[?#&].*?)$' _TESTS = [{ # single video with og:video:iframe 'url': 'http://videomore.ru/elki_3', @@ -161,7 +166,7 @@ 'ext': 'flv', 'title': 'Ёлки 3', 'description': '', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'duration': 5579, 'age_limit': 6, 'view_count': int, @@ -184,7 +189,7 @@ 'ext': 'flv', 'title': '1 серия. Здравствуй, Аквавилль!', 'description': 'md5:c6003179538b5d353e7bcd5b1372b2d7', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'duration': 754, 'age_limit': 6, 'view_count': int, @@ -192,6 +197,9 @@ 'params': { 'skip_download': True, }, + }, { + 'url': 'https://videomore.ru/molodezhka/6_sezon/29_seriya?utm_so', + 'only_matching': True, }] @classmethod @@ -212,13 +220,16 @@ r'track-id=["\'](\d+)', r'xcnt_product_id\s*=\s*(\d+)'), webpage, 'video id') video_url = 'videomore:%s' % video_id + else: + video_id = None - return self.url_result(video_url, VideomoreIE.ie_key()) + return self.url_result( + video_url, ie=VideomoreIE.ie_key(), video_id=video_id) class VideomoreSeasonIE(InfoExtractor): IE_NAME = 'videomore:season' - _VALID_URL = r'https?://videomore\.ru/(?!embed)(?P[^/]+/[^/?#&]+)[/?#&]*$' + _VALID_URL = r'https?://videomore\.ru/(?!embed)(?P[^/]+/[^/?#&]+)(?:/*|[?#&].*?)$' _TESTS = [{ 'url': 'http://videomore.ru/molodezhka/sezon_promo', 'info_dict': { @@ -226,8 +237,16 @@ 'title': 'Молодежка Промо', }, 'playlist_mincount': 12, + }, { + 'url': 'http://videomore.ru/molodezhka/sezon_promo?utm_so', + 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return (False if (VideomoreIE.suitable(url) or VideomoreVideoIE.suitable(url)) + else super(VideomoreSeasonIE, cls).suitable(url)) + def _real_extract(self, url): display_id = self._match_id(url) @@ -235,9 +254,54 @@ title = self._og_search_title(webpage) - entries = [ - self.url_result(item) for item in re.findall( - r']+href="((?:https?:)?//videomore\.ru/%s/[^/]+)"[^>]+class="widget-item-desc"' - % display_id, webpage)] + data = self._parse_json( + self._html_search_regex( + r'\bclass=["\']seasons-tracks["\'][^>]+\bdata-custom-data=(["\'])(?P{.+?})\1', + webpage, 'data', default='{}', group='value'), + display_id, fatal=False) + + entries = [] + + if data: + episodes = data.get('episodes') + if isinstance(episodes, list): + for ep in episodes: + if not isinstance(ep, dict): + continue + ep_id = int_or_none(ep.get('id')) + ep_url = url_or_none(ep.get('url')) + if ep_id: + e = { + 'url': 'videomore:%s' % ep_id, + 'id': compat_str(ep_id), + } + elif ep_url: + e = {'url': ep_url} + else: + continue + e.update({ + '_type': 'url', + 'ie_key': VideomoreIE.ie_key(), + 'title': str_or_none(ep.get('title')), + 'thumbnail': url_or_none(ep.get('image')), + 'duration': parse_duration(ep.get('duration')), + 'episode_number': int_or_none(ep.get('number')), + 'upload_date': unified_strdate(ep.get('date')), + }) + entries.append(e) + + if not entries: + entries = [ + self.url_result( + 'videomore:%s' % video_id, ie=VideomoreIE.ie_key(), + video_id=video_id) + for video_id in orderedSet(re.findall( + r':(?:id|key)=["\'](\d+)["\']', webpage))] + + if not entries: + entries = [ + self.url_result(item) for item in re.findall( + 
r']+href="((?:https?:)?//videomore\.ru/%s/[^/]+)"[^>]+class="widget-item-desc"' + % display_id, webpage)] return self.playlist_result(entries, display_id, title) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/videopremium.py youtube-dl-2019.05.20/youtube_dl/extractor/videopremium.py --- youtube-dl-2016.02.22/youtube_dl/extractor/videopremium.py 2016-02-14 22:26:17.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/videopremium.py 2019-06-07 18:49:07.000000000 +0000 @@ -26,7 +26,7 @@ webpage_url = 'http://videopremium.tv/' + video_id webpage = self._download_webpage(webpage_url, video_id) - if re.match(r'^]*>window.location\s*=', webpage): + if re.match(r'^]*>window\.location\s*=', webpage): # Download again, we need a cookie webpage = self._download_webpage( webpage_url, video_id, diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/videopress.py youtube-dl-2019.05.20/youtube_dl/extractor/videopress.py --- youtube-dl-2016.02.22/youtube_dl/extractor/videopress.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/videopress.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,96 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + float_or_none, + parse_age_limit, + qualities, + random_birthday, + try_get, + unified_timestamp, + urljoin, +) + + +class VideoPressIE(InfoExtractor): + _VALID_URL = r'https?://videopress\.com/embed/(?P[\da-zA-Z]+)' + _TESTS = [{ + 'url': 'https://videopress.com/embed/kUJmAcSf', + 'md5': '706956a6c875873d51010921310e4bc6', + 'info_dict': { + 'id': 'kUJmAcSf', + 'ext': 'mp4', + 'title': 'VideoPress Demo', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 634.6, + 'timestamp': 1434983935, + 'upload_date': '20150622', + 'age_limit': 0, + }, + }, { + # 17+, requires birth_* params + 'url': 'https://videopress.com/embed/iH3gstfZ', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r']+src=["\']((?:https?://)?videopress\.com/embed/[\da-zA-Z]+)', + webpage) + + def _real_extract(self, url): + video_id = self._match_id(url) + + query = random_birthday('birth_year', 'birth_month', 'birth_day') + video = self._download_json( + 'https://public-api.wordpress.com/rest/v1.1/videos/%s' % video_id, + video_id, query=query) + + title = video['title'] + + def base_url(scheme): + return try_get( + video, lambda x: x['file_url_base'][scheme], compat_str) + + base_url = base_url('https') or base_url('http') + + QUALITIES = ('std', 'dvd', 'hd') + quality = qualities(QUALITIES) + + formats = [] + for format_id, f in video['files'].items(): + if not isinstance(f, dict): + continue + for ext, path in f.items(): + if ext in ('mp4', 'ogg'): + formats.append({ + 'url': urljoin(base_url, path), + 'format_id': '%s-%s' % (format_id, ext), + 'ext': determine_ext(path, ext), + 'quality': quality(format_id), + }) + original_url = try_get(video, lambda x: x['original'], compat_str) + if original_url: + formats.append({ + 'url': original_url, + 'format_id': 'original', + 'quality': len(QUALITIES), + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': video.get('description'), + 'thumbnail': video.get('poster'), + 'duration': float_or_none(video.get('duration'), 1000), + 'timestamp': unified_timestamp(video.get('upload_date')), + 'age_limit': parse_age_limit(video.get('rating')), + 'formats': formats, + } diff -Nru 
youtube-dl-2016.02.22/youtube_dl/extractor/videott.py youtube-dl-2019.05.20/youtube_dl/extractor/videott.py --- youtube-dl-2016.02.22/youtube_dl/extractor/videott.py 2016-01-14 09:35:12.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/videott.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,65 +0,0 @@ -from __future__ import unicode_literals - -import re -import base64 - -from .common import InfoExtractor -from ..utils import ( - unified_strdate, - int_or_none, -) - - -class VideoTtIE(InfoExtractor): - _WORKING = False - ID_NAME = 'video.tt' - IE_DESC = 'video.tt - Your True Tube' - _VALID_URL = r'http://(?:www\.)?video\.tt/(?:(?:video|embed)/|watch_video\.php\?v=)(?P[\da-zA-Z]{9})' - - _TESTS = [{ - 'url': 'http://www.video.tt/watch_video.php?v=amd5YujV8', - 'md5': 'b13aa9e2f267effb5d1094443dff65ba', - 'info_dict': { - 'id': 'amd5YujV8', - 'ext': 'flv', - 'title': 'Motivational video Change your mind in just 2.50 mins', - 'description': '', - 'upload_date': '20130827', - 'uploader': 'joseph313', - } - }, { - 'url': 'http://video.tt/embed/amd5YujV8', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - settings = self._download_json( - 'http://www.video.tt/player_control/settings.php?v=%s' % video_id, video_id, - 'Downloading video JSON')['settings'] - - video = settings['video_details']['video'] - - formats = [ - { - 'url': base64.b64decode(res['u'].encode('utf-8')).decode('utf-8'), - 'ext': 'flv', - 'format_id': res['l'], - } for res in settings['res'] if res['u'] - ] - - return { - 'id': video_id, - 'title': video['title'], - 'description': video['description'], - 'thumbnail': settings['config']['thumbnail'], - 'upload_date': unified_strdate(video['added']), - 'uploader': video['owner'], - 'view_count': int_or_none(video['view_count']), - 'comment_count': None if video.get('comment_count') == '--' else int_or_none(video['comment_count']), - 'like_count': int_or_none(video['liked']), - 'dislike_count': int_or_none(video['disliked']), - 'formats': formats, - } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/vidio.py youtube-dl-2019.05.20/youtube_dl/extractor/vidio.py --- youtube-dl-2016.02.22/youtube_dl/extractor/vidio.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/vidio.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,77 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import int_or_none + + +class VidioIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vidio\.com/watch/(?P\d+)-(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015', + 'md5': 'cd2801394afc164e9775db6a140b91fe', + 'info_dict': { + 'id': '165683', + 'display_id': 'dj_ambred-booyah-live-2015', + 'ext': 'mp4', + 'title': 'DJ_AMBRED - Booyah (Live 2015)', + 'description': 'md5:27dc15f819b6a78a626490881adbadf8', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 149, + 'like_count': int, + }, + }, { + 'url': 'https://www.vidio.com/watch/77949-south-korea-test-fires-missile-that-can-strike-all-of-the-north', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id, display_id = mobj.group('id', 'display_id') + + webpage = self._download_webpage(url, display_id) + + title = self._og_search_title(webpage) + + m3u8_url, duration, thumbnail = [None] * 3 + + clips = self._parse_json( + self._html_search_regex( + 
r'data-json-clips\s*=\s*(["\'])(?P\[.+?\])\1', + webpage, 'video data', default='[]', group='data'), + display_id, fatal=False) + if clips: + clip = clips[0] + m3u8_url = clip.get('sources', [{}])[0].get('file') + duration = clip.get('clip_duration') + thumbnail = clip.get('image') + + m3u8_url = m3u8_url or self._search_regex( + r'data(?:-vjs)?-clip-hls-url=(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'hls url', group='url') + formats = self._extract_m3u8_formats( + m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native') + self._sort_formats(formats) + + duration = int_or_none(duration or self._search_regex( + r'data-video-duration=(["\'])(?P\d+)\1', webpage, + 'duration', fatal=False, group='duration')) + thumbnail = thumbnail or self._og_search_thumbnail(webpage) + + like_count = int_or_none(self._search_regex( + (r']+data-comment-vote-count=["\'](\d+)', + r']+class=["\'].*?\blike(?:__|-)count\b.*?["\'][^>]*>\s*(\d+)'), + webpage, 'like count', fatal=False)) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': self._og_search_description(webpage), + 'thumbnail': thumbnail, + 'duration': duration, + 'like_count': like_count, + 'formats': formats, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/vidlii.py youtube-dl-2019.05.20/youtube_dl/extractor/vidlii.py --- youtube-dl-2016.02.22/youtube_dl/extractor/vidlii.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/vidlii.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,125 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + float_or_none, + get_element_by_id, + int_or_none, + strip_or_none, + unified_strdate, + urljoin, +) + + +class VidLiiIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vidlii\.com/(?:watch|embed)\?.*?\bv=(?P[0-9A-Za-z_-]{11})' + _TESTS = [{ + 'url': 'https://www.vidlii.com/watch?v=tJluaH4BJ3v', + 'md5': '9bf7d1e005dfa909b6efb0a1ff5175e2', + 'info_dict': { + 'id': 'tJluaH4BJ3v', + 'ext': 'mp4', + 'title': 'Vidlii is against me', + 'description': 'md5:fa3f119287a2bfb922623b52b1856145', + 'thumbnail': 're:https://.*.jpg', + 'uploader': 'APPle5auc31995', + 'uploader_url': 'https://www.vidlii.com/user/APPle5auc31995', + 'upload_date': '20171107', + 'duration': 212, + 'view_count': int, + 'comment_count': int, + 'average_rating': float, + 'categories': ['News & Politics'], + 'tags': ['Vidlii', 'Jan', 'Videogames'], + } + }, { + 'url': 'https://www.vidlii.com/embed?v=tJluaH4BJ3v&a=0', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'https://www.vidlii.com/watch?v=%s' % video_id, video_id) + + video_url = self._search_regex( + r'src\s*:\s*(["\'])(?P(?:https?://)?(?:(?!\1).)+)\1', webpage, + 'video url', group='url') + + title = self._search_regex( + (r'
<h1>([^<]+)</h1>
    ', r'([^<]+) - VidLii<'), webpage, + 'title') + + description = self._html_search_meta( + ('description', 'twitter:description'), webpage, + default=None) or strip_or_none( + get_element_by_id('des_text', webpage)) + + thumbnail = self._html_search_meta( + 'twitter:image', webpage, default=None) + if not thumbnail: + thumbnail_path = self._search_regex( + r'img\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'thumbnail', fatal=False, group='url') + if thumbnail_path: + thumbnail = urljoin(url, thumbnail_path) + + uploader = self._search_regex( + r'<div[^>]+class=["\']wt_person[^>]+>\s*<a[^>]+\bhref=["\']/user/[^>]+>([^<]+)', + webpage, 'uploader', fatal=False) + uploader_url = 'https://www.vidlii.com/user/%s' % uploader if uploader else None + + upload_date = unified_strdate(self._html_search_meta( + 'datePublished', webpage, default=None) or self._search_regex( + r'<date>([^<]+)', webpage, 'upload date', fatal=False)) + + duration = int_or_none(self._html_search_meta( + 'video:duration', webpage, 'duration', + default=None) or self._search_regex( + r'duration\s*:\s*(\d+)', webpage, 'duration', fatal=False)) + + view_count = int_or_none(self._search_regex( + (r'<strong>(\d+)</strong> views', + r'Views\s*:\s*<strong>(\d+)</strong>'), + webpage, 'view count', fatal=False)) + + comment_count = int_or_none(self._search_regex( + (r'<span[^>]+id=["\']cmt_num[^>]+>(\d+)', + r'Comments\s*:\s*<strong>(\d+)'), + webpage, 'comment count', fatal=False)) + + average_rating = float_or_none(self._search_regex( + r'rating\s*:\s*([\d.]+)', webpage, 'average rating', fatal=False)) + + category = self._html_search_regex( + r'<div>Category\s*:\s*</div>\s*<div>\s*<a[^>]+>([^<]+)', webpage, + 'category', fatal=False) + categories = [category] if category else None + + tags = [ + strip_or_none(tag) + for tag in re.findall( + r'<a[^>]+\bhref=["\']/results\?.*?q=[^>]*>([^<]+)', + webpage) if strip_or_none(tag) + ] or None + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'uploader_url': uploader_url, + 'upload_date': upload_date, + 'duration': duration, + 'view_count': view_count, + 'comment_count': comment_count, + 'average_rating': average_rating, + 'categories': categories, + 'tags': tags, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/vidme.py youtube-dl-2019.05.20/youtube_dl/extractor/vidme.py --- youtube-dl-2016.02.22/youtube_dl/extractor/vidme.py 2016-02-09 11:57:41.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/vidme.py 2019-06-07 18:49:07.000000000 +0000 @@ -9,6 +9,7 @@ int_or_none, float_or_none, parse_iso8601, + url_or_none, ) @@ -23,7 +24,7 @@ 'ext': 'mp4', 'title': 'Fishing for piranha - the easy way', 'description': 'source: https://www.facebook.com/photo.php?v=312276045600871', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'timestamp': 1406313244, 'upload_date': '20140725', 'age_limit': 0, @@ -39,7 +40,7 @@ 'id': 'Gc6M', 'ext': 'mp4', 'title': 'O Mere Dil ke chain - Arnav and Khushi VM', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'timestamp': 1441211642, 'upload_date': '20150902', 'uploader': 'SunshineM', @@ -61,7 +62,7 @@ 'ext': 'mp4', 'title': 'The Carver', 'description': 'md5:e9c24870018ae8113be936645b93ba3c', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'timestamp': 1433203629, 'upload_date': '20150602', 'uploader': 'Thomas', @@ -82,7 +83,7 @@ 'id': 'Wmur', 'ext': 
'mp4', 'title': 'naked smoking & stretching', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'timestamp': 1430931613, 'upload_date': '20150506', 'uploader': 'naked-yogi', @@ -115,7 +116,7 @@ 'id': 'e5g', 'ext': 'mp4', 'title': 'Video upload (e5g)', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'timestamp': 1401480195, 'upload_date': '20140530', 'uploader': None, @@ -161,13 +162,28 @@ 'or for violating the terms of use.', expected=True) - formats = [{ - 'format_id': f.get('type'), - 'url': f['uri'], - 'width': int_or_none(f.get('width')), - 'height': int_or_none(f.get('height')), - 'preference': 0 if f.get('type', '').endswith('clip') else 1, - } for f in video.get('formats', []) if f.get('uri')] + formats = [] + for f in video.get('formats', []): + format_url = url_or_none(f.get('uri')) + if not format_url: + continue + format_type = f.get('type') + if format_type == 'dash': + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False)) + elif format_type == 'hls': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'format_id': f.get('type'), + 'url': format_url, + 'width': int_or_none(f.get('width')), + 'height': int_or_none(f.get('height')), + 'preference': 0 if f.get('type', '').endswith( + 'clip') else 1, + }) if not formats and video.get('complete_url'): formats.append({ @@ -245,29 +261,35 @@ class VidmeUserIE(VidmeListBaseIE): IE_NAME = 'vidme:user' - _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]{6,})(?!/likes)(?:[^\da-zA-Z]|$)' + _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z_-]{6,})(?!/likes)(?:[^\da-zA-Z_-]|$)' _API_ITEM = 'list' _TITLE = 'Videos' - _TEST = { - 'url': 'https://vid.me/EFARCHIVE', + _TESTS = [{ + 'url': 'https://vid.me/MasakoX', 'info_dict': { - 'id': '3834632', - 'title': 'EFARCHIVE - %s' % _TITLE, + 'id': '16112341', + 'title': 'MasakoX - %s' % _TITLE, }, - 'playlist_mincount': 238, - } + 'playlist_mincount': 191, + }, { + 'url': 'https://vid.me/unsQuare_netWork', + 'only_matching': True, + }] class VidmeUserLikesIE(VidmeListBaseIE): IE_NAME = 'vidme:user:likes' - _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]{6,})/likes' + _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z_-]{6,})/likes' _API_ITEM = 'likes' _TITLE = 'Likes' - _TEST = { + _TESTS = [{ 'url': 'https://vid.me/ErinAlexis/likes', 'info_dict': { 'id': '6483530', 'title': 'ErinAlexis - %s' % _TITLE, }, 'playlist_mincount': 415, - } + }, { + 'url': 'https://vid.me/Kaleidoscope-Ish/likes', + 'only_matching': True, + }] diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/vidzi.py youtube-dl-2019.05.20/youtube_dl/extractor/vidzi.py --- youtube-dl-2016.02.22/youtube_dl/extractor/vidzi.py 2016-02-01 10:41:30.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/vidzi.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,39 +1,68 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor -from ..utils import smuggle_url +from ..utils import ( + decode_packed_codes, + js_to_json, + NO_DEFAULT, + PACKED_CODES_RE, +) class VidziIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vidzi\.tv/(?P<id>\w+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?vidzi\.(?:tv|cc|si|nu)/(?:embed-)?(?P<id>[0-9a-zA-Z]+)' + _TESTS = [{ 'url': 'http://vidzi.tv/cghql9yq6emu.html', 'md5': '4f16c71ca0c8c8635ab6932b5f3f1660', 'info_dict': { 
'id': 'cghql9yq6emu', 'ext': 'mp4', 'title': 'youtube-dl test video 1\\\\2\'3/4<5\\\\6ä7↭', - 'uploader': 'vidzi.tv', }, 'params': { # m3u8 download 'skip_download': True, }, - } + }, { + 'url': 'http://vidzi.tv/embed-4z2yb0rzphe9-600x338.html', + 'only_matching': True, + }, { + 'url': 'http://vidzi.cc/cghql9yq6emu.html', + 'only_matching': True, + }, { + 'url': 'https://vidzi.si/rph9gztxj1et.html', + 'only_matching': True, + }, { + 'url': 'http://vidzi.nu/cghql9yq6emu.html', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage( + 'http://vidzi.tv/%s' % video_id, video_id) title = self._html_search_regex( r'(?s)<h2 class="video-title">(.*?)</h2>', webpage, 'title') - # Vidzi now uses jwplayer, which can be handled by GenericIE - return { - '_type': 'url_transparent', - 'id': video_id, - 'title': title, - 'url': smuggle_url(url, {'to_generic': True}), - 'ie_key': 'Generic', - } + codes = [webpage] + codes.extend([ + decode_packed_codes(mobj.group(0)).replace('\\\'', '\'') + for mobj in re.finditer(PACKED_CODES_RE, webpage)]) + for num, code in enumerate(codes, 1): + jwplayer_data = self._parse_json( + self._search_regex( + r'setup\(([^)]+)\)', code, 'jwplayer data', + default=NO_DEFAULT if num == len(codes) else '{}'), + video_id, transform_source=lambda s: js_to_json( + re.sub(r'\s*\+\s*window\[.+?\]', '', s))) + if jwplayer_data: + break + + info_dict = self._parse_jwplayer_data(jwplayer_data, video_id, require_title=False) + info_dict['title'] = title + + return info_dict diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/vier.py youtube-dl-2019.05.20/youtube_dl/extractor/vier.py --- youtube-dl-2016.02.22/youtube_dl/extractor/vier.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/vier.py 2019-06-07 18:49:07.000000000 +0000 @@ -5,55 +5,190 @@ import itertools from .common import InfoExtractor +from ..utils import ( + urlencode_postdata, + int_or_none, + unified_strdate, +) class VierIE(InfoExtractor): IE_NAME = 'vier' - _VALID_URL = r'https?://(?:www\.)?vier\.be/(?:[^/]+/videos/(?P<display_id>[^/]+)(?:/(?P<id>\d+))?|video/v3/embed/(?P<embed_id>\d+))' + IE_DESC = 'vier.be and vijf.be' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?(?P<site>vier|vijf)\.be/ + (?: + (?: + [^/]+/videos| + video(?:/[^/]+)* + )/ + (?P<display_id>[^/]+)(?:/(?P<id>\d+))?| + (?: + video/v3/embed| + embed/video/public + )/(?P<embed_id>\d+) + ) + ''' + _NETRC_MACHINE = 'vier' _TESTS = [{ 'url': 'http://www.vier.be/planb/videos/het-wordt-warm-de-moestuin/16129', + 'md5': 'e4ae2054a6b040ef1e289e20d111b46e', 'info_dict': { 'id': '16129', 'display_id': 'het-wordt-warm-de-moestuin', 'ext': 'mp4', 'title': 'Het wordt warm in De Moestuin', 'description': 'De vele uren werk eisen hun tol. 
Wim droomt van assistentie...', + 'upload_date': '20121025', + 'series': 'Plan B', + 'tags': ['De Moestuin', 'Moestuin', 'meisjes', 'Tomaat', 'Wim', 'Droom'], + }, + }, { + 'url': 'http://www.vijf.be/temptationisland/videos/zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas/2561614', + 'info_dict': { + 'id': '2561614', + 'display_id': 'zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas', + 'ext': 'mp4', + 'title': 'md5:84f45fe48b8c1fa296a7f6d208d080a7', + 'description': 'md5:0356d4981e58b8cbee19355cbd51a8fe', + 'upload_date': '20170228', + 'series': 'Temptation Island', + 'tags': list, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839', + 'info_dict': { + 'id': '2674839', + 'display_id': 'jani-gaat-naar-tokio-aflevering-4', + 'ext': 'mp4', + 'title': 'Jani gaat naar Tokio - Aflevering 4', + 'description': 'md5:aa8d611541db6ae9e863125704511f88', + 'upload_date': '20170501', + 'series': 'Jani gaat', + 'episode_number': 4, + 'tags': ['Jani Gaat', 'Volledige Aflevering'], + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Requires account credentials', + }, { + # Requires account credentials but bypassed extraction via v3/embed page + # without metadata + 'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839', + 'info_dict': { + 'id': '2674839', + 'display_id': 'jani-gaat-naar-tokio-aflevering-4', + 'ext': 'mp4', + 'title': 'jani-gaat-naar-tokio-aflevering-4', }, 'params': { - # m3u8 download 'skip_download': True, }, + 'expected_warnings': ['Log in to extract metadata'], }, { - 'url': 'http://www.vier.be/planb/videos/mieren-herders-van-de-bladluizen', + # Without video id in URL + 'url': 'http://www.vier.be/planb/videos/dit-najaar-plan-b', 'only_matching': True, }, { 'url': 'http://www.vier.be/video/v3/embed/16129', 'only_matching': True, + }, { + 'url': 'https://www.vijf.be/embed/video/public/4093', + 'only_matching': True, + }, { + 'url': 'https://www.vier.be/video/blockbusters/in-juli-en-augustus-summer-classics', + 'only_matching': True, + }, { + 'url': 'https://www.vier.be/video/achter-de-rug/2017/achter-de-rug-seizoen-1-aflevering-6', + 'only_matching': True, }] + def _real_initialize(self): + self._logged_in = False + + def _login(self, site): + username, password = self._get_login_info() + if username is None or password is None: + return + + login_page = self._download_webpage( + 'http://www.%s.be/user/login' % site, + None, note='Logging in', errnote='Unable to log in', + data=urlencode_postdata({ + 'form_id': 'user_login', + 'name': username, + 'pass': password, + }), + headers={'Content-Type': 'application/x-www-form-urlencoded'}) + + login_error = self._html_search_regex( + r'(?s)<div class="messages error">\s*<div>\s*<h2.+?</h2>(.+?)<', + login_page, 'login error', default=None) + if login_error: + self.report_warning('Unable to log in: %s' % login_error) + else: + self._logged_in = True + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) embed_id = mobj.group('embed_id') display_id = mobj.group('display_id') or embed_id + video_id = mobj.group('id') or embed_id + site = mobj.group('site') + + if not self._logged_in: + self._login(site) webpage = self._download_webpage(url, display_id) + if r'id="user-login"' in webpage: + self.report_warning( + 'Log in to extract metadata', video_id=display_id) + webpage = self._download_webpage( + 'http://www.%s.be/video/v3/embed/%s' % (site, 
video_id), + display_id) + video_id = self._search_regex( [r'data-nid="(\d+)"', r'"nid"\s*:\s*"(\d+)"'], - webpage, 'video id') - application = self._search_regex( - [r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"'], - webpage, 'application', default='vier_vod') - filename = self._search_regex( - [r'data-filename="([^"]+)"', r'"filename"\s*:\s*"([^"]+)"'], - webpage, 'filename') + webpage, 'video id', default=video_id or display_id) - playlist_url = 'http://vod.streamcloud.be/%s/mp4:_definst_/%s.mp4/playlist.m3u8' % (application, filename) - formats = self._extract_m3u8_formats(playlist_url, display_id, 'mp4') + playlist_url = self._search_regex( + r'data-file=(["\'])(?P<url>(?:https?:)?//[^/]+/.+?\.m3u8.*?)\1', + webpage, 'm3u8 url', default=None, group='url') + + if not playlist_url: + application = self._search_regex( + [r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"'], + webpage, 'application', default=site + '_vod') + filename = self._search_regex( + [r'data-filename="([^"]+)"', r'"filename"\s*:\s*"([^"]+)"'], + webpage, 'filename') + playlist_url = 'http://vod.streamcloud.be/%s/_definst_/mp4:%s.mp4/playlist.m3u8' % (application, filename) + + formats = self._extract_wowza_formats( + playlist_url, display_id, skip_protocols=['dash']) + self._sort_formats(formats) title = self._og_search_title(webpage, default=display_id) - description = self._og_search_description(webpage, default=None) + description = self._html_search_regex( + r'(?s)<div\b[^>]+\bclass=(["\'])[^>]*?\bfield-type-text-with-summary\b[^>]*?\1[^>]*>.*?<p>(?P<value>.+?)</p>', + webpage, 'description', default=None, group='value') thumbnail = self._og_search_thumbnail(webpage, default=None) + upload_date = unified_strdate(self._html_search_regex( + r'(?s)<div\b[^>]+\bclass=(["\'])[^>]*?\bfield-name-post-date\b[^>]*?\1[^>]*>.*?(?P<value>\d{2}/\d{2}/\d{4})', + webpage, 'upload date', default=None, group='value')) + + series = self._search_regex( + r'data-program=(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, + 'series', default=None, group='value') + episode_number = int_or_none(self._search_regex( + r'(?i)aflevering (\d+)', title, 'episode number', default=None)) + tags = re.findall(r'<a\b[^>]+\bhref=["\']/tags/[^>]+>([^<]+)<', webpage) return { 'id': video_id, @@ -61,13 +196,17 @@ 'title': title, 'description': description, 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'series': series, + 'episode_number': episode_number, + 'tags': tags, 'formats': formats, } class VierVideosIE(InfoExtractor): IE_NAME = 'vier:videos' - _VALID_URL = r'https?://(?:www\.)?vier\.be/(?P<program>[^/]+)/videos(?:\?.*\bpage=(?P<page>\d+)|$)' + _VALID_URL = r'https?://(?:www\.)?(?P<site>vier|vijf)\.be/(?P<program>[^/]+)/videos(?:\?.*\bpage=(?P<page>\d+)|$)' _TESTS = [{ 'url': 'http://www.vier.be/demoestuin/videos', 'info_dict': { @@ -75,6 +214,12 @@ }, 'playlist_mincount': 153, }, { + 'url': 'http://www.vijf.be/temptationisland/videos', + 'info_dict': { + 'id': 'temptationisland', + }, + 'playlist_mincount': 159, + }, { 'url': 'http://www.vier.be/demoestuin/videos?page=6', 'info_dict': { 'id': 'demoestuin-page6', @@ -91,6 +236,7 @@ def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) program = mobj.group('program') + site = mobj.group('site') page_id = mobj.group('page') if page_id: @@ -104,13 +250,13 @@ entries = [] for current_page_id in itertools.count(start_page): current_page = self._download_webpage( - 'http://www.vier.be/%s/videos?page=%d' % (program, current_page_id), + 
'http://www.%s.be/%s/videos?page=%d' % (site, program, current_page_id), program, 'Downloading page %d' % (current_page_id + 1)) page_entries = [ - self.url_result('http://www.vier.be' + video_url, 'Vier') + self.url_result('http://www.' + site + '.be' + video_url, 'Vier') for video_url in re.findall( - r'<h3><a href="(/[^/]+/videos/[^/]+(?:/\d+)?)">', current_page)] + r'<h[23]><a href="(/[^/]+/videos/[^/]+(?:/\d+)?)">', current_page)] entries.extend(page_entries) if page_id or '>Meer<' not in current_page: break diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/viewlift.py youtube-dl-2019.05.20/youtube_dl/extractor/viewlift.py --- youtube-dl-2016.02.22/youtube_dl/extractor/viewlift.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/viewlift.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,266 @@ +from __future__ import unicode_literals + +import base64 +import re + +from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote +from ..utils import ( + ExtractorError, + clean_html, + determine_ext, + int_or_none, + js_to_json, + parse_age_limit, + parse_duration, +) + + +class ViewLiftBaseIE(InfoExtractor): + _DOMAINS_REGEX = r'(?:snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|(?:monumental|lax)sportsnetwork|vayafilm)\.com|hoichoi\.tv' + + +class ViewLiftEmbedIE(ViewLiftBaseIE): + _VALID_URL = r'https?://(?:(?:www|embed)\.)?(?:%s)/embed/player\?.*\bfilmId=(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' % ViewLiftBaseIE._DOMAINS_REGEX + _TESTS = [{ + 'url': 'http://embed.snagfilms.com/embed/player?filmId=74849a00-85a9-11e1-9660-123139220831&w=500', + 'md5': '2924e9215c6eff7a55ed35b72276bd93', + 'info_dict': { + 'id': '74849a00-85a9-11e1-9660-123139220831', + 'ext': 'mp4', + 'title': '#whilewewatch', + } + }, { + # invalid labels, 360p is better than 480p + 'url': 'http://www.snagfilms.com/embed/player?filmId=17ca0950-a74a-11e0-a92a-0026bb61d036', + 'md5': '882fca19b9eb27ef865efeeaed376a48', + 'info_dict': { + 'id': '17ca0950-a74a-11e0-a92a-0026bb61d036', + 'ext': 'mp4', + 'title': 'Life in Limbo', + } + }, { + 'url': 'http://www.snagfilms.com/embed/player?filmId=0000014c-de2f-d5d6-abcf-ffef58af0017', + 'only_matching': True, + }] + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:embed\.)?(?:%s)/embed/player.+?)\1' % ViewLiftBaseIE._DOMAINS_REGEX, + webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + if '>This film is not playable in your area.<' in webpage: + raise ExtractorError( + 'Film %s is not playable in your area.' 
% video_id, expected=True) + + formats = [] + has_bitrate = False + sources = self._parse_json(self._search_regex( + r'(?s)sources:\s*(\[.+?\]),', webpage, + 'sources', default='[]'), video_id, js_to_json) + for source in sources: + file_ = source.get('file') + if not file_: + continue + type_ = source.get('type') + ext = determine_ext(file_) + format_id = source.get('label') or ext + if all(v in ('m3u8', 'hls') for v in (type_, ext)): + formats.extend(self._extract_m3u8_formats( + file_, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + else: + bitrate = int_or_none(self._search_regex( + [r'(\d+)kbps', r'_\d{1,2}x\d{1,2}_(\d{3,})\.%s' % ext], + file_, 'bitrate', default=None)) + if not has_bitrate and bitrate: + has_bitrate = True + height = int_or_none(self._search_regex( + r'^(\d+)[pP]$', format_id, 'height', default=None)) + formats.append({ + 'url': file_, + 'format_id': 'http-%s%s' % (format_id, ('-%dk' % bitrate if bitrate else '')), + 'tbr': bitrate, + 'height': height, + }) + if not formats: + hls_url = self._parse_json(self._search_regex( + r'filmInfo\.src\s*=\s*({.+?});', + webpage, 'src'), video_id, js_to_json)['src'] + formats = self._extract_m3u8_formats( + hls_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False) + field_preference = None if has_bitrate else ('height', 'tbr', 'format_id') + self._sort_formats(formats, field_preference) + + title = self._search_regex( + [r"title\s*:\s*'([^']+)'", r'<title>([^<]+)'], + webpage, 'title') + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + } + + +class ViewLiftIE(ViewLiftBaseIE): + _VALID_URL = r'https?://(?:www\.)?(?P<domain>%s)/(?:films/title|show|(?:news/)?videos?)/(?P<id>[^?#]+)' % ViewLiftBaseIE._DOMAINS_REGEX + _TESTS = [{ + 'url': 'http://www.snagfilms.com/films/title/lost_for_life', + 'md5': '19844f897b35af219773fd63bdec2942', + 'info_dict': { + 'id': '0000014c-de2f-d5d6-abcf-ffef58af0017', + 'display_id': 'lost_for_life', + 'ext': 'mp4', + 'title': 'Lost for Life', + 'description': 'md5:ea10b5a50405ae1f7b5269a6ec594102', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 4489, + 'categories': 'mincount:3', + 'age_limit': 14, + 'upload_date': '20150421', + 'timestamp': 1429656819, + } + }, { + 'url': 'http://www.snagfilms.com/show/the_world_cut_project/india', + 'md5': 'e6292e5b837642bbda82d7f8bf3fbdfd', + 'info_dict': { + 'id': '00000145-d75c-d96e-a9c7-ff5c67b20000', + 'display_id': 'the_world_cut_project/india', + 'ext': 'mp4', + 'title': 'India', + 'description': 'md5:5c168c5a8f4719c146aad2e0dfac6f5f', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 979, + 'categories': 'mincount:2', + 'timestamp': 1399478279, + 'upload_date': '20140507', + } + }, { + # Film is not playable in your area. + 'url': 'http://www.snagfilms.com/films/title/inside_mecca', + 'only_matching': True, + }, { + # Film is not available. + 'url': 'http://www.snagfilms.com/show/augie_alone/flirting', + 'only_matching': True, + }, { + 'url': 'http://www.winnersview.com/videos/the-good-son', + 'only_matching': True, + }, { + # Was once Kaltura embed + 'url': 'https://www.monumentalsportsnetwork.com/videos/john-carlson-postgame-2-25-15', + 'only_matching': True, + }] + + def _real_extract(self, url): + domain, display_id = re.match(self._VALID_URL, url).groups() + + webpage = self._download_webpage(url, display_id) + + if ">Sorry, the Film you're looking for is not available.<" in webpage: + raise ExtractorError( + 'Film %s is not available.' 
% display_id, expected=True) + + initial_store_state = self._search_regex( + r"window\.initialStoreState\s*=.*?JSON\.parse\(unescape\(atob\('([^']+)'\)\)\)", + webpage, 'Initial Store State', default=None) + if initial_store_state: + modules = self._parse_json(compat_urllib_parse_unquote(base64.b64decode( + initial_store_state).decode()), display_id)['page']['data']['modules'] + content_data = next(m['contentData'][0] for m in modules if m.get('moduleType') == 'VideoDetailModule') + gist = content_data['gist'] + film_id = gist['id'] + title = gist['title'] + video_assets = content_data['streamingInfo']['videoAssets'] + + formats = [] + mpeg_video_assets = video_assets.get('mpeg') or [] + for video_asset in mpeg_video_assets: + video_asset_url = video_asset.get('url') + if not video_asset_url: + continue + bitrate = int_or_none(video_asset.get('bitrate')) + height = int_or_none(self._search_regex( + r'^_?(\d+)[pP]$', video_asset.get('renditionValue'), + 'height', default=None)) + formats.append({ + 'url': video_asset_url, + 'format_id': 'http%s' % ('-%d' % bitrate if bitrate else ''), + 'tbr': bitrate, + 'height': height, + 'vcodec': video_asset.get('codec'), + }) + + hls_url = video_assets.get('hls') + if hls_url: + formats.extend(self._extract_m3u8_formats( + hls_url, film_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + self._sort_formats(formats, ('height', 'tbr', 'format_id')) + + info = { + 'id': film_id, + 'display_id': display_id, + 'title': title, + 'description': gist.get('description'), + 'thumbnail': gist.get('videoImageUrl'), + 'duration': int_or_none(gist.get('runtime')), + 'age_limit': parse_age_limit(content_data.get('parentalRating')), + 'timestamp': int_or_none(gist.get('publishDate'), 1000), + 'formats': formats, + } + for k in ('categories', 'tags'): + info[k] = [v['title'] for v in content_data.get(k, []) if v.get('title')] + return info + else: + film_id = self._search_regex(r'filmId=([\da-f-]{36})"', webpage, 'film id') + + snag = self._parse_json( + self._search_regex( + r'Snag\.page\.data\s*=\s*(\[.+?\]);', webpage, 'snag', default='[]'), + display_id) + + for item in snag: + if item.get('data', {}).get('film', {}).get('id') == film_id: + data = item['data']['film'] + title = data['title'] + description = clean_html(data.get('synopsis')) + thumbnail = data.get('image') + duration = int_or_none(data.get('duration') or data.get('runtime')) + categories = [ + category['title'] for category in data.get('categories', []) + if category.get('title')] + break + else: + title = self._search_regex( + r'itemprop="title">([^<]+)<', webpage, 'title') + description = self._html_search_regex( + r'(?s)
<div itemprop="description" class="film-description synopsis">(.+?)</div>', + webpage, 'description', default=None) or self._og_search_description(webpage) + thumbnail = self._og_search_thumbnail(webpage) + duration = parse_duration(self._search_regex( + r'<span itemprop="duration" class="film-duration strong">([^<]+)<', + webpage, 'duration', fatal=False)) + categories = re.findall(r'<a href="/movies/[^"]+">
    ([^<]+)', webpage) + + return { + '_type': 'url_transparent', + 'url': 'http://%s/embed/player?filmId=%s' % (domain, film_id), + 'id': film_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'categories': categories, + 'ie_key': 'ViewLiftEmbed', + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/viewster.py youtube-dl-2019.05.20/youtube_dl/extractor/viewster.py --- youtube-dl-2016.02.22/youtube_dl/extractor/viewster.py 2016-01-23 10:08:46.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/viewster.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,10 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import ( compat_HTTPError, - compat_urllib_parse, compat_urllib_parse_unquote, ) from ..utils import ( @@ -14,6 +15,7 @@ parse_iso8601, sanitized_Request, HEADRequest, + url_basename, ) @@ -75,16 +77,18 @@ _ACCEPT_HEADER = 'application/json, text/javascript, */*; q=0.01' - def _download_json(self, url, video_id, note='Downloading JSON metadata', fatal=True): + def _download_json(self, url, video_id, note='Downloading JSON metadata', fatal=True, query={}): request = sanitized_Request(url) request.add_header('Accept', self._ACCEPT_HEADER) request.add_header('Auth-token', self._AUTH_TOKEN) - return super(ViewsterIE, self)._download_json(request, video_id, note, fatal=fatal) + return super(ViewsterIE, self)._download_json(request, video_id, note, fatal=fatal, query=query) def _real_extract(self, url): video_id = self._match_id(url) # Get 'api_token' cookie - self._request_webpage(HEADRequest('http://www.viewster.com/'), video_id) + self._request_webpage( + HEADRequest('http://www.viewster.com/'), + video_id, headers=self.geo_verification_headers()) cookies = self._get_cookies('http://www.viewster.com/') self._AUTH_TOKEN = compat_urllib_parse_unquote(cookies['api_token'].value) @@ -114,43 +118,84 @@ return self.playlist_result(entries, video_id, title, description) formats = [] - for media_type in ('application/f4m+xml', 'application/x-mpegURL', 'video/mp4'): - media = self._download_json( - 'https://public-api.viewster.com/movies/%s/video?mediaType=%s' - % (entry_id, compat_urllib_parse.quote(media_type)), - video_id, 'Downloading %s JSON' % media_type, fatal=False) - if not media: - continue - video_url = media.get('Uri') - if not video_url: + for language_set in info.get('LanguageSets', []): + manifest_url = None + m3u8_formats = [] + audio = language_set.get('Audio') or '' + subtitle = language_set.get('Subtitle') or '' + base_format_id = audio + if subtitle: + base_format_id += '-%s' % subtitle + + def concat(suffix, sep='-'): + return (base_format_id + '%s%s' % (sep, suffix)) if base_format_id else suffix + + medias = self._download_json( + 'https://public-api.viewster.com/movies/%s/videos' % entry_id, + video_id, fatal=False, query={ + 'mediaTypes': ['application/f4m+xml', 'application/x-mpegURL', 'video/mp4'], + 'language': audio, + 'subtitle': subtitle, + }) + if not medias: continue - ext = determine_ext(video_url) - if ext == 'f4m': - video_url += '&' if '?' in video_url else '?' 
- video_url += 'hdcore=3.2.0&plugin=flowplayer-3.2.0.1' - formats.extend(self._extract_f4m_formats( - video_url, video_id, f4m_id='hds')) - elif ext == 'm3u8': - m3u8_formats = self._extract_m3u8_formats( - video_url, video_id, 'mp4', m3u8_id='hls', - fatal=False) # m3u8 sometimes fail - if m3u8_formats: - formats.extend(m3u8_formats) - else: - format_id = media.get('Bitrate') - f = { - 'url': video_url, - 'format_id': 'mp4-%s' % format_id, - 'height': int_or_none(media.get('Height')), - 'width': int_or_none(media.get('Width')), - 'preference': 1, - } - if format_id and not f['height']: - f['height'] = int_or_none(self._search_regex( - r'^(\d+)[pP]$', format_id, 'height', default=None)) - formats.append(f) + for media in medias: + video_url = media.get('Uri') + if not video_url: + continue + ext = determine_ext(video_url) + if ext == 'f4m': + manifest_url = video_url + video_url += '&' if '?' in video_url else '?' + video_url += 'hdcore=3.2.0&plugin=flowplayer-3.2.0.1' + formats.extend(self._extract_f4m_formats( + video_url, video_id, f4m_id=concat('hds'))) + elif ext == 'm3u8': + manifest_url = video_url + m3u8_formats = self._extract_m3u8_formats( + video_url, video_id, 'mp4', m3u8_id=concat('hls'), + fatal=False) # m3u8 sometimes fail + if m3u8_formats: + formats.extend(m3u8_formats) + else: + qualities_basename = self._search_regex( + r'/([^/]+)\.csmil/', + manifest_url, 'qualities basename', default=None) + if not qualities_basename: + continue + QUALITIES_RE = r'((,\d+k)+,?)' + qualities = self._search_regex( + QUALITIES_RE, qualities_basename, + 'qualities', default=None) + if not qualities: + continue + qualities = list(map(lambda q: int(q[:-1]), qualities.strip(',').split(','))) + qualities.sort() + http_template = re.sub(QUALITIES_RE, r'%dk', qualities_basename) + http_url_basename = url_basename(video_url) + if m3u8_formats: + self._sort_formats(m3u8_formats) + m3u8_formats = list(filter( + lambda f: f.get('vcodec') != 'none', m3u8_formats)) + if len(qualities) == len(m3u8_formats): + for q, m3u8_format in zip(qualities, m3u8_formats): + f = m3u8_format.copy() + f.update({ + 'url': video_url.replace(http_url_basename, http_template % q), + 'format_id': f['format_id'].replace('hls', 'http'), + 'protocol': 'http', + }) + formats.append(f) + else: + for q in qualities: + formats.append({ + 'url': video_url.replace(http_url_basename, http_template % q), + 'ext': 'mp4', + 'format_id': 'http-%d' % q, + 'tbr': q, + }) - if not formats and not info.get('LanguageSets') and not info.get('VODSettings'): + if not formats and not info.get('VODSettings'): self.raise_geo_restricted() self._sort_formats(formats) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/viidea.py youtube-dl-2019.05.20/youtube_dl/extractor/viidea.py --- youtube-dl-2016.02.22/youtube_dl/extractor/viidea.py 2016-01-31 11:57:01.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/viidea.py 2019-06-07 18:49:07.000000000 +0000 @@ -4,18 +4,20 @@ from .common import InfoExtractor from ..compat import ( - compat_urlparse, + compat_HTTPError, compat_str, + compat_urlparse, ) from ..utils import ( - parse_duration, + ExtractorError, js_to_json, + parse_duration, parse_iso8601, ) class ViideaIE(InfoExtractor): - _VALID_URL = r'''(?x)http://(?:www\.)?(?: + _VALID_URL = r'''(?x)https?://(?:www\.)?(?: videolectures\.net| flexilearn\.viidea\.net| presentations\.ocwconsortium\.org| @@ -40,7 +42,7 @@ 'ext': 'mp4', 'title': 'Automatics, robotics and biocybernetics', 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482', - 
'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', 'timestamp': 1372349289, 'upload_date': '20130627', 'duration': 565, @@ -58,7 +60,7 @@ 'ext': 'flv', 'title': 'NLP at Google', 'description': 'md5:fc7a6d9bf0302d7cc0e53f7ca23747b3', - 'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', 'timestamp': 1284375600, 'upload_date': '20100913', 'duration': 5352, @@ -74,7 +76,7 @@ 'id': '23181', 'title': 'Deep Learning Summer School, Montreal 2015', 'description': 'md5:0533a85e4bd918df52a01f0e1ebe87b7', - 'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', 'timestamp': 1438560000, }, 'playlist_count': 30, @@ -85,7 +87,7 @@ 'id': '9737', 'display_id': 'mlss09uk_bishop_ibi', 'title': 'Introduction To Bayesian Inference', - 'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', 'timestamp': 1251622800, }, 'playlist': [{ @@ -94,7 +96,7 @@ 'display_id': 'mlss09uk_bishop_ibi_part1', 'ext': 'wmv', 'title': 'Introduction To Bayesian Inference (Part 1)', - 'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', 'duration': 4622, 'timestamp': 1251622800, 'upload_date': '20090830', @@ -105,7 +107,7 @@ 'display_id': 'mlss09uk_bishop_ibi_part2', 'ext': 'wmv', 'title': 'Introduction To Bayesian Inference (Part 2)', - 'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', 'duration': 5641, 'timestamp': 1251622800, 'upload_date': '20090830', @@ -128,9 +130,16 @@ base_url = self._proto_relative_url(cfg['livepipe'], 'http:') - lecture_data = self._download_json( - '%s/site/api/lecture/%s?format=json' % (base_url, lecture_id), - lecture_id)['lecture'][0] + try: + lecture_data = self._download_json( + '%s/site/api/lecture/%s?format=json' % (base_url, lecture_id), + lecture_id)['lecture'][0] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + msg = self._parse_json( + e.cause.read().decode('utf-8'), lecture_id) + raise ExtractorError(msg['detail'], expected=True) + raise lecture_info = { 'id': lecture_id, @@ -151,6 +160,7 @@ smil_url = '%s/%s/video/%s/smil.xml' % (base_url, lecture_slug, part_id) smil = self._download_smil(smil_url, lecture_id) info = self._parse_smil(smil, smil_url, lecture_id) + self._sort_formats(info['formats']) info['id'] = lecture_id if not multipart else '%s_part%s' % (lecture_id, part_id) info['display_id'] = lecture_slug if not multipart else '%s_part%s' % (lecture_slug, part_id) if multipart: diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/viki.py youtube-dl-2019.05.20/youtube_dl/extractor/viki.py --- youtube-dl-2016.02.22/youtube_dl/extractor/viki.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/viki.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,11 +1,12 @@ # coding: utf-8 from __future__ import unicode_literals -import json -import time -import hmac import hashlib +import hmac import itertools +import json +import re +import time from .common import InfoExtractor from ..utils import ( @@ -20,12 +21,13 @@ class VikiBaseIE(InfoExtractor): _VALID_URL_BASE = r'https?://(?:www\.)?viki\.(?:com|net|mx|jp|fr)/' _API_QUERY_TEMPLATE = '/v4/%sapp=%s&t=%s&site=www.viki.com' - _API_URL_TEMPLATE = 'http://api.viki.io%s&sig=%s' + _API_URL_TEMPLATE = 'https://api.viki.io%s&sig=%s' - _APP = '65535a' + _APP = '100005a' _APP_VERSION = '2.2.5.1428709186' - _APP_SECRET = '-$iJ}@p7!G@SyU/je1bEyWg}upLu-6V6-Lg9VD(]siH,r.,m-r|ulZ,U4LC/SeR)' + _APP_SECRET = 'MM_d*yP@`&1@]@!AVrXf_o-HVEnoTnm$O-ti4[G~$JDI/Dc-&piU&z&5.;:}95=Iad' 
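The _APP and _APP_SECRET values above feed the signed-API scheme implied by _API_QUERY_TEMPLATE and _API_URL_TEMPLATE: the request path is signed with HMAC-SHA1 under the app secret and the hex digest is appended as sig. Below is a minimal standalone sketch of that flow; the helper name and the example path are illustrative, and the glue normally provided by the extractor's signing helper (not shown in this hunk) is assumed.

import hashlib
import hmac
import time

APP = '100005a'  # _APP above
SECRET = 'MM_d*yP@`&1@]@!AVrXf_o-HVEnoTnm$O-ti4[G~$JDI/Dc-&piU&z&5.;:}95=Iad'  # _APP_SECRET above

def signed_api_url(path):
    # illustrative path: 'videos/44699v.json'; _API_QUERY_TEMPLATE expects the
    # path to already carry a trailing '?' (or '&' when it has a query string)
    query = '/v4/%sapp=%s&t=%d&site=www.viki.com' % (path + '?', APP, int(time.time()))
    # hex SHA1-HMAC of the full query path, keyed with the app secret
    sig = hmac.new(SECRET.encode('ascii'), query.encode('ascii'), hashlib.sha1).hexdigest()
    return 'https://api.viki.io%s&sig=%s' % (query, sig)  # _API_URL_TEMPLATE above

Calling signed_api_url('videos/44699v.json') then yields an https://api.viki.io URL whose sig parameter the server can recompute and verify.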
+ _GEO_BYPASS = False _NETRC_MACHINE = 'viki' _token = None @@ -76,14 +78,17 @@ def _check_errors(self, data): for reason, status in data.get('blocking', {}).items(): if status and reason in self._ERRORS: + message = self._ERRORS[reason] + if reason == 'geo': + self.raise_geo_restricted(msg=message) raise ExtractorError('%s said: %s' % ( - self.IE_NAME, self._ERRORS[reason]), expected=True) + self.IE_NAME, message), expected=True) def _real_initialize(self): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return @@ -94,17 +99,20 @@ login = self._call_api( 'sessions.json', None, - 'Logging in as %s' % username, post_data=login_form) + 'Logging in', post_data=login_form) self._token = login.get('token') if not self._token: self.report_warning('Unable to get session token, login has probably failed') @staticmethod - def dict_selection(dict_obj, preferred_key): + def dict_selection(dict_obj, preferred_key, allow_fallback=True): if preferred_key in dict_obj: return dict_obj.get(preferred_key) + if not allow_fallback: + return + filtered_dict = list(filter(None, [dict_obj.get(k) for k in dict_obj.keys()])) return filtered_dict[0] if filtered_dict else None @@ -153,20 +161,17 @@ 'like_count': int, 'age_limit': 13, }, - 'params': { - # m3u8 download - 'skip_download': True, - } + 'skip': 'Blocked in the US', }, { # episode 'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1', - 'md5': '190f3ef426005ba3a080a63325955bc3', + 'md5': '5fa476a902e902783ac7a4d615cdbc7a', 'info_dict': { 'id': '44699v', 'ext': 'mp4', 'title': 'Boys Over Flowers - Episode 1', - 'description': 'md5:52617e4f729c7d03bfd4bcbbb6e946f2', - 'duration': 4155, + 'description': 'md5:b89cf50038b480b88b5b3c93589a9076', + 'duration': 4204, 'timestamp': 1270496524, 'upload_date': '20100405', 'uploader': 'group8', @@ -176,13 +181,13 @@ }, { # youtube external 'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1', - 'md5': '216d1afdc0c64d1febc1e9f2bd4b864b', + 'md5': '63f8600c1da6f01b7640eee7eca4f1da', 'info_dict': { 'id': '50562v', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'Poor Nastya [COMPLETE] - Episode 1', 'description': '', - 'duration': 607, + 'duration': 606, 'timestamp': 1274949505, 'upload_date': '20101213', 'uploader': 'ad14065n', @@ -217,7 +222,7 @@ self._check_errors(video) - title = self.dict_selection(video.get('titles', {}), 'en') + title = self.dict_selection(video.get('titles', {}), 'en', allow_fallback=False) if not title: title = 'Episode %d' % video.get('number') if video.get('type') == 'episode' else video.get('id') or video_id container_titles = video.get('container', {}).get('titles', {}) @@ -276,13 +281,38 @@ height = int_or_none(self._search_regex( r'^(\d+)[pP]$', format_id, 'height', default=None)) for protocol, format_dict in stream_dict.items(): + # rtmps URLs do not seem to work + if protocol == 'rtmps': + continue + format_url = format_dict['url'] if format_id == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_dict['url'], video_id, 'mp4', 'm3u8_native', - m3u8_id='m3u8-%s' % protocol, fatal=False)) + m3u8_formats = self._extract_m3u8_formats( + format_url, video_id, 'mp4', + entry_protocol='m3u8_native', + m3u8_id='m3u8-%s' % protocol, fatal=False) + # Despite CODECS metadata in m3u8 all video-only formats + # are actually video+audio + for f in m3u8_formats: + if f.get('acodec') == 'none' and f.get('vcodec') != 'none': + f['acodec'] = None + 
formats.extend(m3u8_formats) + elif format_url.startswith('rtmp'): + mobj = re.search( + r'^(?P<url>rtmp://[^/]+/(?P<app>.+?))/(?P<playpath>mp4:.+)$', + format_url) + if not mobj: + continue + formats.append({ + 'format_id': 'rtmp-%s' % format_id, + 'ext': 'flv', + 'url': mobj.group('url'), + 'play_path': mobj.group('playpath'), + 'app': mobj.group('app'), + 'page_url': url, + }) else: formats.append({ - 'url': format_dict['url'], + 'url': format_url, 'format_id': '%s-%s' % (format_id, protocol), 'height': height, }) @@ -302,7 +332,7 @@ 'title': 'Boys Over Flowers', 'description': 'md5:ecd3cff47967fe193cff37c0bec52790', }, - 'playlist_count': 70, + 'playlist_mincount': 71, }, { 'url': 'http://www.viki.com/tv/1354c-poor-nastya-complete', 'info_dict': { @@ -347,7 +377,7 @@ for video in page['response']: video_id = video['id'] entries.append(self.url_result( - 'http://www.viki.com/videos/%s' % video_id, 'Viki')) + 'https://www.viki.com/videos/%s' % video_id, 'Viki')) if not page['pagination']['next']: break diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/vimeo.py youtube-dl-2019.05.20/youtube_dl/extractor/vimeo.py --- youtube-dl-2016.02.22/youtube_dl/extractor/vimeo.py 2016-02-14 22:26:17.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/vimeo.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,6 +1,7 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals +import base64 import json import re import itertools @@ -8,23 +9,28 @@ from .common import InfoExtractor from ..compat import ( compat_HTTPError, + compat_str, compat_urlparse, ) from ..utils import ( determine_ext, - encode_dict, ExtractorError, + js_to_json, InAdvancePagedList, int_or_none, + merge_dicts, + NO_DEFAULT, + parse_filesize, + qualities, RegexNotFoundError, sanitized_Request, smuggle_url, std_headers, - unified_strdate, + try_get, + unified_timestamp, unsmuggle_url, urlencode_postdata, unescapeHTML, - parse_filesize, ) @@ -34,26 +40,55 @@ _LOGIN_URL = 'https://vimeo.com/log_in' def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: if self._LOGIN_REQUIRED: raise ExtractorError('No login info available, needed for using %s.' 
% self.IE_NAME, expected=True) return - self.report_login() - webpage = self._download_webpage(self._LOGIN_URL, None, False) + webpage = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login page') token, vuid = self._extract_xsrft_and_vuid(webpage) - data = urlencode_postdata(encode_dict({ + data = { 'action': 'login', 'email': username, 'password': password, 'service': 'vimeo', 'token': token, - })) - login_request = sanitized_Request(self._LOGIN_URL, data) - login_request.add_header('Content-Type', 'application/x-www-form-urlencoded') - login_request.add_header('Referer', self._LOGIN_URL) + } self._set_vimeo_cookie('vuid', vuid) - self._download_webpage(login_request, None, False, 'Wrong login info') + try: + self._download_webpage( + self._LOGIN_URL, None, 'Logging in', + data=urlencode_postdata(data), headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': self._LOGIN_URL, + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 418: + raise ExtractorError( + 'Unable to log in: bad username or password', + expected=True) + raise ExtractorError('Unable to log in') + + def _verify_video_password(self, url, video_id, webpage): + password = self._downloader.params.get('videopassword') + if password is None: + raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True) + token, vuid = self._extract_xsrft_and_vuid(webpage) + data = urlencode_postdata({ + 'password': password, + 'token': token, + }) + if url.startswith('http://'): + # vimeo only supports https now, but the user can give an http url + url = url.replace('http://', 'https://') + password_request = sanitized_Request(url + '/password', data) + password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') + password_request.add_header('Referer', url) + self._set_vimeo_cookie('vuid', vuid) + return self._download_webpage( + password_request, video_id, + 'Verifying the password', 'Wrong password') def _extract_xsrft_and_vuid(self, webpage): xsrft = self._search_regex( @@ -67,21 +102,170 @@ def _set_vimeo_cookie(self, name, value): self._set_cookie('vimeo.com', name, value) + def _vimeo_sort_formats(self, formats): + # Bitrates are completely broken. Single m3u8 may contain entries in kbps and bps + # at the same time without actual units specified. This leads to wrong sorting. 
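# (e.g. one entry can advertise 2300 meaning kbps while another advertises 2300000 meaning bps, so tbr is consulted only after height, width and fps in the field_preference that follows)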
+ self._sort_formats(formats, field_preference=('preference', 'height', 'width', 'fps', 'tbr', 'format_id')) + + def _parse_config(self, config, video_id): + video_data = config['video'] + video_title = video_data['title'] + live_event = video_data.get('live_event') or {} + is_live = live_event.get('status') == 'started' + + formats = [] + config_files = video_data.get('files') or config['request'].get('files', {}) + for f in config_files.get('progressive', []): + video_url = f.get('url') + if not video_url: + continue + formats.append({ + 'url': video_url, + 'format_id': 'http-%s' % f.get('quality'), + 'width': int_or_none(f.get('width')), + 'height': int_or_none(f.get('height')), + 'fps': int_or_none(f.get('fps')), + 'tbr': int_or_none(f.get('bitrate')), + }) + + # TODO: fix handling of 308 status code returned for live archive manifest requests + for files_type in ('hls', 'dash'): + for cdn_name, cdn_data in config_files.get(files_type, {}).get('cdns', {}).items(): + manifest_url = cdn_data.get('url') + if not manifest_url: + continue + format_id = '%s-%s' % (files_type, cdn_name) + if files_type == 'hls': + formats.extend(self._extract_m3u8_formats( + manifest_url, video_id, 'mp4', + 'm3u8' if is_live else 'm3u8_native', m3u8_id=format_id, + note='Downloading %s m3u8 information' % cdn_name, + fatal=False)) + elif files_type == 'dash': + mpd_pattern = r'/%s/(?:sep/)?video/' % video_id + mpd_manifest_urls = [] + if re.search(mpd_pattern, manifest_url): + for suffix, repl in (('', 'video'), ('_sep', 'sep/video')): + mpd_manifest_urls.append((format_id + suffix, re.sub( + mpd_pattern, '/%s/%s/' % (video_id, repl), manifest_url))) + else: + mpd_manifest_urls = [(format_id, manifest_url)] + for f_id, m_url in mpd_manifest_urls: + if 'json=1' in m_url: + real_m_url = (self._download_json(m_url, video_id, fatal=False) or {}).get('url') + if real_m_url: + m_url = real_m_url + mpd_formats = self._extract_mpd_formats( + m_url.replace('/master.json', '/master.mpd'), video_id, f_id, + 'Downloading %s MPD information' % cdn_name, + fatal=False) + for f in mpd_formats: + if f.get('vcodec') == 'none': + f['preference'] = -50 + elif f.get('acodec') == 'none': + f['preference'] = -40 + formats.extend(mpd_formats) + + live_archive = live_event.get('archive') or {} + live_archive_source_url = live_archive.get('source_url') + if live_archive_source_url and live_archive.get('status') == 'done': + formats.append({ + 'format_id': 'live-archive-source', + 'url': live_archive_source_url, + 'preference': 1, + }) + + subtitles = {} + text_tracks = config['request'].get('text_tracks') + if text_tracks: + for tt in text_tracks: + subtitles[tt['lang']] = [{ + 'ext': 'vtt', + 'url': 'https://vimeo.com' + tt['url'], + }] + + thumbnails = [] + if not is_live: + for key, thumb in video_data.get('thumbs', {}).items(): + thumbnails.append({ + 'id': key, + 'width': int_or_none(key), + 'url': thumb, + }) + thumbnail = video_data.get('thumbnail') + if thumbnail: + thumbnails.append({ + 'url': thumbnail, + }) + + owner = video_data.get('owner') or {} + video_uploader_url = owner.get('url') + + return { + 'title': self._live_title(video_title) if is_live else video_title, + 'uploader': owner.get('name'), + 'uploader_id': video_uploader_url.split('/')[-1] if video_uploader_url else None, + 'uploader_url': video_uploader_url, + 'thumbnails': thumbnails, + 'duration': int_or_none(video_data.get('duration')), + 'formats': formats, + 'subtitles': subtitles, + 'is_live': is_live, + } + + def _extract_original_format(self, url, 
video_id): + download_data = self._download_json( + url, video_id, fatal=False, + query={'action': 'load_download_config'}, + headers={'X-Requested-With': 'XMLHttpRequest'}) + if download_data: + source_file = download_data.get('source_file') + if isinstance(source_file, dict): + download_url = source_file.get('download_url') + if download_url and not source_file.get('is_cold') and not source_file.get('is_defrosting'): + source_name = source_file.get('public_name', 'Original') + if self._is_valid_url(download_url, video_id, '%s video' % source_name): + ext = (try_get( + source_file, lambda x: x['extension'], + compat_str) or determine_ext( + download_url, None) or 'mp4').lower() + return { + 'url': download_url, + 'ext': ext, + 'width': int_or_none(source_file.get('width')), + 'height': int_or_none(source_file.get('height')), + 'filesize': parse_filesize(source_file.get('size')), + 'format_id': source_name, + 'preference': 1, + } + class VimeoIE(VimeoBaseInfoExtractor): """Information extractor for vimeo.com.""" # _VALID_URL matches Vimeo URLs _VALID_URL = r'''(?x) - https?:// - (?:(?:www|(?P<player>player))\.)? - vimeo(?P<pro>pro)?\.com/ - (?!channels/[^/?#]+/?(?:$|[?#])|album/) - (?:.*?/)? - (?:(?:play_redirect_hls|moogaloop\.swf)\?clip_id=)? - (?:videos?/)? - (?P<id>[0-9]+) - /?(?:[?&].*)?(?:[#].*)?$''' + https?:// + (?: + (?: + www| + (?P<player>player) + ) + \. + )? + vimeo(?P<pro>pro)?\.com/ + (?!(?:channels|album)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/) + (?:.*?/)? + (?: + (?: + play_redirect_hls| + moogaloop\.swf)\?clip_id= + )? + (?:videos?/)? + (?P<id>[0-9]+) + (?:/[\da-f]+)? + /?(?:[?&].*)?(?:[#].*)?$ + ''' IE_NAME = 'vimeo' _TESTS = [ { @@ -91,11 +275,14 @@ 'id': '56015672', 'ext': 'mp4', 'title': "youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550", - 'description': 'md5:2d3305bad981a06ff79f027f19865021', + 'description': 'md5:509a9ad5c9bf97c60faee9203aca4479', + 'timestamp': 1355990239, 'upload_date': '20121220', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user7108434', 'uploader_id': 'user7108434', 'uploader': 'Filippo Valsorda', 'duration': 10, + 'license': 'by-sa', }, }, { @@ -105,6 +292,7 @@ 'info_dict': { 'id': '68093876', 'ext': 'mp4', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/openstreetmapus', 'uploader_id': 'openstreetmapus', 'uploader': 'OpenStreetMap US', 'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography', @@ -121,6 +309,7 @@ 'ext': 'mp4', 'title': 'Kathy Sierra: Building the minimum Badass User, Business of Software 2012', 'uploader': 'The BLN & Business of Software', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/theblnbusinessofsoftware', 'uploader_id': 'theblnbusinessofsoftware', 'duration': 3610, 'description': None, @@ -134,11 +323,13 @@ 'id': '68375962', 'ext': 'mp4', 'title': 'youtube-dl password protected test video', + 'timestamp': 1371200155, 'upload_date': '20130614', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user18948128', 'uploader_id': 'user18948128', 'uploader': 'Jaime Marquínez Ferrándiz', 'duration': 10, - 'description': 'This is "youtube-dl password protected test video" by Jaime Marquínez Ferrándiz on Vimeo, the home for high quality videos and the people\u2026', + 'description': 'md5:dca3ea23adb29ee387127bc4ddfce63f', }, 'params': { 'videopassword': 'youtube-dl', @@ -147,18 +338,21 @@ { 'url': 'http://vimeo.com/channels/keypeele/75629013', 'md5': '2f86a05afe9d7abc0b9126d229bbe15d', - 'note': 'Video is freely available via original URL ' - 'and protected with password when accessed via 
http://vimeo.com/75629013', 'info_dict': { 'id': '75629013', 'ext': 'mp4', 'title': 'Key & Peele: Terrorist Interrogation', 'description': 'md5:8678b246399b070816b12313e8b4eb5c', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/atencio', 'uploader_id': 'atencio', 'uploader': 'Peter Atencio', - 'upload_date': '20130927', + 'channel_id': 'keypeele', + 'channel_url': r're:https?://(?:www\.)?vimeo\.com/channels/keypeele', + 'timestamp': 1380339469, + 'upload_date': '20130928', 'duration': 187, }, + 'expected_warnings': ['Unable to download JSON metadata'], }, { 'url': 'http://vimeo.com/76979871', @@ -168,7 +362,9 @@ 'ext': 'mp4', 'title': 'The New Vimeo Player (You Know, For Videos)', 'description': 'md5:2ec900bf97c3f389378a96aee11260ea', + 'timestamp': 1381846109, 'upload_date': '20131015', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/staff', 'uploader_id': 'staff', 'uploader': 'Vimeo Staff', 'duration': 62, @@ -183,6 +379,7 @@ 'ext': 'mp4', 'title': 'Pier Solar OUYA Official Trailer', 'uploader': 'Tulio Gonçalves', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user28849593', 'uploader_id': 'user28849593', }, }, @@ -195,12 +392,72 @@ 'ext': 'mp4', 'title': 'FOX CLASSICS - Forever Classic ID - A Full Minute', 'uploader': 'The DMCI', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/dmci', 'uploader_id': 'dmci', + 'timestamp': 1324343742, 'upload_date': '20111220', 'description': 'md5:ae23671e82d05415868f7ad1aec21147', }, }, { + # only available via https://vimeo.com/channels/tributes/6213729 and + # not via https://vimeo.com/6213729 + 'url': 'https://vimeo.com/channels/tributes/6213729', + 'info_dict': { + 'id': '6213729', + 'ext': 'mp4', + 'title': 'Vimeo Tribute: The Shining', + 'uploader': 'Casey Donahue', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/caseydonahue', + 'uploader_id': 'caseydonahue', + 'channel_url': r're:https?://(?:www\.)?vimeo\.com/channels/tributes', + 'channel_id': 'tributes', + 'timestamp': 1250886430, + 'upload_date': '20090821', + 'description': 'md5:bdbf314014e58713e6e5b66eb252f4a6', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download JSON metadata'], + }, + { + # redirects to ondemand extractor and should be passed through it + # for successful extraction + 'url': 'https://vimeo.com/73445910', + 'info_dict': { + 'id': '73445910', + 'ext': 'mp4', + 'title': 'The Reluctant Revolutionary', + 'uploader': '10Ft Films', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/tenfootfilms', + 'uploader_id': 'tenfootfilms', + }, + 'params': { + 'skip_download': True, + }, + }, + { + 'url': 'http://player.vimeo.com/video/68375962', + 'md5': 'aaf896bdb7ddd6476df50007a0ac0ae7', + 'info_dict': { + 'id': '68375962', + 'ext': 'mp4', + 'title': 'youtube-dl password protected test video', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user18948128', + 'uploader_id': 'user18948128', + 'uploader': 'Jaime Marquínez Ferrándiz', + 'duration': 10, + }, + 'params': { + 'videopassword': 'youtube-dl', + }, + }, + { + 'url': 'http://vimeo.com/moogaloop.swf?clip_id=2539741', + 'only_matching': True, + }, + { 'url': 'https://vimeo.com/109815029', 'note': 'Video not completely processed, "failed" seed status', 'only_matching': True, @@ -210,85 +467,104 @@ 'only_matching': True, }, { + 'url': 'https://vimeo.com/album/2632481/video/79010983', + 'only_matching': True, + }, + { # source file returns 403: Forbidden 'url': 'https://vimeo.com/7809605', 'only_matching': True, }, + { + 'url': 
'https://vimeo.com/160743502/abd0e13fb4', + 'only_matching': True, + } + # https://gettingthingsdone.com/workflowmap/ + # vimeo embed with check-password page protected by Referer header ] @staticmethod - def _extract_vimeo_url(url, webpage): + def _smuggle_referrer(url, referrer_url): + return smuggle_url(url, {'http_headers': {'Referer': referrer_url}}) + + @staticmethod + def _extract_urls(url, webpage): + urls = [] # Look for embedded (iframe) Vimeo player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage) - if mobj: - player_url = unescapeHTML(mobj.group('url')) - surl = smuggle_url(player_url, {'http_headers': {'Referer': url}}) - return surl - # Look for embedded (swf embed) Vimeo player - mobj = re.search( - r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage) - if mobj: - return mobj.group(1) + for mobj in re.finditer( + r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/\d+.*?)\1', + webpage): + urls.append(VimeoIE._smuggle_referrer(unescapeHTML(mobj.group('url')), url)) + PLAIN_EMBED_RE = ( + # Look for embedded (swf embed) Vimeo player + r'<embed[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)\1', + # Look more for non-standard embedded Vimeo player + r'<video[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?vimeo\.com/[0-9]+)\1', + ) + for embed_re in PLAIN_EMBED_RE: + for mobj in re.finditer(embed_re, webpage): + urls.append(mobj.group('url')) + return urls - def _verify_video_password(self, url, video_id, webpage): - password = self._downloader.params.get('videopassword') - if password is None: - raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True) - token, vuid = self._extract_xsrft_and_vuid(webpage) - data = urlencode_postdata(encode_dict({ - 'password': password, - 'token': token, - })) - if url.startswith('http://'): - # vimeo only supports https now, but the user can give an http url - url = url.replace('http://', 'https://') - password_request = sanitized_Request(url + '/password', data) - password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') - password_request.add_header('Referer', url) - self._set_vimeo_cookie('vuid', vuid) - return self._download_webpage( - password_request, video_id, - 'Verifying the password', 'Wrong password') + @staticmethod + def _extract_url(url, webpage): + urls = VimeoIE._extract_urls(url, webpage) + return urls[0] if urls else None - def _verify_player_video_password(self, url, video_id): + def _verify_player_video_password(self, url, video_id, headers): password = self._downloader.params.get('videopassword') if password is None: raise ExtractorError('This video is protected by a password, use the --video-password option') - data = urlencode_postdata(encode_dict({'password': password})) - pass_url = url + '/check-password' - password_request = sanitized_Request(pass_url, data) - password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') - return self._download_json( - password_request, video_id, - 'Verifying the password', - 'Wrong password') + data = urlencode_postdata({ + 'password': base64.b64encode(password.encode()), + }) + headers = merge_dicts(headers, { + 'Content-Type': 'application/x-www-form-urlencoded', + }) + checked = self._download_json( + url + '/check-password', video_id, + 'Verifying the password', data=data, headers=headers) + if checked is False: + raise ExtractorError('Wrong video password', expected=True) + return checked def 
_real_initialize(self): self._login() def _real_extract(self, url): url, data = unsmuggle_url(url, {}) - headers = std_headers + headers = std_headers.copy() if 'http_headers' in data: - headers = headers.copy() headers.update(data['http_headers']) if 'Referer' not in headers: headers['Referer'] = url + channel_id = self._search_regex( + r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None) + # Extract ID from URL mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') orig_url = url - if mobj.group('pro') or mobj.group('player'): + if mobj.group('pro'): + # some videos require portfolio_id to be present in player url + # https://github.com/ytdl-org/youtube-dl/issues/20070 + url = self._extract_url(url, self._download_webpage(url, video_id)) + elif mobj.group('player'): url = 'https://player.vimeo.com/video/' + video_id - else: + elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')): url = 'https://vimeo.com/' + video_id # Retrieve video webpage to extract further information - request = sanitized_Request(url, None, headers) + request = sanitized_Request(url, headers=headers) try: - webpage = self._download_webpage(request, video_id) + webpage, urlh = self._download_webpage_handle(request, video_id) + redirect_url = compat_str(urlh.geturl()) + # Some URLs that redirect to ondemand can't be extracted with + # this extractor right away thus should be passed through + # ondemand extractor (e.g. https://vimeo.com/73445910) + if VimeoOndemandIE.suitable(redirect_url): + return self.url_result(redirect_url, VimeoOndemandIE.ie_key()) except ExtractorError as ee: if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: errmsg = ee.cause.read() @@ -315,6 +591,9 @@ '%s said: %s' % (self.IE_NAME, seed_status['title']), expected=True) + cc_license = None + timestamp = None + # Extract the config JSON try: try: config_url = self._html_search_regex( r' data-config-url="(.+?)"', webpage, 'config URL', default=None) if not config_url: # Sometimes new react-based page is served instead of old one that require # different config URL extraction approach (see - # https://github.com/rg3/youtube-dl/pull/7209) + # https://github.com/ytdl-org/youtube-dl/pull/7209) vimeo_clip_page_config = self._search_regex( r'vimeo\.clip_page_config\s*=\s*({.+?});', webpage, 'vimeo clip page config') - config_url = self._parse_json( - vimeo_clip_page_config, video_id)['player']['config_url'] + page_config = self._parse_json(vimeo_clip_page_config, video_id) + config_url = page_config['player']['config_url'] + cc_license = page_config.get('cc_license') + timestamp = try_get( + page_config, lambda x: x['clip']['uploaded_on'], + compat_str) config_json = self._download_webpage(config_url, video_id) config = json.loads(config_json) except RegexNotFoundError: # For pro videos or player.vimeo.com urls # We try to find out to which variable is assigned the config dic - m_variable_name = re.search('(\w)\.video\.id', webpage) + m_variable_name = re.search(r'(\w)\.video\.id', webpage) if m_variable_name is not None: - config_re = r'%s=({[^}].+?});' % re.escape(m_variable_name.group(1)) + config_re = [r'%s=({[^}].+?});' % re.escape(m_variable_name.group(1))] else: config_re = [r' = {config:({.+?}),assets:', r'(?:[abc])=({.+?});'] + config_re.append(r'\bvar\s+r\s*=\s*({.+?})\s*;') + config_re.append(r'\bconfig\s*=\s*({.+?})\s*;') config = self._search_regex(config_re, webpage, 'info section', flags=re.DOTALL) config = json.loads(config) @@ -350,37 +635,38 @@ if re.search(r'<form[^>]+?id="pw_form"', webpage) is not None: if '_video_password_verified' in data: raise ExtractorError('video 
password verification failed!') - self._verify_video_password(url, video_id, webpage) + self._verify_video_password(redirect_url, video_id, webpage) return self._real_extract( - smuggle_url(url, {'_video_password_verified': 'verified'})) + smuggle_url(redirect_url, {'_video_password_verified': 'verified'})) else: raise ExtractorError('Unable to extract info section', cause=e) else: if config.get('view') == 4: - config = self._verify_player_video_password(url, video_id) + config = self._verify_player_video_password(redirect_url, video_id, headers) + + vod = config.get('video', {}).get('vod', {}) + + def is_rented(): + if '>You rented this title.<' in webpage: + return True + if config.get('user', {}).get('purchased'): + return True + for purchase_option in vod.get('purchase_options', []): + if purchase_option.get('purchased'): + return True + label = purchase_option.get('label_string') + if label and (label.startswith('You rented this') or label.endswith(' remaining')): + return True + return False - if '>You rented this title.<' in webpage: - feature_id = config.get('video', {}).get('vod', {}).get('feature_id') + if is_rented() and vod.get('is_trailer'): + feature_id = vod.get('feature_id') if feature_id and not data.get('force_feature_id', False): return self.url_result(smuggle_url( 'https://player.vimeo.com/player/%s' % feature_id, {'force_feature_id': True}), 'Vimeo') - # Extract title - video_title = config['video']['title'] - - # Extract uploader and uploader_id - video_uploader = config['video']['owner']['name'] - video_uploader_id = config['video']['owner']['url'].split('/')[-1] if config['video']['owner']['url'] else None - - # Extract video thumbnail - video_thumbnail = config['video'].get('thumbnail') - if video_thumbnail is None: - video_thumbs = config['video'].get('thumbs') - if video_thumbs and isinstance(video_thumbs, dict): - _, video_thumbnail = sorted((int(width if width.isdigit() else 0), t_url) for (width, t_url) in video_thumbs.items())[-1] - # Extract video description video_description = self._html_search_regex( @@ -400,14 +686,11 @@ if not video_description and not mobj.group('player'): self._downloader.report_warning('Cannot find video description') - # Extract video duration - video_duration = int_or_none(config['video'].get('duration')) - # Extract upload date - video_upload_date = None - mobj = re.search(r'<time[^>]+datetime="([^"]+)"', webpage) - if mobj is not None: - video_upload_date = unified_strdate(mobj.group(1)) + if not timestamp: + timestamp = self._search_regex( + r'<time[^>]+datetime="([^"]+)"', webpage, + 'timestamp', default=None) try: view_count = int(self._search_regex(r'UserPlays:(\d+)', webpage, 'view count')) @@ -420,73 +703,97 @@ comment_count = None formats = [] - download_request = sanitized_Request('https://vimeo.com/%s?action=load_download_config' % video_id, headers={ - 'X-Requested-With': 'XMLHttpRequest'}) - download_data = self._download_json(download_request, video_id, fatal=False) - if download_data: - source_file = download_data.get('source_file') - if isinstance(source_file, dict): - download_url = source_file.get('download_url') - if download_url and not source_file.get('is_cold') and not source_file.get('is_defrosting'): - source_name = source_file.get('public_name', 'Original') - if self._is_valid_url(download_url, video_id, '%s video' % source_name): - ext = source_file.get('extension', determine_ext(download_url)).lower() - formats.append({ - 'url': download_url, - 'ext': ext, - 'width': int_or_none(source_file.get('width')), - 'height': 
int_or_none(source_file.get('height')), - 'filesize': parse_filesize(source_file.get('size')), - 'format_id': source_name, - 'preference': 1, - }) - config_files = config['video'].get('files') or config['request'].get('files', {}) - for f in config_files.get('progressive', []): - video_url = f.get('url') - if not video_url: - continue - formats.append({ - 'url': video_url, - 'format_id': 'http-%s' % f.get('quality'), - 'width': int_or_none(f.get('width')), - 'height': int_or_none(f.get('height')), - 'fps': int_or_none(f.get('fps')), - 'tbr': int_or_none(f.get('bitrate')), - }) - m3u8_url = config_files.get('hls', {}).get('url') - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - # Bitrates are completely broken. Single m3u8 may contain entries in kbps and bps - # at the same time without actual units specified. This lead to wrong sorting. - self._sort_formats(formats, field_preference=('preference', 'height', 'width', 'fps', 'format_id')) - subtitles = {} - text_tracks = config['request'].get('text_tracks') - if text_tracks: - for tt in text_tracks: - subtitles[tt['lang']] = [{ - 'ext': 'vtt', - 'url': 'https://vimeo.com' + tt['url'], - }] + source_format = self._extract_original_format( + 'https://vimeo.com/' + video_id, video_id) + if source_format: + formats.append(source_format) - return { + info_dict_config = self._parse_config(config, video_id) + formats.extend(info_dict_config['formats']) + self._vimeo_sort_formats(formats) + + json_ld = self._search_json_ld(webpage, video_id, default={}) + + if not cc_license: + cc_license = self._search_regex( + r']+rel=["\']license["\'][^>]+href=(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'license', default=None, group='license') + + channel_url = 'https://vimeo.com/channels/%s' % channel_id if channel_id else None + + info_dict = { 'id': video_id, - 'uploader': video_uploader, - 'uploader_id': video_uploader_id, - 'upload_date': video_upload_date, - 'title': video_title, - 'thumbnail': video_thumbnail, - 'description': video_description, - 'duration': video_duration, 'formats': formats, + 'timestamp': unified_timestamp(timestamp), + 'description': video_description, 'webpage_url': url, 'view_count': view_count, 'like_count': like_count, 'comment_count': comment_count, - 'subtitles': subtitles, + 'license': cc_license, + 'channel_id': channel_id, + 'channel_url': channel_url, } + info_dict = merge_dicts(info_dict, info_dict_config, json_ld) + + return info_dict + + +class VimeoOndemandIE(VimeoBaseInfoExtractor): + IE_NAME = 'vimeo:ondemand' + _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/(?P[^/?#&]+)' + _TESTS = [{ + # ondemand video not available via https://vimeo.com/id + 'url': 'https://vimeo.com/ondemand/20704', + 'md5': 'c424deda8c7f73c1dfb3edd7630e2f35', + 'info_dict': { + 'id': '105442900', + 'ext': 'mp4', + 'title': 'המעבדה - במאי יותם פלדמן', + 'uploader': 'גם סרטים', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/gumfilms', + 'uploader_id': 'gumfilms', + }, + 'params': { + 'format': 'best[protocol=https]', + }, + }, { + # requires Referer to be passed along with og:video:url + 'url': 'https://vimeo.com/ondemand/36938/126682985', + 'info_dict': { + 'id': '126682985', + 'ext': 'mp4', + 'title': 'Rävlock, rätt läte på rätt plats', + 'uploader': 'Lindroth & Norin', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user14430847', + 'uploader_id': 'user14430847', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 
'https://vimeo.com/ondemand/nazmaalik', + 'only_matching': True, + }, { + 'url': 'https://vimeo.com/ondemand/141692381', + 'only_matching': True, + }, { + 'url': 'https://vimeo.com/ondemand/thelastcolony/150274832', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + return self.url_result( + # Some videos require Referer to be passed along with og:video:url + # similarly to generic vimeo embeds (e.g. + # https://vimeo.com/ondemand/36938/126682985). + VimeoIE._smuggle_referrer(self._og_search_video_url(webpage), url), + VimeoIE.ie_key()) + class VimeoChannelIE(VimeoBaseInfoExtractor): IE_NAME = 'vimeo:channel' @@ -523,7 +830,7 @@ token, vuid = self._extract_xsrft_and_vuid(webpage) fields['token'] = token fields['password'] = password - post = urlencode_postdata(encode_dict(fields)) + post = urlencode_postdata(fields) password_path = self._search_regex( r'action="([^"]+)"', login_form, 'password URL') password_url = compat_urlparse.urljoin(page_url, password_path) @@ -547,8 +854,21 @@ webpage = self._login_list_password(page_url, list_id, webpage) yield self._extract_list_title(webpage) - for video_id in re.findall(r'id="clip_(\d+?)"', webpage): - yield self.url_result('https://vimeo.com/%s' % video_id, 'Vimeo') + # Try extracting href first since not all videos are available via + # short https://vimeo.com/id URL (e.g. https://vimeo.com/channels/tributes/6213729) + clips = re.findall( + r'id="clip_(\d+)"[^>]*>\s*]+href="(/(?:[^/]+/)*\1)(?:[^>]+\btitle="([^"]+)")?', webpage) + if clips: + for video_id, video_url, video_title in clips: + yield self.url_result( + compat_urlparse.urljoin(base_url, video_url), + VimeoIE.ie_key(), video_id=video_id, video_title=video_title) + # More relaxed fallback + else: + for video_id in re.findall(r'id=["\']clip_(\d+)', webpage): + yield self.url_result( + 'https://vimeo.com/%s' % video_id, + VimeoIE.ie_key(), video_id=video_id) if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None: break @@ -585,7 +905,7 @@ class VimeoAlbumIE(VimeoChannelIE): IE_NAME = 'vimeo:album' - _VALID_URL = r'https://vimeo\.com/album/(?P\d+)' + _VALID_URL = r'https://vimeo\.com/album/(?P\d+)(?:$|[?#]|/(?!video))' _TITLE_RE = r'
', + [r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>',
+ r'(?s)<div[^>]+id="video_ext_msg"[^>]*>(.+?)</div>
    '], info_page, 'error message', default=None) if error_message: raise ExtractorError(error_message, expected=True) @@ -228,9 +312,14 @@ 'You are trying to log in from an unusual location. You should confirm ownership at vk.com to log in with this IP.', expected=True) + ERROR_COPYRIGHT = 'Video %s has been removed from public access due to rightholder complaint.' + ERRORS = { r'>Видеозапись .*? была изъята из публичного доступа в связи с обращением правообладателя.<': - 'Video %s has been removed from public access due to rightholder complaint.', + ERROR_COPYRIGHT, + + r'>The video .*? was removed from public access by request of the copyright holder.<': + ERROR_COPYRIGHT, r'Please log in or <': 'Video %s is only available for registered users, ' @@ -244,19 +333,29 @@ r'Access denied': 'Access denied to video %s.', + + r'Видеозапись недоступна, так как её автор был заблокирован.': + 'Video %s is no longer available, because its author has been blocked.', + + r'This video is no longer available, because its author has been blocked.': + 'Video %s is no longer available, because its author has been blocked.', + + r'This video is no longer available, because it has been deleted.': + 'Video %s is no longer available, because it has been deleted.', + + r'The video .+? is not available in your region.': + 'Video %s is not available in your region.', } for error_re, error_msg in ERRORS.items(): if re.search(error_re, info_page): raise ExtractorError(error_msg % video_id, expected=True) - youtube_url = self._search_regex( - r']+src="((?:https?:)?//www.youtube.com/embed/[^"]+)"', - info_page, 'youtube iframe', default=None) + youtube_url = YoutubeIE._extract_url(info_page) if youtube_url: - return self.url_result(youtube_url, 'Youtube') + return self.url_result(youtube_url, ie=YoutubeIE.ie_key()) - vimeo_url = VimeoIE._extract_vimeo_url(url, info_page) + vimeo_url = VimeoIE._extract_url(url, info_page) if vimeo_url is not None: return self.url_result(vimeo_url) @@ -271,6 +370,10 @@ m_rutube.group(1).replace('\\', '')) return self.url_result(rutube_url) + dailymotion_urls = DailymotionIE._extract_urls(info_page) + if dailymotion_urls: + return self.url_result(dailymotion_urls[0], DailymotionIE.ie_key()) + m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.+?});', info_page) if m_opts: m_opts_url = re.search(r"url\s*:\s*'((?!/\b)[^']+)", m_opts.group(1)) @@ -280,48 +383,91 @@ opts_url = 'http:' + opts_url return self.url_result(opts_url) - data_json = self._search_regex(r'var\s+vars\s*=\s*({.+?});', info_page, 'vars') - data = json.loads(data_json) - - # Extract upload date - upload_date = None - mobj = re.search(r'id="mv_date(?:_views)?_wrap"[^>]*>([a-zA-Z]+ [0-9]+), ([0-9]+) at', info_page) - if mobj is not None: - mobj.group(1) + ' ' + mobj.group(2) - upload_date = unified_strdate(mobj.group(1) + ' ' + mobj.group(2)) - - view_count = None - views = self._html_search_regex( - r'"mv_views_count_number"[^>]*>(.+?\bviews?)<', - info_page, 'view count', fatal=False) - if views: - view_count = str_to_int(self._search_regex( - r'([\d,.]+)', views, 'view count', fatal=False)) - - formats = [{ - 'format_id': k, - 'url': v, - 'width': int(k[len('url'):]), - } for k, v in data.items() - if k.startswith('url')] + # vars does not look to be served anymore since 24.10.2016 + data = self._parse_json( + self._search_regex( + r'var\s+vars\s*=\s*({.+?});', info_page, 'vars', default='{}'), + video_id, fatal=False) + + # is served instead + if not data: + data = self._parse_json( + self._search_regex( + 
[r'\s*({.+?})\s*', r'\s*({.+})'], + info_page, 'json', default='{}'), + video_id) + if data: + data = data['player']['params'][0] + + if not data: + data = self._parse_json( + self._search_regex( + r'var\s+playerParams\s*=\s*({.+?})\s*;\s*\n', info_page, + 'player params'), + video_id)['params'][0] + + title = unescapeHTML(data['md_title']) + + # 2 = live + # 3 = post live (finished live) + is_live = data.get('live') == 2 + if is_live: + title = self._live_title(title) + + timestamp = unified_timestamp(self._html_search_regex( + r'class=["\']mv_info_date[^>]+>([^<]+)(?:<|from)', info_page, + 'upload date', default=None)) or int_or_none(data.get('date')) + + view_count = str_to_int(self._search_regex( + r'class=["\']mv_views_count[^>]+>\s*([\d,.]+)', + info_page, 'view count', default=None)) + + formats = [] + for format_id, format_url in data.items(): + format_url = url_or_none(format_url) + if not format_url or not format_url.startswith(('http', '//', 'rtmp')): + continue + if (format_id.startswith(('url', 'cache')) + or format_id in ('extra_data', 'live_mp4', 'postlive_mp4')): + height = int_or_none(self._search_regex( + r'^(?:url|cache)(\d+)', format_id, 'height', default=None)) + formats.append({ + 'format_id': format_id, + 'url': format_url, + 'height': height, + }) + elif format_id == 'hls': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', 'm3u8_native', + m3u8_id=format_id, fatal=False, live=is_live)) + elif format_id == 'rtmp': + formats.append({ + 'format_id': format_id, + 'url': format_url, + 'ext': 'flv', + }) self._sort_formats(formats) return { - 'id': compat_str(data['vid']), + 'id': video_id, 'formats': formats, - 'title': unescapeHTML(data['md_title']), + 'title': title, 'thumbnail': data.get('jpg'), 'uploader': data.get('md_author'), + 'uploader_id': str_or_none(data.get('author_id')), 'duration': data.get('duration'), - 'upload_date': upload_date, + 'timestamp': timestamp, 'view_count': view_count, + 'like_count': int_or_none(data.get('liked')), + 'dislike_count': int_or_none(data.get('nolikes')), + 'is_live': is_live, } -class VKUserVideosIE(InfoExtractor): +class VKUserVideosIE(VKBaseIE): IE_NAME = 'vk:uservideos' IE_DESC = "VK - User's Videos" - _VALID_URL = r'https?://vk\.com/videos(?P-?[0-9]+)(?!\?.*\bz=video)(?:[/?#&]|$)' + _VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/videos(?P-?[0-9]+)(?!\?.*\bz=video)(?:[/?#&]|$)' _TEMPLATE_URL = 'https://vk.com/videos' _TESTS = [{ 'url': 'http://vk.com/videos205387401', @@ -336,6 +482,12 @@ }, { 'url': 'http://vk.com/videos-97664626?section=all', 'only_matching': True, + }, { + 'url': 'http://m.vk.com/videos205387401', + 'only_matching': True, + }, { + 'url': 'http://new.vk.com/videos205387401', + 'only_matching': True, }] def _real_extract(self, url): @@ -353,3 +505,131 @@ webpage, 'title', default=page_id)) return self.playlist_result(entries, page_id, title) + + +class VKWallPostIE(VKBaseIE): + IE_NAME = 'vk:wallpost' + _VALID_URL = r'https?://(?:(?:(?:(?:m|new)\.)?vk\.com/(?:[^?]+\?.*\bw=)?wall(?P-?\d+_\d+)))' + _TESTS = [{ + # public page URL, audio playlist + 'url': 'https://vk.com/bs.official?w=wall-23538238_35', + 'info_dict': { + 'id': '23538238_35', + 'title': 'Black Shadow - Wall post 23538238_35', + 'description': 'md5:3f84b9c4f9ef499731cf1ced9998cc0c', + }, + 'playlist': [{ + 'md5': '5ba93864ec5b85f7ce19a9af4af080f6', + 'info_dict': { + 'id': '135220665_111806521', + 'ext': 'mp3', + 'title': 'Black Shadow - Слепое Верование', + 'duration': 370, + 'uploader': 'Black Shadow', + 'artist': 
'Black Shadow', + 'track': 'Слепое Верование', + }, + }, { + 'md5': '4cc7e804579122b17ea95af7834c9233', + 'info_dict': { + 'id': '135220665_111802303', + 'ext': 'mp3', + 'title': 'Black Shadow - Война - Негасимое Бездны Пламя!', + 'duration': 423, + 'uploader': 'Black Shadow', + 'artist': 'Black Shadow', + 'track': 'Война - Негасимое Бездны Пламя!', + }, + 'params': { + 'skip_download': True, + }, + }], + 'params': { + 'usenetrc': True, + }, + 'skip': 'Requires vk account credentials', + }, { + # single YouTube embed, no leading - + 'url': 'https://vk.com/wall85155021_6319', + 'info_dict': { + 'id': '85155021_6319', + 'title': 'Sergey Gorbunov - Wall post 85155021_6319', + }, + 'playlist_count': 1, + 'params': { + 'usenetrc': True, + }, + 'skip': 'Requires vk account credentials', + }, { + # wall page URL + 'url': 'https://vk.com/wall-23538238_35', + 'only_matching': True, + }, { + # mobile wall page URL + 'url': 'https://m.vk.com/wall-23538238_35', + 'only_matching': True, + }] + + def _real_extract(self, url): + post_id = self._match_id(url) + + wall_url = 'https://vk.com/wall%s' % post_id + + post_id = remove_start(post_id, '-') + + webpage = self._download_webpage(wall_url, post_id) + + error = self._html_search_regex( + r'>Error
    \s*]+class=["\']body["\'][^>]*>([^<]+)', + webpage, 'error', default=None) + if error: + raise ExtractorError('VK said: %s' % error, expected=True) + + description = clean_html(get_element_by_class('wall_post_text', webpage)) + uploader = clean_html(get_element_by_class('author', webpage)) + thumbnail = self._og_search_thumbnail(webpage) + + entries = [] + + audio_ids = re.findall(r'data-full-id=["\'](\d+_\d+)', webpage) + if audio_ids: + al_audio = self._download_webpage( + 'https://vk.com/al_audio.php', post_id, + note='Downloading audio info', fatal=False, + data=urlencode_postdata({ + 'act': 'reload_audio', + 'al': '1', + 'ids': ','.join(audio_ids) + })) + if al_audio: + Audio = collections.namedtuple( + 'Audio', ['id', 'user_id', 'url', 'track', 'artist', 'duration']) + audios = self._parse_json( + self._search_regex( + r'(.+?)', al_audio, 'audios', default='[]'), + post_id, fatal=False, transform_source=unescapeHTML) + if isinstance(audios, list): + for audio in audios: + a = Audio._make(audio[:6]) + entries.append({ + 'id': '%s_%s' % (a.user_id, a.id), + 'url': a.url, + 'title': '%s - %s' % (a.artist, a.track) if a.artist and a.track else a.id, + 'thumbnail': thumbnail, + 'duration': a.duration, + 'uploader': uploader, + 'artist': a.artist, + 'track': a.track, + }) + + for video in re.finditer( + r']+href=(["\'])(?P/video(?:-?[\d_]+).*?)\1', webpage): + entries.append(self.url_result( + compat_urlparse.urljoin(url, video.group('url')), VKIE.ie_key())) + + title = 'Wall post %s' % post_id + + return self.playlist_result( + orderedSet(entries), post_id, + '%s - %s' % (uploader, title) if uploader else title, + description) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/vlive.py youtube-dl-2019.05.20/youtube_dl/extractor/vlive.py --- youtube-dl-2016.02.22/youtube_dl/extractor/vlive.py 2016-02-09 11:57:41.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/vlive.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,49 +1,148 @@ # coding: utf-8 from __future__ import unicode_literals +import re +import time +import itertools + from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_urlencode, + compat_str, +) from ..utils import ( dict_get, + ExtractorError, float_or_none, int_or_none, + remove_start, + try_get, + urlencode_postdata, ) -from ..compat import compat_urllib_parse class VLiveIE(InfoExtractor): IE_NAME = 'vlive' _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/video/(?P[0-9]+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.vlive.tv/video/1326', 'md5': 'cc7314812855ce56de70a06a27314983', 'info_dict': { 'id': '1326', 'ext': 'mp4', - 'title': "[V] Girl's Day's Broadcast", + 'title': "[V LIVE] Girl's Day's Broadcast", 'creator': "Girl's Day", 'view_count': int, }, - } + }, { + 'url': 'http://www.vlive.tv/video/16937', + 'info_dict': { + 'id': '16937', + 'ext': 'mp4', + 'title': '[V LIVE] 첸백시 걍방', + 'creator': 'EXO', + 'view_count': int, + 'subtitles': 'mincount:12', + }, + 'params': { + 'skip_download': True, + }, + }] + + @classmethod + def suitable(cls, url): + return False if VLivePlaylistIE.suitable(url) else super(VLiveIE, cls).suitable(url) def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( - 'http://www.vlive.tv/video/%s' % video_id, video_id) + 'https://www.vlive.tv/video/%s' % video_id, video_id) - long_video_id = self._search_regex( - r'vlive\.tv\.video\.ajax\.request\.handler\.init\(\s*"[0-9]+"\s*,\s*"[^"]*"\s*,\s*"([^"]+)"', - webpage, 'long video id') - - key = self._search_regex( - 
r'vlive\.tv\.video\.ajax\.request\.handler\.init\(\s*"[0-9]+"\s*,\s*"[^"]*"\s*,\s*"[^"]+"\s*,\s*"([^"]+)"', - webpage, 'key') + VIDEO_PARAMS_RE = r'\bvlive\.video\.init\(([^)]+)' + VIDEO_PARAMS_FIELD = 'video params' + params = self._parse_json(self._search_regex( + VIDEO_PARAMS_RE, webpage, VIDEO_PARAMS_FIELD, default=''), video_id, + transform_source=lambda s: '[' + s + ']', fatal=False) + + if not params or len(params) < 7: + params = self._search_regex( + VIDEO_PARAMS_RE, webpage, VIDEO_PARAMS_FIELD) + params = [p.strip(r'"') for p in re.split(r'\s*,\s*', params)] + + status, long_video_id, key = params[2], params[5], params[6] + status = remove_start(status, 'PRODUCT_') + + if status in ('LIVE_ON_AIR', 'BIG_EVENT_ON_AIR'): + return self._live(video_id, webpage) + elif status in ('VOD_ON_AIR', 'BIG_EVENT_INTRO'): + if long_video_id and key: + return self._replay(video_id, webpage, long_video_id, key) + else: + status = 'COMING_SOON' + + if status == 'LIVE_END': + raise ExtractorError('Uploading for replay. Please wait...', + expected=True) + elif status == 'COMING_SOON': + raise ExtractorError('Coming soon!', expected=True) + elif status == 'CANCELED': + raise ExtractorError('We are sorry, ' + 'but the live broadcast has been canceled.', + expected=True) + else: + raise ExtractorError('Unknown status %s' % status) + + def _get_common_fields(self, webpage): title = self._og_search_title(webpage) + creator = self._html_search_regex( + r']+class="info_area"[^>]*>\s*]*>([^<]+)', + webpage, 'creator', fatal=False) + thumbnail = self._og_search_thumbnail(webpage) + return { + 'title': title, + 'creator': creator, + 'thumbnail': thumbnail, + } + + def _live(self, video_id, webpage): + init_page = self._download_webpage( + 'https://www.vlive.tv/video/init/view', + video_id, note='Downloading live webpage', + data=urlencode_postdata({'videoSeq': video_id}), + headers={ + 'Referer': 'https://www.vlive.tv/video/%s' % video_id, + 'Content-Type': 'application/x-www-form-urlencoded' + }) + + live_params = self._search_regex( + r'"liveStreamInfo"\s*:\s*(".*"),', + init_page, 'live stream info') + live_params = self._parse_json(live_params, video_id) + live_params = self._parse_json(live_params, video_id) + + formats = [] + for vid in live_params.get('resolutions', []): + formats.extend(self._extract_m3u8_formats( + vid['cdnUrl'], video_id, 'mp4', + m3u8_id=vid.get('name'), + fatal=False, live=True)) + self._sort_formats(formats) + + info = self._get_common_fields(webpage) + info.update({ + 'title': self._live_title(info['title']), + 'id': video_id, + 'formats': formats, + 'is_live': True, + }) + return info + def _replay(self, video_id, webpage, long_video_id, key): playinfo = self._download_json( 'http://global.apis.naver.com/rmcnmv/rmcnmv/vod_play_videoInfo.json?%s' - % compat_urllib_parse.urlencode({ + % compat_urllib_parse_urlencode({ 'videoId': long_video_id, 'key': key, 'ptc': 'http', @@ -62,27 +161,163 @@ } for vid in playinfo.get('videos', {}).get('list', []) if vid.get('source')] self._sort_formats(formats) - thumbnail = self._og_search_thumbnail(webpage) - creator = self._html_search_regex( - r']+class="info_area"[^>]*>\s*]+class="name"[^>]*>([^<]+)', - webpage, 'creator', fatal=False) - view_count = int_or_none(playinfo.get('meta', {}).get('count')) subtitles = {} for caption in playinfo.get('captions', {}).get('list', []): - lang = dict_get(caption, ('language', 'locale', 'country', 'label')) + lang = dict_get(caption, ('locale', 'language', 'country', 'label')) if lang and 
caption.get('source'): subtitles[lang] = [{ 'ext': 'vtt', 'url': caption['source']}] - return { + info = self._get_common_fields(webpage) + info.update({ 'id': video_id, - 'title': title, - 'creator': creator, - 'thumbnail': thumbnail, - 'view_count': view_count, 'formats': formats, + 'view_count': view_count, 'subtitles': subtitles, - } + }) + return info + + +class VLiveChannelIE(InfoExtractor): + IE_NAME = 'vlive:channel' + _VALID_URL = r'https?://channels\.vlive\.tv/(?P[0-9A-Z]+)' + _TEST = { + 'url': 'http://channels.vlive.tv/FCD4B', + 'info_dict': { + 'id': 'FCD4B', + 'title': 'MAMAMOO', + }, + 'playlist_mincount': 110 + } + _APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b' + + def _real_extract(self, url): + channel_code = self._match_id(url) + + webpage = self._download_webpage( + 'http://channels.vlive.tv/%s/video' % channel_code, channel_code) + + app_id = None + + app_js_url = self._search_regex( + r']+src=(["\'])(?Phttp.+?/app\.js.*?)\1', + webpage, 'app js', default=None, group='url') + + if app_js_url: + app_js = self._download_webpage( + app_js_url, channel_code, 'Downloading app JS', fatal=False) + if app_js: + app_id = self._search_regex( + r'Global\.VFAN_APP_ID\s*=\s*[\'"]([^\'"]+)[\'"]', + app_js, 'app id', default=None) + + app_id = app_id or self._APP_ID + + channel_info = self._download_json( + 'http://api.vfan.vlive.tv/vproxy/channelplus/decodeChannelCode', + channel_code, note='Downloading decode channel code', + query={ + 'app_id': app_id, + 'channelCode': channel_code, + '_': int(time.time()) + }) + + channel_seq = channel_info['result']['channelSeq'] + channel_name = None + entries = [] + + for page_num in itertools.count(1): + video_list = self._download_json( + 'http://api.vfan.vlive.tv/vproxy/channelplus/getChannelVideoList', + channel_code, note='Downloading channel list page #%d' % page_num, + query={ + 'app_id': app_id, + 'channelSeq': channel_seq, + # Large values of maxNumOfRows (~300 or above) may cause + # empty responses (see [1]), e.g. this happens for [2] that + # has more than 300 videos. + # 1. https://github.com/ytdl-org/youtube-dl/issues/13830 + # 2. http://channels.vlive.tv/EDBF. 
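# Editor's note: a minimal, stdlib-only sketch of the pagination pattern used
# around this point (endpoint and query keys mirror the code above; the
# helper itself is illustrative, not youtube-dl API). The page size stays at
# 100 because, per the comment above, larger maxNumOfRows values can make the
# endpoint return empty responses.
import itertools
import json
from urllib.parse import urlencode
from urllib.request import urlopen

def fetch_channel_videos(api_url, base_query):
    """Yield video entries page by page until an empty page comes back."""
    for page_no in itertools.count(1):
        query = dict(base_query, maxNumOfRows=100, pageNo=page_no)
        with urlopen(api_url + '?' + urlencode(query)) as resp:
            payload = json.loads(resp.read().decode('utf-8'))
        videos = (payload.get('result') or {}).get('videoList') or []
        if not videos:
            break
        for video in videos:
            yield video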
+ 'maxNumOfRows': 100, + '_': int(time.time()), + 'pageNo': page_num + } + ) + + if not channel_name: + channel_name = try_get( + video_list, + lambda x: x['result']['channelInfo']['channelName'], + compat_str) + + videos = try_get( + video_list, lambda x: x['result']['videoList'], list) + if not videos: + break + + for video in videos: + video_id = video.get('videoSeq') + if not video_id: + continue + video_id = compat_str(video_id) + entries.append( + self.url_result( + 'http://www.vlive.tv/video/%s' % video_id, + ie=VLiveIE.ie_key(), video_id=video_id)) + + return self.playlist_result( + entries, channel_code, channel_name) + + +class VLivePlaylistIE(InfoExtractor): + IE_NAME = 'vlive:playlist' + _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/video/(?P[0-9]+)/playlist/(?P[0-9]+)' + _TEST = { + 'url': 'http://www.vlive.tv/video/22867/playlist/22912', + 'info_dict': { + 'id': '22912', + 'title': 'Valentine Day Message from TWICE' + }, + 'playlist_mincount': 9 + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id, playlist_id = mobj.group('video_id', 'id') + + VIDEO_URL_TEMPLATE = 'http://www.vlive.tv/video/%s' + if self._downloader.params.get('noplaylist'): + self.to_screen( + 'Downloading just video %s because of --no-playlist' % video_id) + return self.url_result( + VIDEO_URL_TEMPLATE % video_id, + ie=VLiveIE.ie_key(), video_id=video_id) + + self.to_screen( + 'Downloading playlist %s - add --no-playlist to just download video' + % playlist_id) + + webpage = self._download_webpage( + 'http://www.vlive.tv/video/%s/playlist/%s' + % (video_id, playlist_id), playlist_id) + + item_ids = self._parse_json( + self._search_regex( + r'playlistVideoSeqs\s*=\s*(\[[^]]+\])', webpage, + 'playlist video seqs'), + playlist_id) + + entries = [ + self.url_result( + VIDEO_URL_TEMPLATE % item_id, ie=VLiveIE.ie_key(), + video_id=compat_str(item_id)) + for item_id in item_ids] + + playlist_name = self._html_search_regex( + r']+class="[^"]*multicam_playlist[^>]*>\s*]+>([^<]+)', + webpage, 'playlist title', fatal=False) + + return self.playlist_result(entries, playlist_id, playlist_name) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/vodlocker.py youtube-dl-2019.05.20/youtube_dl/extractor/vodlocker.py --- youtube-dl-2016.02.22/youtube_dl/extractor/vodlocker.py 2016-01-15 18:42:53.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/vodlocker.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,12 +1,12 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse from ..utils import ( ExtractorError, NO_DEFAULT, sanitized_Request, + urlencode_postdata, ) @@ -20,7 +20,7 @@ 'id': 'e8wvyzz4sl42', 'ext': 'mp4', 'title': 'Germany vs Brazil', - 'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', }, }] @@ -31,14 +31,15 @@ if any(p in webpage for p in ( '>THIS FILE WAS DELETED<', '>File Not Found<', - 'The file you were looking for could not be found, sorry for any inconvenience.<')): + 'The file you were looking for could not be found, sorry for any inconvenience.<', + '>The file was removed')): raise ExtractorError('Video %s does not exist' % video_id, expected=True) fields = self._hidden_inputs(webpage) if fields['op'] == 'download1': self._sleep(3, video_id) # they do detect when requests happen too fast! 
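# Editor's note: sketch of the "collect the hidden form inputs, wait, then
# re-POST them" flow implemented above (stdlib only; the URL and field names
# are placeholders, the 3-second delay matches the code). The sleep matters:
# per the comment above, submissions that arrive too fast get rejected.
import time
from urllib.parse import urlencode
from urllib.request import Request, urlopen

def resubmit_form(page_url, hidden_fields, delay=3):
    time.sleep(delay)  # the site detects requests that come back too quickly
    post_data = urlencode(hidden_fields).encode('utf-8')
    req = Request(page_url, data=post_data, headers={
        'Content-Type': 'application/x-www-form-urlencoded',
    })
    with urlopen(req) as resp:
        return resp.read().decode('utf-8')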
- post = compat_urllib_parse.urlencode(fields) + post = urlencode_postdata(fields) req = sanitized_Request(url, post) req.add_header('Content-type', 'application/x-www-form-urlencoded') webpage = self._download_webpage( diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/vodplatform.py youtube-dl-2019.05.20/youtube_dl/extractor/vodplatform.py --- youtube-dl-2016.02.22/youtube_dl/extractor/vodplatform.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/vodplatform.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,37 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import unescapeHTML + + +class VODPlatformIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vod-platform\.net/[eE]mbed/(?P[^/?#]+)' + _TEST = { + # from http://www.lbcgroup.tv/watch/chapter/29143/52844/%D8%A7%D9%84%D9%86%D8%B5%D8%B1%D8%A9-%D9%81%D9%8A-%D8%B6%D9%8A%D8%A7%D9%81%D8%A9-%D8%A7%D9%84%D9%80-cnn/ar + 'url': 'http://vod-platform.net/embed/RufMcytHDolTH1MuKHY9Fw', + 'md5': '1db2b7249ce383d6be96499006e951fc', + 'info_dict': { + 'id': 'RufMcytHDolTH1MuKHY9Fw', + 'ext': 'mp4', + 'title': 'LBCi News_ النصرة في ضيافة الـ "سي.أن.أن"', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = unescapeHTML(self._og_search_title(webpage)) + hidden_inputs = self._hidden_inputs(webpage) + + formats = self._extract_wowza_formats( + hidden_inputs.get('HiddenmyhHlsLink') or hidden_inputs['HiddenmyDashLink'], video_id, skip_protocols=['f4m', 'smil']) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': hidden_inputs.get('HiddenThumbnail') or self._og_search_thumbnail(webpage), + 'formats': formats, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/vodpl.py youtube-dl-2019.05.20/youtube_dl/extractor/vodpl.py --- youtube-dl-2016.02.22/youtube_dl/extractor/vodpl.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/vodpl.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,32 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .onet import OnetBaseIE + + +class VODPlIE(OnetBaseIE): + _VALID_URL = r'https?://vod\.pl/(?:[^/]+/)+(?P[0-9a-zA-Z]+)' + + _TESTS = [{ + 'url': 'https://vod.pl/filmy/chlopaki-nie-placza/3ep3jns', + 'md5': 'a7dc3b2f7faa2421aefb0ecaabf7ec74', + 'info_dict': { + 'id': '3ep3jns', + 'ext': 'mp4', + 'title': 'Chłopaki nie płaczą', + 'description': 'md5:f5f03b84712e55f5ac9f0a3f94445224', + 'timestamp': 1463415154, + 'duration': 5765, + 'upload_date': '20160516', + }, + }, { + 'url': 'https://vod.pl/seriale/belfer-na-planie-praca-kamery-online/2c10heh', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + info_dict = self._extract_from_id(self._search_mvp_id(webpage), webpage) + info_dict['id'] = video_id + return info_dict diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/voicerepublic.py youtube-dl-2019.05.20/youtube_dl/extractor/voicerepublic.py --- youtube-dl-2016.02.22/youtube_dl/extractor/voicerepublic.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/voicerepublic.py 2019-06-07 18:49:07.000000000 +0000 @@ -3,7 +3,10 @@ import re from .common import InfoExtractor -from ..compat import compat_urlparse +from ..compat import ( + compat_str, + compat_urlparse, +) from ..utils import ( ExtractorError, determine_ext, @@ 
-16,14 +19,14 @@ _VALID_URL = r'https?://voicerepublic\.com/(?:talks|embed)/(?P[0-9a-z-]+)' _TESTS = [{ 'url': 'http://voicerepublic.com/talks/watching-the-watchers-building-a-sousveillance-state', - 'md5': '0554a24d1657915aa8e8f84e15dc9353', + 'md5': 'b9174d651323f17783000876347116e3', 'info_dict': { 'id': '2296', 'display_id': 'watching-the-watchers-building-a-sousveillance-state', 'ext': 'm4a', 'title': 'Watching the Watchers: Building a Sousveillance State', - 'description': 'md5:715ba964958afa2398df615809cfecb1', - 'thumbnail': 're:^https?://.*\.(?:png|jpg)$', + 'description': 'Secret surveillance programs have metadata too. The people and companies that operate secret surveillance programs can be surveilled.', + 'thumbnail': r're:^https?://.*\.(?:png|jpg)$', 'duration': 1800, 'view_count': int, } @@ -52,7 +55,7 @@ if data: title = data['title'] description = data.get('teaser') - talk_id = data.get('talk_id') or display_id + talk_id = compat_str(data.get('talk_id') or display_id) talk = data['talk'] duration = int_or_none(talk.get('duration')) formats = [{ diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/voot.py youtube-dl-2019.05.20/youtube_dl/extractor/voot.py --- youtube-dl-2016.02.22/youtube_dl/extractor/voot.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/voot.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,100 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + try_get, + unified_timestamp, +) + + +class VootIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?voot\.com/(?:[^/]+/)+(?P\d+)' + _GEO_COUNTRIES = ['IN'] + _TESTS = [{ + 'url': 'https://www.voot.com/shows/ishq-ka-rang-safed/1/360558/is-this-the-end-of-kamini-/441353', + 'info_dict': { + 'id': '0_8ledb18o', + 'ext': 'mp4', + 'title': 'Ishq Ka Rang Safed - Season 01 - Episode 340', + 'description': 'md5:06291fbbbc4dcbe21235c40c262507c1', + 'timestamp': 1472162937, + 'upload_date': '20160825', + 'duration': 1146, + 'series': 'Ishq Ka Rang Safed', + 'season_number': 1, + 'episode': 'Is this the end of Kamini?', + 'episode_number': 340, + 'view_count': int, + 'like_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Failed to download m3u8 information'], + }, { + 'url': 'https://www.voot.com/kids/characters/mighty-cat-masked-niyander-e-/400478/school-bag-disappears/440925', + 'only_matching': True, + }, { + 'url': 'https://www.voot.com/movies/pandavas-5/424627', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + media_info = self._download_json( + 'https://wapi.voot.com/ws/ott/getMediaInfo.json', video_id, + query={ + 'platform': 'Web', + 'pId': 2, + 'mediaId': video_id, + }) + + status_code = try_get(media_info, lambda x: x['status']['code'], int) + if status_code != 0: + raise ExtractorError(media_info['status']['message'], expected=True) + + media = media_info['assets'] + + entry_id = media['EntryId'] + title = media['MediaName'] + formats = self._extract_m3u8_formats( + 'https://cdnapisec.kaltura.com/p/1982551/playManifest/pt/https/f/applehttp/t/web/e/' + entry_id, + video_id, 'mp4', m3u8_id='hls') + self._sort_formats(formats) + + description, series, season_number, episode, episode_number = [None] * 5 + + for meta in try_get(media, lambda x: x['Metas'], list) or []: + key, value = meta.get('Key'), meta.get('Value') + if not key or not value: + continue + if key == 'ContentSynopsis': + 
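# Editor's note: the Metas array walked here is a list of
# {'Key': ..., 'Value': ...} pairs; folding it into a plain dict first turns
# the field mapping into simple lookups. A hedged alternative sketch (the
# helper name is illustrative, not part of youtube-dl):
def metas_to_dict(metas):
    return dict(
        (m.get('Key'), m.get('Value'))
        for m in (metas or [])
        if m.get('Key') and m.get('Value'))

# meta = metas_to_dict(media.get('Metas'))
# series, episode = meta.get('RefSeriesTitle'), meta.get('EpisodeMainTitle')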
description = value + elif key == 'RefSeriesTitle': + series = value + elif key == 'RefSeriesSeason': + season_number = int_or_none(value) + elif key == 'EpisodeMainTitle': + episode = value + elif key == 'EpisodeNo': + episode_number = int_or_none(value) + + return { + 'extractor_key': 'Kaltura', + 'id': entry_id, + 'title': title, + 'description': description, + 'series': series, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, + 'timestamp': unified_timestamp(media.get('CreationDate')), + 'duration': int_or_none(media.get('Duration')), + 'view_count': int_or_none(media.get('ViewCounter')), + 'like_count': int_or_none(media.get('like_counter')), + 'formats': formats, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/voxmedia.py youtube-dl-2019.05.20/youtube_dl/extractor/voxmedia.py --- youtube-dl-2016.02.22/youtube_dl/extractor/voxmedia.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/voxmedia.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,182 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .once import OnceIE +from ..compat import compat_urllib_parse_unquote +from ..utils import ExtractorError + + +class VoxMediaVolumeIE(OnceIE): + _VALID_URL = r'https?://volume\.vox-cdn\.com/embed/(?P[0-9a-f]{9})' + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + video_data = self._parse_json(self._search_regex( + r'Volume\.createVideo\(({.+})\s*,\s*{.*}\s*,\s*\[.*\]\s*,\s*{.*}\);', webpage, 'video data'), video_id) + for provider_video_type in ('ooyala', 'youtube', 'brightcove'): + provider_video_id = video_data.get('%s_id' % provider_video_type) + if not provider_video_id: + continue + info = { + 'id': video_id, + 'title': video_data.get('title_short'), + 'description': video_data.get('description_long') or video_data.get('description_short'), + 'thumbnail': video_data.get('brightcove_thumbnail') + } + if provider_video_type == 'brightcove': + info['formats'] = self._extract_once_formats(provider_video_id) + self._sort_formats(info['formats']) + else: + info.update({ + '_type': 'url_transparent', + 'url': provider_video_id if provider_video_type == 'youtube' else '%s:%s' % (provider_video_type, provider_video_id), + 'ie_key': provider_video_type.capitalize(), + }) + return info + raise ExtractorError('Unable to find provider video id') + + +class VoxMediaIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?:(?:theverge|vox|sbnation|eater|polygon|curbed|racked)\.com|recode\.net)/(?:[^/]+/)*(?P[^/?]+)' + _TESTS = [{ + 'url': 'http://www.theverge.com/2014/6/27/5849272/material-world-how-google-discovered-what-software-is-made-of', + 'info_dict': { + 'id': '11eXZobjrG8DCSTgrNjVinU-YmmdYjhe', + 'ext': 'mp4', + 'title': 'Google\'s new material design direction', + 'description': 'md5:2f44f74c4d14a1f800ea73e1c6832ad2', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': ['Ooyala'], + }, { + # data-ooyala-id + 'url': 'http://www.theverge.com/2014/10/21/7025853/google-nexus-6-hands-on-photos-video-android-phablet', + 'md5': 'd744484ff127884cd2ba09e3fa604e4b', + 'info_dict': { + 'id': 'RkZXU4cTphOCPDMZg5oEounJyoFI0g-B', + 'ext': 'mp4', + 'title': 'The Nexus 6: hands-on with Google\'s phablet', + 'description': 'md5:87a51fe95ff8cea8b5bdb9ac7ae6a6af', + }, + 'add_ie': ['Ooyala'], + 'skip': 'Video Not Found', + }, { + # volume embed + 'url': 
'http://www.vox.com/2016/3/31/11336640/mississippi-lgbt-religious-freedom-bill', + 'info_dict': { + 'id': 'wydzk3dDpmRz7PQoXRsTIX6XTkPjYL0b', + 'ext': 'mp4', + 'title': 'The new frontier of LGBTQ civil rights, explained', + 'description': 'md5:0dc58e94a465cbe91d02950f770eb93f', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': ['Ooyala'], + }, { + # youtube embed + 'url': 'http://www.vox.com/2016/3/24/11291692/robot-dance', + 'md5': '83b3080489fb103941e549352d3e0977', + 'info_dict': { + 'id': 'FcNHTJU1ufM', + 'ext': 'mp4', + 'title': 'How "the robot" became the greatest novelty dance of all time', + 'description': 'md5:b081c0d588b8b2085870cda55e6da176', + 'upload_date': '20160324', + 'uploader_id': 'voxdotcom', + 'uploader': 'Vox', + }, + 'add_ie': ['Youtube'], + }, { + # SBN.VideoLinkset.entryGroup multiple ooyala embeds + 'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok', + 'info_dict': { + 'id': 'national-signing-day-rationalizations-itll-be-ok-itll-be-ok', + 'title': '25 lies you will tell yourself on National Signing Day', + 'description': 'It\'s the most self-delusional time of the year, and everyone\'s gonna tell the same lies together!', + }, + 'playlist': [{ + 'md5': '721fededf2ab74ae4176c8c8cbfe092e', + 'info_dict': { + 'id': 'p3cThlMjE61VDi_SD9JlIteSNPWVDBB9', + 'ext': 'mp4', + 'title': 'Buddy Hield vs Steph Curry (and the world)', + 'description': 'Let’s dissect only the most important Final Four storylines.', + }, + }, { + 'md5': 'bf0c5cc115636af028be1bab79217ea9', + 'info_dict': { + 'id': 'BmbmVjMjE6esPHxdALGubTrouQ0jYLHj', + 'ext': 'mp4', + 'title': 'Chasing Cinderella 2016: Syracuse basketball', + 'description': 'md5:e02d56b026d51aa32c010676765a690d', + }, + }], + }, { + # volume embed, Brightcove Once + 'url': 'https://www.recode.net/2014/6/17/11628066/post-post-pc-ceo-the-full-code-conference-video-of-microsofts-satya', + 'md5': '01571a896281f77dc06e084138987ea2', + 'info_dict': { + 'id': '1231c973d', + 'ext': 'mp4', + 'title': 'Post-Post-PC CEO: The Full Code Conference Video of Microsoft\'s Satya Nadella', + 'description': 'The longtime veteran was chosen earlier this year as the software giant\'s third leader in its history.', + }, + 'add_ie': ['VoxMediaVolume'], + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = compat_urllib_parse_unquote(self._download_webpage(url, display_id)) + + def create_entry(provider_video_id, provider_video_type, title=None, description=None): + video_url = { + 'youtube': '%s', + 'ooyala': 'ooyala:%s', + 'volume': 'http://volume.vox-cdn.com/embed/%s', + }[provider_video_type] % provider_video_id + return { + '_type': 'url_transparent', + 'url': video_url, + 'title': title or self._og_search_title(webpage), + 'description': description or self._og_search_description(webpage), + } + + entries = [] + entries_data = self._search_regex([ + r'Chorus\.VideoContext\.addVideo\((\[{.+}\])\);', + r'var\s+entry\s*=\s*({.+});', + r'SBN\.VideoLinkset\.entryGroup\(\s*(\[.+\])', + ], webpage, 'video data', default=None) + if entries_data: + entries_data = self._parse_json(entries_data, display_id) + if isinstance(entries_data, dict): + entries_data = [entries_data] + for video_data in entries_data: + provider_video_id = video_data.get('provider_video_id') + provider_video_type = video_data.get('provider_video_type') + if provider_video_id and provider_video_type: + entries.append(create_entry( + 
provider_video_id, provider_video_type, + video_data.get('title'), video_data.get('description'))) + + provider_video_id = self._search_regex( + r'data-ooyala-id="([^"]+)"', webpage, 'ooyala id', default=None) + if provider_video_id: + entries.append(create_entry(provider_video_id, 'ooyala')) + + volume_uuid = self._search_regex( + r'data-volume-uuid="([^"]+)"', webpage, 'volume uuid', default=None) + if volume_uuid: + entries.append(create_entry(volume_uuid, 'volume')) + + if len(entries) == 1: + return entries[0] + else: + return self.playlist_result(entries, display_id, self._og_search_title(webpage), self._og_search_description(webpage)) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/vporn.py youtube-dl-2019.05.20/youtube_dl/extractor/vporn.py --- youtube-dl-2016.02.22/youtube_dl/extractor/vporn.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/vporn.py 2019-06-07 18:47:18.000000000 +0000 @@ -4,8 +4,10 @@ from .common import InfoExtractor from ..utils import ( + ExtractorError, parse_duration, str_to_int, + urljoin, ) @@ -21,13 +23,14 @@ 'ext': 'mp4', 'title': 'Violet on her 19th birthday', 'description': 'Violet dances in front of the camera which is sure to get you horny.', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'kileyGrope', 'categories': ['Masturbation', 'Teen'], 'duration': 393, 'age_limit': 18, 'view_count': int, - } + }, + 'skip': 'video removed', }, { 'url': 'http://www.vporn.com/female/hana-shower/523564/', @@ -38,9 +41,9 @@ 'ext': 'mp4', 'title': 'Hana Shower', 'description': 'Hana showers at the bathroom.', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'Hmmmmm', - 'categories': ['Big Boobs', 'Erotic', 'Teen', 'Female'], + 'categories': ['Big Boobs', 'Erotic', 'Teen', 'Female', '720p'], 'duration': 588, 'age_limit': 18, 'view_count': int, @@ -55,15 +58,18 @@ webpage = self._download_webpage(url, display_id) + errmsg = 'This video has been deleted due to Copyright Infringement or by the account owner!' 
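# Editor's note: the surrounding vporn.py change shows a common extractor
# pattern: probe the page for a known "removed" marker and raise a clean,
# expected error before attempting any real parsing. A generic sketch
# (marker list and exception type are illustrative):
class VideoGoneError(Exception):
    pass

def check_removed(webpage, video_id, markers):
    for marker in markers:
        if marker in webpage:
            raise VideoGoneError('Video %s is unavailable: %s' % (video_id, marker))

# check_removed(html, '533862', ['This video has been deleted'])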
+ if errmsg in webpage: + raise ExtractorError('%s said: %s' % (self.IE_NAME, errmsg), expected=True) + title = self._html_search_regex( r'videoname\s*=\s*\'([^\']+)\'', webpage, 'title').strip() description = self._html_search_regex( r'class="(?:descr|description_txt)">(.*?)', webpage, 'description', fatal=False) - thumbnail = self._html_search_regex( - r'flashvars\.imageUrl\s*=\s*"([^"]+)"', webpage, 'description', fatal=False, default=None) - if thumbnail: - thumbnail = 'http://www.vporn.com' + thumbnail + thumbnail = urljoin('http://www.vporn.com', self._html_search_regex( + r'flashvars\.imageUrl\s*=\s*"([^"]+)"', webpage, 'description', + default=None)) uploader = self._html_search_regex( r'(?s)Uploaded by:.*?]*>(.+?)', diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/vrak.py youtube-dl-2019.05.20/youtube_dl/extractor/vrak.py --- youtube-dl-2016.02.22/youtube_dl/extractor/vrak.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/vrak.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,80 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .brightcove import BrightcoveNewIE +from ..utils import ( + int_or_none, + parse_age_limit, + smuggle_url, + unescapeHTML, +) + + +class VrakIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vrak\.tv/videos\?.*?\btarget=(?P[\d.]+)' + _TEST = { + 'url': 'http://www.vrak.tv/videos?target=1.2306782&filtre=emission&id=1.1806721', + 'info_dict': { + 'id': '5345661243001', + 'ext': 'mp4', + 'title': 'Obésité, film de hockey et Roseline Filion', + 'timestamp': 1488492126, + 'upload_date': '20170302', + 'uploader_id': '2890187628001', + 'creator': 'VRAK.TV', + 'age_limit': 8, + 'series': 'ALT (Actualité Légèrement Tordue)', + 'episode': 'Obésité, film de hockey et Roseline Filion', + 'tags': list, + }, + 'params': { + 'skip_download': True, + }, + } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/2890187628001/default_default/index.html?videoId=%s' + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex( + r']+\bclass=["\']videoTitle["\'][^>]*>([^<]+)', + webpage, 'title', default=None) or self._og_search_title(webpage) + + content = self._parse_json( + self._search_regex( + r'data-player-options-content=(["\'])(?P{.+?})\1', + webpage, 'content', default='{}', group='content'), + video_id, transform_source=unescapeHTML) + + ref_id = content.get('refId') or self._search_regex( + r'refId":"([^&]+)"', webpage, 'ref id') + + brightcove_id = self._search_regex( + r'''(?x) + java\.lang\.String\s+value\s*=\s*["']brightcove\.article\.\d+\.%s + [^>]* + java\.lang\.String\s+value\s*=\s*["'](\d+) + ''' % re.escape(ref_id), webpage, 'brightcove id') + + return { + '_type': 'url_transparent', + 'ie_key': BrightcoveNewIE.ie_key(), + 'url': smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, + {'geo_countries': ['CA']}), + 'id': brightcove_id, + 'description': content.get('description'), + 'creator': content.get('brand'), + 'age_limit': parse_age_limit(content.get('rating')), + 'series': content.get('showName') or content.get( + 'episodeName'), # this is intentional + 'season_number': int_or_none(content.get('seasonNumber')), + 'episode': title, + 'episode_number': int_or_none(content.get('episodeNumber')), + 'tags': content.get('tags', []), + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/vrt.py youtube-dl-2019.05.20/youtube_dl/extractor/vrt.py 
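# Editor's note: VrakIE above resolves the Brightcove id in two steps: a
# site-specific refId is read from the page, then re.escape()d into a second
# regex that locates the numeric Brightcove id stored next to it. A compact,
# simplified sketch of the same technique (the 200-character window and the
# attribute layout are assumptions for illustration):
import re

def find_brightcove_id(webpage):
    ref_id = re.search(r'refId":"([^&"]+)"', webpage).group(1)
    mobj = re.search(
        r'brightcove\.article\.\d+\.%s.{0,200}?value\s*=\s*["\'](\d+)'
        % re.escape(ref_id), webpage, re.DOTALL)
    return mobj.group(1) if mobj else None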
--- youtube-dl-2016.02.22/youtube_dl/extractor/vrt.py 2016-02-20 13:02:05.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/vrt.py 2019-06-07 18:49:07.000000000 +0000 @@ -4,101 +4,84 @@ import re from .common import InfoExtractor -from ..utils import float_or_none +from ..utils import ( + extract_attributes, + float_or_none, + get_element_by_class, + strip_or_none, + unified_timestamp, +) class VRTIE(InfoExtractor): - _VALID_URL = r'https?://(?:deredactie|sporza|cobra(?:\.canvas)?)\.be/cm/(?:[^/]+/)+(?P[^/]+)/*' - _TESTS = [ - # deredactie.be - { - 'url': 'http://deredactie.be/cm/vrtnieuws/videozone/programmas/journaal/EP_141025_JOL', - 'md5': '4cebde1eb60a53782d4f3992cbd46ec8', - 'info_dict': { - 'id': '2129880', - 'ext': 'flv', - 'title': 'Het journaal L - 25/10/14', - 'description': None, - 'timestamp': 1414271750.949, - 'upload_date': '20141025', - 'duration': 929, - } + IE_DESC = 'VRT NWS, Flanders News, Flandern Info and Sporza' + _VALID_URL = r'https?://(?:www\.)?(?Pvrt\.be/vrtnws|sporza\.be)/[a-z]{2}/\d{4}/\d{2}/\d{2}/(?P[^/?&#]+)' + _TESTS = [{ + 'url': 'https://www.vrt.be/vrtnws/nl/2019/05/15/beelden-van-binnenkant-notre-dame-een-maand-na-de-brand/', + 'md5': 'e1663accf5cf13f375f3cd0d10476669', + 'info_dict': { + 'id': 'pbs-pub-7855fc7b-1448-49bc-b073-316cb60caa71$vid-2ca50305-c38a-4762-9890-65cbd098b7bd', + 'ext': 'mp4', + 'title': 'Beelden van binnenkant Notre-Dame, één maand na de brand', + 'description': 'Op maandagavond 15 april ging een deel van het dakgebinte van de Parijse kathedraal in vlammen op.', + 'timestamp': 1557924660, + 'upload_date': '20190515', + 'duration': 31.2, }, - # sporza.be - { - 'url': 'http://sporza.be/cm/sporza/videozone/programmas/extratime/EP_141020_Extra_time', - 'md5': '11f53088da9bf8e7cfc42456697953ff', - 'info_dict': { - 'id': '2124639', - 'ext': 'flv', - 'title': 'Bekijk Extra Time van 20 oktober', - 'description': 'md5:83ac5415a4f1816c6a93f8138aef2426', - 'timestamp': 1413835980.560, - 'upload_date': '20141020', - 'duration': 3238, - } + }, { + 'url': 'https://sporza.be/nl/2019/05/15/de-belgian-cats-zijn-klaar-voor-het-ek/', + 'md5': '910bba927566e9ab992278f647eb4b75', + 'info_dict': { + 'id': 'pbs-pub-f2c86a46-8138-413a-a4b9-a0015a16ce2c$vid-1f112b31-e58e-4379-908d-aca6d80f8818', + 'ext': 'mp4', + 'title': 'De Belgian Cats zijn klaar voor het EK mét Ann Wauters', + 'timestamp': 1557923760, + 'upload_date': '20190515', + 'duration': 115.17, }, - # cobra.be - { - 'url': 'http://cobra.be/cm/cobra/videozone/rubriek/film-videozone/141022-mv-ellis-cafecorsari', - 'md5': '78a2b060a5083c4f055449a72477409d', - 'info_dict': { - 'id': '2126050', - 'ext': 'flv', - 'title': 'Bret Easton Ellis in Café Corsari', - 'description': 'md5:f699986e823f32fd6036c1855a724ee9', - 'timestamp': 1413967500.494, - 'upload_date': '20141022', - 'duration': 661, - } - }, - { - 'url': 'http://cobra.canvas.be/cm/cobra/videozone/rubriek/film-videozone/1.2377055', - 'only_matching': True, - } - ] + }, { + 'url': 'https://www.vrt.be/vrtnws/en/2019/05/15/belgium_s-eurovision-entry-falls-at-the-first-hurdle/', + 'only_matching': True, + }, { + 'url': 'https://www.vrt.be/vrtnws/de/2019/05/15/aus-fuer-eliott-im-halbfinale-des-eurosongfestivals/', + 'only_matching': True, + }] + _CLIENT_MAP = { + 'vrt.be/vrtnws': 'vrtnieuws', + 'sporza.be': 'sporza', + } def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - video_id = self._search_regex( - r'data-video-id="([^"]+)_[^"]+"', webpage, 'video id', fatal=False) 
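# Editor's note: the replacement code below drops this screen-scraping in
# favour of reading the attributes of the page's class="vrtvideo" element
# and handing off to the Canvas extractor. Sketch of the asset-id
# composition it performs (the attribute parser is a crude stand-in for
# youtube-dl's extract_attributes()):
import re

def vrt_asset_id(tag_html):
    attrs = dict(re.findall(r'([\w-]+)="([^"]*)"', tag_html))
    asset_id = attrs['data-videoid']
    publication_id = attrs.get('data-publicationid')
    if publication_id:
        asset_id = publication_id + '$' + asset_id
    return asset_id

# vrt_asset_id('<div class="vrtvideo" data-videoid="vid-2ca5" data-publicationid="pbs-pub-7855">')
# returns 'pbs-pub-7855$vid-2ca5'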
- - formats = [] - mobj = re.search( - r'data-video-iphone-server="(?P[^"]+)"\s+data-video-iphone-path="(?P[^"]+)"', - webpage) - if mobj: - formats.extend(self._extract_m3u8_formats( - '%s/%s' % (mobj.group('server'), mobj.group('path')), - video_id, 'mp4', m3u8_id='hls', fatal=False)) - mobj = re.search(r'data-video-src="(?P[^"]+)"', webpage) - if mobj: - formats.extend(self._extract_f4m_formats( - '%s/manifest.f4m' % mobj.group('src'), - video_id, f4m_id='hds', fatal=False)) - - if not formats and 'data-video-geoblocking="true"' in webpage: - self.raise_geo_restricted('This video is only available in Belgium') - - self._sort_formats(formats) - - title = self._og_search_title(webpage) - description = self._og_search_description(webpage, default=None) - thumbnail = self._og_search_thumbnail(webpage) - timestamp = float_or_none(self._search_regex( - r'data-video-sitestat-pubdate="(\d+)"', webpage, 'timestamp', fatal=False), 1000) - duration = float_or_none(self._search_regex( - r'data-video-duration="(\d+)"', webpage, 'duration', fatal=False), 1000) + site, display_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, display_id) + attrs = extract_attributes(self._search_regex( + r'(<[^>]+class="vrtvideo"[^>]*>)', webpage, 'vrt video')) + + asset_id = attrs['data-videoid'] + publication_id = attrs.get('data-publicationid') + if publication_id: + asset_id = publication_id + '$' + asset_id + client = attrs.get('data-client') or self._CLIENT_MAP[site] + + title = strip_or_none(get_element_by_class( + 'vrt-title', webpage) or self._html_search_meta( + ['og:title', 'twitter:title', 'name'], webpage)) + description = self._html_search_meta( + ['og:description', 'twitter:description', 'description'], webpage) + if description == '…': + description = None + timestamp = unified_timestamp(self._html_search_meta( + 'article:published_time', webpage)) return { - 'id': video_id, + '_type': 'url_transparent', + 'id': asset_id, + 'display_id': display_id, 'title': title, 'description': description, - 'thumbnail': thumbnail, + 'thumbnail': attrs.get('data-posterimage'), 'timestamp': timestamp, - 'duration': duration, - 'formats': formats, + 'duration': float_or_none(attrs.get('data-duration'), 1000), + 'url': 'https://mediazone.vrt.be/api/v1/%s/assets/%s' % (client, asset_id), + 'ie_key': 'Canvas', } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/vrv.py youtube-dl-2019.05.20/youtube_dl/extractor/vrv.py --- youtube-dl-2016.02.22/youtube_dl/extractor/vrv.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/vrv.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,269 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import base64 +import json +import hashlib +import hmac +import random +import string +import time + +from .common import InfoExtractor +from ..compat import ( + compat_HTTPError, + compat_urllib_parse_urlencode, + compat_urllib_parse, +) +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, +) + + +class VRVBaseIE(InfoExtractor): + _API_DOMAIN = None + _API_PARAMS = {} + _CMS_SIGNING = {} + _TOKEN = None + _TOKEN_SECRET = '' + + def _call_api(self, path, video_id, note, data=None): + # https://tools.ietf.org/html/rfc5849#section-3 + base_url = self._API_DOMAIN + '/core/' + path + query = [ + ('oauth_consumer_key', self._API_PARAMS['oAuthKey']), + ('oauth_nonce', ''.join([random.choice(string.ascii_letters) for _ in range(32)])), + ('oauth_signature_method', 'HMAC-SHA1'), + ('oauth_timestamp', 
int(time.time())), + ] + if self._TOKEN: + query.append(('oauth_token', self._TOKEN)) + encoded_query = compat_urllib_parse_urlencode(query) + headers = self.geo_verification_headers() + if data: + data = json.dumps(data).encode() + headers['Content-Type'] = 'application/json' + base_string = '&'.join([ + 'POST' if data else 'GET', + compat_urllib_parse.quote(base_url, ''), + compat_urllib_parse.quote(encoded_query, '')]) + oauth_signature = base64.b64encode(hmac.new( + (self._API_PARAMS['oAuthSecret'] + '&' + self._TOKEN_SECRET).encode('ascii'), + base_string.encode(), hashlib.sha1).digest()).decode() + encoded_query += '&oauth_signature=' + compat_urllib_parse.quote(oauth_signature, '') + try: + return self._download_json( + '?'.join([base_url, encoded_query]), video_id, + note='Downloading %s JSON metadata' % note, headers=headers, data=data) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + raise ExtractorError(json.loads(e.cause.read().decode())['message'], expected=True) + raise + + def _call_cms(self, path, video_id, note): + if not self._CMS_SIGNING: + self._CMS_SIGNING = self._call_api('index', video_id, 'CMS Signing')['cms_signing'] + return self._download_json( + self._API_DOMAIN + path, video_id, query=self._CMS_SIGNING, + note='Downloading %s JSON metadata' % note, headers=self.geo_verification_headers()) + + def _get_cms_resource(self, resource_key, video_id): + return self._call_api( + 'cms_resource', video_id, 'resource path', data={ + 'resource_key': resource_key, + })['__links__']['cms_resource']['href'] + + def _real_initialize(self): + webpage = self._download_webpage( + 'https://vrv.co/', None, headers=self.geo_verification_headers()) + self._API_PARAMS = self._parse_json(self._search_regex( + [ + r'window\.__APP_CONFIG__\s*=\s*({.+?})(?:|;)', + r'window\.__APP_CONFIG__\s*=\s*({.+})' + ], webpage, 'app config'), None)['cxApiParams'] + self._API_DOMAIN = self._API_PARAMS.get('apiDomain', 'https://api.vrv.co') + + +class VRVIE(VRVBaseIE): + IE_NAME = 'vrv' + _VALID_URL = r'https?://(?:www\.)?vrv\.co/watch/(?P[A-Z0-9]+)' + _TESTS = [{ + 'url': 'https://vrv.co/watch/GR9PNZ396/Hidden-America-with-Jonah-Ray:BOSTON-WHERE-THE-PAST-IS-THE-PRESENT', + 'info_dict': { + 'id': 'GR9PNZ396', + 'ext': 'mp4', + 'title': 'BOSTON: WHERE THE PAST IS THE PRESENT', + 'description': 'md5:4ec8844ac262ca2df9e67c0983c6b83f', + 'uploader_id': 'seeso', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # movie listing + 'url': 'https://vrv.co/watch/G6NQXZ1J6/Lily-CAT', + 'info_dict': { + 'id': 'G6NQXZ1J6', + 'title': 'Lily C.A.T', + 'description': 'md5:988b031e7809a6aeb60968be4af7db07', + }, + 'playlist_count': 2, + }] + _NETRC_MACHINE = 'vrv' + + def _real_initialize(self): + super(VRVIE, self)._real_initialize() + + email, password = self._get_login_info() + if email is None: + return + + token_credentials = self._call_api( + 'authenticate/by:credentials', None, 'Token Credentials', data={ + 'email': email, + 'password': password, + }) + self._TOKEN = token_credentials['oauth_token'] + self._TOKEN_SECRET = token_credentials['oauth_token_secret'] + + def _extract_vrv_formats(self, url, video_id, stream_format, audio_lang, hardsub_lang): + if not url or stream_format not in ('hls', 'dash', 'adaptive_hls'): + return [] + stream_id_list = [] + if audio_lang: + stream_id_list.append('audio-%s' % audio_lang) + if hardsub_lang: + stream_id_list.append('hardsub-%s' % hardsub_lang) + format_id = stream_format + if stream_id_list: 
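# Editor's note: _call_api above signs each request per RFC 5849 (OAuth 1.0):
# the HTTP method, the percent-encoded base URL and the percent-encoded query
# string are joined with '&' into a base string, which is HMAC-SHA1'd with
# "consumer_secret&token_secret" as the key. A standalone sketch of just the
# signature step (stdlib only; parameter names are illustrative):
import base64
import hashlib
import hmac
from urllib.parse import quote, urlencode

def oauth1_signature(method, base_url, query_pairs, consumer_secret, token_secret=''):
    base_string = '&'.join([
        method.upper(),
        quote(base_url, ''),
        quote(urlencode(query_pairs), ''),
    ])
    key = (consumer_secret + '&' + token_secret).encode('ascii')
    digest = hmac.new(key, base_string.encode('utf-8'), hashlib.sha1).digest()
    return base64.b64encode(digest).decode()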
+ format_id += '-' + '-'.join(stream_id_list) + if 'hls' in stream_format: + adaptive_formats = self._extract_m3u8_formats( + url, video_id, 'mp4', m3u8_id=format_id, + note='Downloading %s information' % format_id, + fatal=False) + elif stream_format == 'dash': + adaptive_formats = self._extract_mpd_formats( + url, video_id, mpd_id=format_id, + note='Downloading %s information' % format_id, + fatal=False) + if audio_lang: + for f in adaptive_formats: + if f.get('acodec') != 'none': + f['language'] = audio_lang + return adaptive_formats + + def _real_extract(self, url): + video_id = self._match_id(url) + + object_data = self._call_cms(self._get_cms_resource( + 'cms:/objects/' + video_id, video_id), video_id, 'object')['items'][0] + resource_path = object_data['__links__']['resource']['href'] + video_data = self._call_cms(resource_path, video_id, 'video') + title = video_data['title'] + description = video_data.get('description') + + if video_data.get('__class__') == 'movie_listing': + items = self._call_cms( + video_data['__links__']['movie_listing/movies']['href'], + video_id, 'movie listing').get('items') or [] + if len(items) != 1: + entries = [] + for item in items: + item_id = item.get('id') + if not item_id: + continue + entries.append(self.url_result( + 'https://vrv.co/watch/' + item_id, + self.ie_key(), item_id, item.get('title'))) + return self.playlist_result(entries, video_id, title, description) + video_data = items[0] + + streams_path = video_data['__links__'].get('streams', {}).get('href') + if not streams_path: + self.raise_login_required() + streams_json = self._call_cms(streams_path, video_id, 'streams') + + audio_locale = streams_json.get('audio_locale') + formats = [] + for stream_type, streams in streams_json.get('streams', {}).items(): + if stream_type in ('adaptive_hls', 'adaptive_dash'): + for stream in streams.values(): + formats.extend(self._extract_vrv_formats( + stream.get('url'), video_id, stream_type.split('_')[1], + audio_locale, stream.get('hardsub_locale'))) + self._sort_formats(formats) + + subtitles = {} + for k in ('captions', 'subtitles'): + for subtitle in streams_json.get(k, {}).values(): + subtitle_url = subtitle.get('url') + if not subtitle_url: + continue + subtitles.setdefault(subtitle.get('locale', 'en-US'), []).append({ + 'url': subtitle_url, + 'ext': subtitle.get('format', 'ass'), + }) + + thumbnails = [] + for thumbnail in video_data.get('images', {}).get('thumbnails', []): + thumbnail_url = thumbnail.get('source') + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': thumbnails, + 'description': description, + 'duration': float_or_none(video_data.get('duration_ms'), 1000), + 'uploader_id': video_data.get('channel_id'), + 'series': video_data.get('series_title'), + 'season': video_data.get('season_title'), + 'season_number': int_or_none(video_data.get('season_number')), + 'season_id': video_data.get('season_id'), + 'episode': title, + 'episode_number': int_or_none(video_data.get('episode_number')), + 'episode_id': video_data.get('production_episode_id'), + } + + +class VRVSeriesIE(VRVBaseIE): + IE_NAME = 'vrv:series' + _VALID_URL = r'https?://(?:www\.)?vrv\.co/series/(?P[A-Z0-9]+)' + _TEST = { + 'url': 'https://vrv.co/series/G68VXG3G6/The-Perfect-Insider', + 'info_dict': { + 'id': 'G68VXG3G6', + }, + 
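A condensed sketch of the subtitle collection used in _real_extract above: both the 'captions' and 'subtitles' maps of the streams document are folded into a single locale-keyed dict (the sample payload is hypothetical).

def collect_subtitles(streams_json):
    # Fold 'captions' and 'subtitles' into {locale: [{'url': ..., 'ext': ...}]}
    subtitles = {}
    for key in ('captions', 'subtitles'):
        for sub in streams_json.get(key, {}).values():
            if not sub.get('url'):
                continue
            subtitles.setdefault(sub.get('locale', 'en-US'), []).append({
                'url': sub['url'],
                'ext': sub.get('format', 'ass'),
            })
    return subtitles

# collect_subtitles({'captions': {'x': {'url': 'https://example.com/s.ass', 'locale': 'ja-JP'}}})
# -> {'ja-JP': [{'url': 'https://example.com/s.ass', 'ext': 'ass'}]}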
'playlist_mincount': 11, + } + + def _real_extract(self, url): + series_id = self._match_id(url) + + seasons_path = self._get_cms_resource( + 'cms:/seasons?series_id=' + series_id, series_id) + seasons_data = self._call_cms(seasons_path, series_id, 'seasons') + + entries = [] + for season in seasons_data.get('items', []): + episodes_path = season['__links__']['season/episodes']['href'] + episodes = self._call_cms(episodes_path, series_id, 'episodes') + for episode in episodes.get('items', []): + episode_id = episode['id'] + entries.append(self.url_result( + 'https://vrv.co/watch/' + episode_id, + 'VRV', episode_id, episode.get('title'))) + + return self.playlist_result(entries, series_id) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/vshare.py youtube-dl-2019.05.20/youtube_dl/extractor/vshare.py --- youtube-dl-2016.02.22/youtube_dl/extractor/vshare.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/vshare.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,74 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_chr +from ..utils import ( + decode_packed_codes, + ExtractorError, +) + + +class VShareIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vshare\.io/[dv]/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://vshare.io/d/0f64ce6', + 'md5': '17b39f55b5497ae8b59f5fbce8e35886', + 'info_dict': { + 'id': '0f64ce6', + 'title': 'vl14062007715967', + 'ext': 'mp4', + } + }, { + 'url': 'https://vshare.io/v/0f64ce6/width-650/height-430/1', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r']+?src=["\'](?P(?:https?:)?//(?:www\.)?vshare\.io/v/[^/?#&]+)', + webpage) + + def _extract_packed(self, webpage): + packed = self._search_regex( + r'(eval\(function.+)', webpage, 'packed code') + unpacked = decode_packed_codes(packed) + digits = self._search_regex(r'\[((?:\d+,?)+)\]', unpacked, 'digits') + digits = [int(digit) for digit in digits.split(',')] + key_digit = self._search_regex( + r'fromCharCode\(.+?(\d+)\)}', unpacked, 'key digit') + chars = [compat_chr(d - int(key_digit)) for d in digits] + return ''.join(chars) + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'https://vshare.io/v/%s/width-650/height-430/1' % video_id, + video_id, headers={'Referer': url}) + + title = self._html_search_regex( + r'([^<]+)', webpage, 'title') + title = title.split(' - ')[0] + + error = self._html_search_regex( + r'(?s)]+\bclass=["\']xxx-error[^>]+>(.+?)%s' % self._extract_packed(webpage), + video_id)[0] + + self._sort_formats(info['formats']) + + info.update({ + 'id': video_id, + 'title': title, + }) + + return info diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/vube.py youtube-dl-2019.05.20/youtube_dl/extractor/vube.py --- youtube-dl-2016.02.22/youtube_dl/extractor/vube.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/vube.py 2019-06-07 18:49:07.000000000 +0000 @@ -15,7 +15,7 @@ class VubeIE(InfoExtractor): IE_NAME = 'vube' IE_DESC = 'Vube.com' - _VALID_URL = r'http://vube\.com/(?:[^/]+/)+(?P[\da-zA-Z]{10})\b' + _VALID_URL = r'https?://vube\.com/(?:[^/]+/)+(?P[\da-zA-Z]{10})\b' _TESTS = [ { @@ -26,7 +26,7 @@ 'ext': 'mp4', 'title': 'Best Drummer Ever [HD]', 'description': 'md5:2d63c4b277b85c2277761c2cf7337d71', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'uploader': 'William', 'timestamp': 1406876915, 
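The vshare deobfuscation above works in two stages: decode_packed_codes unpacks the standard P.A.C.K.E.R. eval wrapper, then the digit array it reveals is shifted back by the key taken from the fromCharCode call. A toy illustration of the second stage (digits and key made up):

def decode_digits(digits, key_digit):
    # Each entry is a character code offset by key_digit; undo the shift
    return ''.join(chr(d - key_digit) for d in digits)

# decode_digits([107, 108], 3) == 'hi'  (104 -> 'h', 105 -> 'i')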
'upload_date': '20140801', @@ -45,7 +45,7 @@ 'ext': 'mp4', 'title': 'Chiara Grispo - Price Tag by Jessie J', 'description': 'md5:8ea652a1f36818352428cb5134933313', - 'thumbnail': 're:^http://frame\.thestaticvube\.com/snap/[0-9x]+/102e7e63057-5ebc-4f5c-4065-6ce4ebde131f\.jpg$', + 'thumbnail': r're:^http://frame\.thestaticvube\.com/snap/[0-9x]+/102e7e63057-5ebc-4f5c-4065-6ce4ebde131f\.jpg$', 'uploader': 'Chiara.Grispo', 'timestamp': 1388743358, 'upload_date': '20140103', @@ -65,7 +65,7 @@ 'ext': 'mp4', 'title': 'My 7 year old Sister and I singing "Alive" by Krewella', 'description': 'md5:40bcacb97796339f1690642c21d56f4a', - 'thumbnail': 're:^http://frame\.thestaticvube\.com/snap/[0-9x]+/102265d5a9f-0f17-4f6b-5753-adf08484ee1e\.jpg$', + 'thumbnail': r're:^http://frame\.thestaticvube\.com/snap/[0-9x]+/102265d5a9f-0f17-4f6b-5753-adf08484ee1e\.jpg$', 'uploader': 'Seraina', 'timestamp': 1396492438, 'upload_date': '20140403', @@ -84,7 +84,7 @@ 'ext': 'mp4', 'title': 'Frozen - Let It Go Cover by Siren Gene', 'description': 'My rendition of "Let It Go" originally sung by Idina Menzel.', - 'thumbnail': 're:^http://frame\.thestaticvube\.com/snap/[0-9x]+/10283ab622a-86c9-4681-51f2-30d1f65774af\.jpg$', + 'thumbnail': r're:^http://frame\.thestaticvube\.com/snap/[0-9x]+/10283ab622a-86c9-4681-51f2-30d1f65774af\.jpg$', 'uploader': 'Siren', 'timestamp': 1395448018, 'upload_date': '20140322', diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/vuclip.py youtube-dl-2019.05.20/youtube_dl/extractor/vuclip.py --- youtube-dl-2016.02.22/youtube_dl/extractor/vuclip.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/vuclip.py 2019-06-07 18:49:07.000000000 +0000 @@ -9,20 +9,20 @@ from ..utils import ( ExtractorError, parse_duration, - qualities, + remove_end, ) class VuClipIE(InfoExtractor): - _VALID_URL = r'http://(?:m\.)?vuclip\.com/w\?.*?cid=(?P[0-9]+)' + _VALID_URL = r'https?://(?:m\.)?vuclip\.com/w\?.*?cid=(?P[0-9]+)' _TEST = { - 'url': 'http://m.vuclip.com/w?cid=922692425&fid=70295&z=1010&nvar&frm=index.html', + 'url': 'http://m.vuclip.com/w?cid=1129900602&bu=8589892792&frm=w&z=34801&op=0&oc=843169247§ion=recommend', 'info_dict': { - 'id': '922692425', + 'id': '1129900602', 'ext': '3gp', - 'title': 'The Toy Soldiers - Hollywood Movie Trailer', - 'duration': 180, + 'title': 'Top 10 TV Convicts', + 'duration': 733, } } @@ -46,34 +46,21 @@ '%s said: %s' % (self.IE_NAME, error_msg), expected=True) # These clowns alternate between two page types - links_code = self._search_regex( - r'''(?xs) - (?: - | - \s* - ) - (.*?) 
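As the 'two page types' comment above notes, the rewritten vuclip extractor (continued just below) first looks for a plain link wrapping the play button and only then falls back to the generic HTML5 media parser. A schematic of that fallback, with the eaten tag names restored and the parser injected as a callable:

import re

def extract_vuclip_formats(webpage, parse_html5_entries):
    # Page type 1: a direct link next to the play.gif button
    m = re.search(
        r'<a[^>]+href="([^"]+)"[^>]*><img[^>]+src="[^"]*/play\.gif',
        webpage)
    if m:
        return [{'url': m.group(1)}]
    # Page type 2: an ordinary HTML5 <video> element
    return parse_html5_entries(webpage)[0]['formats']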
- (?: - - ) - ''', webpage, 'links') - title = self._html_search_regex( - r'(.*?)-\s*Vuclip', webpage, 'title').strip() - - quality_order = qualities(['Reg', 'Hi']) - formats = [] - for url, q in re.findall( - r'[^"]+)".*?>(?:]*>)?(?P[^<]+)(?:)?', links_code): - format_id = compat_urllib_parse_urlparse(url).scheme + '-' + q - formats.append({ - 'format_id': format_id, - 'url': url, - 'quality': quality_order(q), - }) - self._sort_formats(formats) + video_url = self._search_regex( + r']+href="([^"]+)"[^>]*>]+src="[^"]*/play\.gif', + webpage, 'video URL', default=None) + if video_url: + formats = [{ + 'url': video_url, + }] + else: + formats = self._parse_html5_media_entries(url, webpage, video_id)[0]['formats'] - duration = parse_duration(self._search_regex( - r'\(([0-9:]+)\)', webpage, 'duration', fatal=False)) + title = remove_end(self._html_search_regex( + r'(.*?)-\s*Vuclip', webpage, 'title').strip(), ' - Video') + + duration = parse_duration(self._html_search_regex( + r'[(>]([0-9]+:[0-9]+)(?:[^/]+)/' - _TEST = { - 'url': 'http://video.vulture.com/video/Mindy-Kaling-s-Harvard-Speech/player?layout=compact&read_more=1', - 'md5': '8d997845642a2b5152820f7257871bc8', - 'info_dict': { - 'id': '6GHRQL3RV7MSD1H4', - 'ext': 'mp4', - 'title': 'kaling-speech-2-MAGNIFY STANDARD CONTAINER REVISED', - 'uploader_id': 'Sarah', - 'thumbnail': 're:^http://.*\.jpg$', - 'timestamp': 1401288564, - 'upload_date': '20140528', - 'description': 'Uplifting and witty, as predicted.', - 'duration': 1015, - } - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('display_id') - - webpage = self._download_webpage(url, display_id) - query_string = self._search_regex( - r"queryString\s*=\s*'([^']+)'", webpage, 'query string') - video_id = self._search_regex( - r'content=([^&]+)', query_string, 'video ID') - query_url = 'http://video.vulture.com/embed/player/container/1000/1000/?%s' % query_string - - query_webpage = self._download_webpage( - query_url, display_id, note='Downloading query page') - params_json = self._search_regex( - r'(?sm)new MagnifyEmbeddablePlayer\({.*?contentItem:\s*(\{.*?\})\n?,\n', - query_webpage, - 'player params') - params = json.loads(params_json) - - upload_timestamp = parse_iso8601(params['posted'].replace(' ', 'T')) - uploader_id = params.get('user', {}).get('handle') - - media_item = params['media_item'] - title = os.path.splitext(media_item['title'])[0] - duration = int_or_none(media_item.get('duration_seconds')) - - return { - 'id': video_id, - 'display_id': display_id, - 'url': media_item['pipeline_xid'], - 'title': title, - 'timestamp': upload_timestamp, - 'thumbnail': params.get('thumbnail_url'), - 'uploader_id': uploader_id, - 'description': params.get('description'), - 'duration': duration, - } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/vvvvid.py youtube-dl-2019.05.20/youtube_dl/extractor/vvvvid.py --- youtube-dl-2016.02.22/youtube_dl/extractor/vvvvid.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/vvvvid.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,158 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + str_or_none, +) + + +class VVVVIDIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vvvvid\.it/(?:#!)?(?:show|anime|film|series)/(?P\d+)/[^/]+/(?P\d+)/(?P[0-9]+)' + _TESTS = [{ + # video_type == 'video/vvvvid' + 'url': 
'https://www.vvvvid.it/#!show/434/perche-dovrei-guardarlo-di-dario-moccia/437/489048/ping-pong', + 'md5': 'b8d3cecc2e981adc3835adf07f6df91b', + 'info_dict': { + 'id': '489048', + 'ext': 'mp4', + 'title': 'Ping Pong', + }, + 'params': { + 'skip_download': True, + }, + }, { + # video_type == 'video/rcs' + 'url': 'https://www.vvvvid.it/#!show/376/death-note-live-action/377/482493/episodio-01', + 'md5': '33e0edfba720ad73a8782157fdebc648', + 'info_dict': { + 'id': '482493', + 'ext': 'mp4', + 'title': 'Episodio 01', + }, + 'params': { + 'skip_download': True, + }, + }] + _conn_id = None + + def _real_initialize(self): + self._conn_id = self._download_json( + 'https://www.vvvvid.it/user/login', + None, headers=self.geo_verification_headers())['data']['conn_id'] + + def _real_extract(self, url): + show_id, season_id, video_id = re.match(self._VALID_URL, url).groups() + response = self._download_json( + 'https://www.vvvvid.it/vvvvid/ondemand/%s/season/%s' % (show_id, season_id), + video_id, headers=self.geo_verification_headers(), query={ + 'conn_id': self._conn_id, + }) + if response['result'] == 'error': + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, response['message']), expected=True) + + vid = int(video_id) + video_data = list(filter( + lambda episode: episode.get('video_id') == vid, response['data']))[0] + formats = [] + + # vvvvid embed_info decryption algorithm is reverse engineered from function $ds(h) at vvvvid.js + def ds(h): + g = "MNOPIJKL89+/4567UVWXQRSTEFGHABCDcdefYZabstuvopqr0123wxyzklmnghij" + + def f(m): + l = [] + o = 0 + b = False + m_len = len(m) + while ((not b) and o < m_len): + n = m[o] << 2 + o += 1 + k = -1 + j = -1 + if o < m_len: + n += m[o] >> 4 + o += 1 + if o < m_len: + k = (m[o - 1] << 4) & 255 + k += m[o] >> 2 + o += 1 + if o < m_len: + j = (m[o - 1] << 6) & 255 + j += m[o] + o += 1 + else: + b = True + else: + b = True + else: + b = True + l.append(n) + if k != -1: + l.append(k) + if j != -1: + l.append(j) + return l + + c = [] + for e in h: + c.append(g.index(e)) + + c_len = len(c) + for e in range(c_len * 2 - 1, -1, -1): + a = c[e % c_len] ^ c[(e + 1) % c_len] + c[e % c_len] = a + + c = f(c) + d = '' + for e in c: + d += chr(e) + + return d + + for quality in ('_sd', ''): + embed_code = video_data.get('embed_info' + quality) + if not embed_code: + continue + embed_code = ds(embed_code) + video_type = video_data.get('video_type') + if video_type in ('video/rcs', 'video/kenc'): + embed_code = re.sub(r'https?://([^/]+)/z/', r'https://\1/i/', embed_code).replace('/manifest.f4m', '/master.m3u8') + if video_type == 'video/kenc': + kenc = self._download_json( + 'https://www.vvvvid.it/kenc', video_id, query={ + 'action': 'kt', + 'conn_id': self._conn_id, + 'url': embed_code, + }, fatal=False) or {} + kenc_message = kenc.get('message') + if kenc_message: + embed_code += '?' 
+ ds(kenc_message) + formats.extend(self._extract_m3u8_formats( + embed_code, video_id, 'mp4', + m3u8_id='hls', fatal=False)) + else: + formats.extend(self._extract_wowza_formats( + 'http://sb.top-ix.org/videomg/_definst_/mp4:%s/playlist.m3u8' % embed_code, video_id)) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_data['title'], + 'formats': formats, + 'thumbnail': video_data.get('thumbnail'), + 'duration': int_or_none(video_data.get('length')), + 'series': video_data.get('show_title'), + 'season_id': season_id, + 'season_number': video_data.get('season_number'), + 'episode_id': str_or_none(video_data.get('id')), + 'episode_number': int_or_none(video_data.get('number')), + 'episode_title': video_data['title'], + 'view_count': int_or_none(video_data.get('views')), + 'like_count': int_or_none(video_data.get('video_likes')), + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/vyborymos.py youtube-dl-2019.05.20/youtube_dl/extractor/vyborymos.py --- youtube-dl-2016.02.22/youtube_dl/extractor/vyborymos.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/vyborymos.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,55 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str + + +class VyboryMosIE(InfoExtractor): + _VALID_URL = r'https?://vybory\.mos\.ru/(?:#precinct/|account/channels\?.*?\bstation_id=)(?P\d+)' + _TESTS = [{ + 'url': 'http://vybory.mos.ru/#precinct/13636', + 'info_dict': { + 'id': '13636', + 'ext': 'mp4', + 'title': 're:^Участковая избирательная комиссия №2231 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': 'Россия, Москва, улица Введенского, 32А', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + } + }, { + 'url': 'http://vybory.mos.ru/account/channels?station_id=13636', + 'only_matching': True, + }] + + def _real_extract(self, url): + station_id = self._match_id(url) + + channels = self._download_json( + 'http://vybory.mos.ru/account/channels?station_id=%s' % station_id, + station_id, 'Downloading channels JSON') + + formats = [] + for cam_num, (sid, hosts, name, _) in enumerate(channels, 1): + for num, host in enumerate(hosts, 1): + formats.append({ + 'url': 'http://%s/master.m3u8?sid=%s' % (host, sid), + 'ext': 'mp4', + 'format_id': 'camera%d-host%d' % (cam_num, num), + 'format_note': '%s, %s' % (name, host), + }) + + info = self._download_json( + 'http://vybory.mos.ru/json/voting_stations/%s/%s.json' + % (compat_str(station_id)[:3], station_id), + station_id, 'Downloading station JSON', fatal=False) + + return { + 'id': station_id, + 'title': self._live_title(info['name'] if info else station_id), + 'description': info.get('address'), + 'is_live': True, + 'formats': formats, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/vzaar.py youtube-dl-2019.05.20/youtube_dl/extractor/vzaar.py --- youtube-dl-2016.02.22/youtube_dl/extractor/vzaar.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/vzaar.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,91 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + float_or_none, + unified_timestamp, + url_or_none, +) + + +class VzaarIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www|view)\.)?vzaar\.com/(?:videos/)?(?P\d+)' + _TESTS = [{ + # HTTP and HLS + 'url': 'https://vzaar.com/videos/1152805', + 
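The ds() routine above (reverse engineered from vvvvid.js, as its comment says) is essentially a scrambled base64: characters index into a custom 64-symbol alphabet, a backward XOR pass descrambles neighbouring 6-bit values, and f() repacks them into 8-bit characters. A compact restatement of the descrambling pass, with the alphabet copied from the code above:

ALPHABET = 'MNOPIJKL89+/4567UVWXQRSTEFGHABCDcdefYZabstuvopqr0123wxyzklmnghij'

def descramble(token):
    # Map each character to its 6-bit value in the custom alphabet
    c = [ALPHABET.index(ch) for ch in token]
    # Backward XOR pass over the ring of values, as in ds() above
    n = len(c)
    for e in range(n * 2 - 1, -1, -1):
        c[e % n] ^= c[(e + 1) % n]
    return c  # ds() then feeds these 6-bit groups to f() to rebuild the bytes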
'md5': 'bde5ddfeb104a6c56a93a06b04901dbf', + 'info_dict': { + 'id': '1152805', + 'ext': 'mp4', + 'title': 'sample video (public)', + }, + }, { + 'url': 'https://view.vzaar.com/27272/player', + 'md5': '3b50012ac9bbce7f445550d54e0508f2', + 'info_dict': { + 'id': '27272', + 'ext': 'mp3', + 'title': 'MP3', + }, + }] + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r']+src=["\']((?:https?:)?//(?:view\.vzaar\.com)/[0-9]+)', + webpage) + + def _real_extract(self, url): + video_id = self._match_id(url) + video_data = self._download_json( + 'http://view.vzaar.com/v2/%s/video' % video_id, video_id) + + title = video_data['videoTitle'] + + formats = [] + + source_url = url_or_none(video_data.get('sourceUrl')) + if source_url: + f = { + 'url': source_url, + 'format_id': 'http', + } + if 'audio' in source_url: + f.update({ + 'vcodec': 'none', + 'ext': 'mp3', + }) + else: + f.update({ + 'width': int_or_none(video_data.get('width')), + 'height': int_or_none(video_data.get('height')), + 'ext': 'mp4', + 'fps': float_or_none(video_data.get('fps')), + }) + formats.append(f) + + video_guid = video_data.get('guid') + usp = video_data.get('usp') + if isinstance(video_guid, compat_str) and isinstance(usp, dict): + m3u8_url = ('http://fable.vzaar.com/v4/usp/%s/%s.ism/.m3u8?' + % (video_guid, video_id)) + '&'.join( + '%s=%s' % (k, v) for k, v in usp.items()) + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': self._proto_relative_url(video_data.get('poster')), + 'duration': float_or_none(video_data.get('videoDuration')), + 'timestamp': unified_timestamp(video_data.get('ts')), + 'formats': formats, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/wakanim.py youtube-dl-2019.05.20/youtube_dl/extractor/wakanim.py --- youtube-dl-2016.02.22/youtube_dl/extractor/wakanim.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/wakanim.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,66 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + merge_dicts, + urljoin, +) + + +class WakanimIE(InfoExtractor): + _VALID_URL = r'https://(?:www\.)?wakanim\.tv/[^/]+/v2/catalogue/episode/(?P\d+)' + _TESTS = [{ + 'url': 'https://www.wakanim.tv/de/v2/catalogue/episode/2997/the-asterisk-war-omu-staffel-1-episode-02-omu', + 'info_dict': { + 'id': '2997', + 'ext': 'mp4', + 'title': 'Episode 02', + 'description': 'md5:2927701ea2f7e901de8bfa8d39b2852d', + 'series': 'The Asterisk War (OmU.)', + 'season_number': 1, + 'episode': 'Episode 02', + 'episode_number': 2, + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, + }, { + # DRM Protected + 'url': 'https://www.wakanim.tv/de/v2/catalogue/episode/7843/sword-art-online-alicization-omu-arc-2-folge-15-omu', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + m3u8_url = urljoin(url, self._search_regex( + r'file\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, 'm3u8 url', + group='url')) + # https://docs.microsoft.com/en-us/azure/media-services/previous/media-services-content-protection-overview#streaming-urls + encryption = self._search_regex( + r'encryption%3D(c(?:enc|bc(?:s-aapl)?))', + m3u8_url, 'encryption', default=None) + if encryption and encryption in 
('cenc', 'cbcs-aapl'): + raise ExtractorError('This video is DRM protected.', expected=True) + + formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + + info = self._search_json_ld(webpage, video_id, default={}) + + title = self._search_regex( + (r']+\bclass=["\']episode_h1[^>]+\btitle=(["\'])(?P(?:(?!\1).)+)\1', + r'<span[^>]+\bclass=["\']episode_title["\'][^>]*>(?P<title>[^<]+)'), + webpage, 'title', default=None, group='title') + + return merge_dicts(info, { + 'id': video_id, + 'title': title, + 'formats': formats, + }) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/walla.py youtube-dl-2019.05.20/youtube_dl/extractor/walla.py --- youtube-dl-2016.02.22/youtube_dl/extractor/walla.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/walla.py 2019-06-07 18:49:07.000000000 +0000 @@ -11,7 +11,7 @@ class WallaIE(InfoExtractor): - _VALID_URL = r'http://vod\.walla\.co\.il/[^/]+/(?P<id>\d+)/(?P<display_id>.+)' + _VALID_URL = r'https?://vod\.walla\.co\.il/[^/]+/(?P<id>\d+)/(?P<display_id>.+)' _TEST = { 'url': 'http://vod.walla.co.il/movie/2642630/one-direction-all-for-one', 'info_dict': { @@ -20,7 +20,7 @@ 'ext': 'flv', 'title': 'וואן דיירקשן: ההיסטריה', 'description': 'md5:de9e2512a92442574cdb0913c49bc4d8', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'duration': 3600, }, 'params': { diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/washingtonpost.py youtube-dl-2019.05.20/youtube_dl/extractor/washingtonpost.py --- youtube-dl-2016.02.22/youtube_dl/extractor/washingtonpost.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/washingtonpost.py 2019-06-07 18:49:07.000000000 +0000 @@ -11,7 +11,102 @@ class WashingtonPostIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?washingtonpost\.com/.*?/(?P<id>[^/]+)/(?:$|[?#])' + IE_NAME = 'washingtonpost' + _VALID_URL = r'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/video/(?:[^/]+/)*)(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _EMBED_URL = r'https?://(?:www\.)?washingtonpost\.com/video/c/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' + _TEST = { + 'url': 'https://www.washingtonpost.com/video/c/video/480ba4ee-1ec7-11e6-82c2-a7dcb313287d', + 'md5': '6f537e1334b714eb15f9563bd4b9cdfa', + 'info_dict': { + 'id': '480ba4ee-1ec7-11e6-82c2-a7dcb313287d', + 'ext': 'mp4', + 'title': 'Egypt finds belongings, debris from plane crash', + 'description': 'md5:a17ceee432f215a5371388c1f680bd86', + 'upload_date': '20160520', + 'uploader': 'Reuters', + 'timestamp': 1463778452, + }, + } + + @classmethod + def _extract_urls(cls, webpage): + return re.findall( + r'<iframe[^>]+\bsrc=["\'](%s)' % cls._EMBED_URL, webpage) + + def _real_extract(self, url): + video_id = self._match_id(url) + video_data = self._download_json( + 'http://www.washingtonpost.com/posttv/c/videojson/%s?resType=jsonp' % video_id, + video_id, transform_source=strip_jsonp)[0]['contentConfig'] + title = video_data['title'] + + urls = [] + formats = [] + for s in video_data.get('streams', []): + s_url = s.get('url') + if not s_url or s_url in urls: + continue + urls.append(s_url) + video_type = s.get('type') + if video_type == 'smil': + continue + elif video_type in ('ts', 'hls') and ('_master.m3u8' in s_url or '_mobile.m3u8' in s_url): + m3u8_formats = self._extract_m3u8_formats( + s_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) + for m3u8_format in m3u8_formats: + width = 
m3u8_format.get('width') + if not width: + continue + vbr = self._search_regex( + r'%d_%d_(\d+)' % (width, m3u8_format['height']), m3u8_format['url'], 'vbr', default=None) + if vbr: + m3u8_format.update({ + 'vbr': int_or_none(vbr), + }) + formats.extend(m3u8_formats) + else: + width = int_or_none(s.get('width')) + vbr = int_or_none(s.get('bitrate')) + has_width = width != 0 + formats.append({ + 'format_id': ( + '%s-%d-%d' % (video_type, width, vbr) + if width + else video_type), + 'vbr': vbr if has_width else None, + 'width': width, + 'height': int_or_none(s.get('height')), + 'acodec': s.get('audioCodec'), + 'vcodec': s.get('videoCodec') if has_width else 'none', + 'filesize': int_or_none(s.get('fileSize')), + 'url': s_url, + 'ext': 'mp4', + 'protocol': 'm3u8_native' if video_type in ('ts', 'hls') else None, + }) + source_media_url = video_data.get('sourceMediaURL') + if source_media_url: + formats.append({ + 'format_id': 'source_media', + 'url': source_media_url, + }) + self._sort_formats( + formats, ('width', 'height', 'vbr', 'filesize', 'tbr', 'format_id')) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('blurb'), + 'uploader': video_data.get('credits', {}).get('source'), + 'formats': formats, + 'duration': int_or_none(video_data.get('videoDuration'), 100), + 'timestamp': int_or_none( + video_data.get('dateConfig', {}).get('dateFirstPublished'), 1000), + } + + +class WashingtonPostArticleIE(InfoExtractor): + IE_NAME = 'washingtonpost:article' + _VALID_URL = r'https?://(?:www\.)?washingtonpost\.com/(?:[^/]+/)*(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'http://www.washingtonpost.com/sf/national/2014/03/22/sinkhole-of-bureaucracy/', 'info_dict': { @@ -63,6 +158,10 @@ }] }] + @classmethod + def suitable(cls, url): + return False if WashingtonPostIE.suitable(url) else super(WashingtonPostArticleIE, cls).suitable(url) + def _real_extract(self, url): page_id = self._match_id(url) webpage = self._download_webpage(url, page_id) @@ -74,54 +173,7 @@ <div\s+class="posttv-video-embed[^>]*?data-uuid=| data-video-uuid= )"([^"]+)"''', webpage) - entries = [] - for i, uuid in enumerate(uuids, start=1): - vinfo_all = self._download_json( - 'http://www.washingtonpost.com/posttv/c/videojson/%s?resType=jsonp' % uuid, - page_id, - transform_source=strip_jsonp, - note='Downloading information of video %d/%d' % (i, len(uuids)) - ) - vinfo = vinfo_all[0]['contentConfig'] - uploader = vinfo.get('credits', {}).get('source') - timestamp = int_or_none( - vinfo.get('dateConfig', {}).get('dateFirstPublished'), 1000) - - formats = [{ - 'format_id': ( - '%s-%s-%s' % (s.get('type'), s.get('width'), s.get('bitrate')) - if s.get('width') - else s.get('type')), - 'vbr': s.get('bitrate') if s.get('width') != 0 else None, - 'width': s.get('width'), - 'height': s.get('height'), - 'acodec': s.get('audioCodec'), - 'vcodec': s.get('videoCodec') if s.get('width') != 0 else 'none', - 'filesize': s.get('fileSize'), - 'url': s.get('url'), - 'ext': 'mp4', - 'preference': -100 if s.get('type') == 'smil' else None, - 'protocol': { - 'MP4': 'http', - 'F4F': 'f4m', - }.get(s.get('type')), - } for s in vinfo.get('streams', [])] - source_media_url = vinfo.get('sourceMediaURL') - if source_media_url: - formats.append({ - 'format_id': 'source_media', - 'url': source_media_url, - }) - self._sort_formats(formats) - entries.append({ - 'id': uuid, - 'title': vinfo['title'], - 'description': vinfo.get('blurb'), - 'uploader': uploader, - 'formats': formats, - 'duration': int_or_none(vinfo.get('videoDuration'), 100), - 
'timestamp': timestamp, - }) + entries = [self.url_result('washingtonpost:%s' % uuid, 'WashingtonPost', uuid) for uuid in uuids] return { '_type': 'playlist', diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/watchbox.py youtube-dl-2019.05.20/youtube_dl/extractor/watchbox.py --- youtube-dl-2016.02.22/youtube_dl/extractor/watchbox.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/watchbox.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,161 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + js_to_json, + strip_or_none, + try_get, + unescapeHTML, + unified_timestamp, +) + + +class WatchBoxIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?watchbox\.de/(?P<kind>serien|filme)/(?:[^/]+/)*[^/]+-(?P<id>\d+)' + _TESTS = [{ + # film + 'url': 'https://www.watchbox.de/filme/free-jimmy-12325.html', + 'info_dict': { + 'id': '341368', + 'ext': 'mp4', + 'title': 'Free Jimmy', + 'description': 'md5:bcd8bafbbf9dc0ef98063d344d7cc5f6', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 4890, + 'age_limit': 16, + 'release_year': 2009, + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, + 'expected_warnings': ['Failed to download m3u8 information'], + }, { + # episode + 'url': 'https://www.watchbox.de/serien/ugly-americans-12231/staffel-1/date-in-der-hoelle-328286.html', + 'info_dict': { + 'id': '328286', + 'ext': 'mp4', + 'title': 'S01 E01 - Date in der Hölle', + 'description': 'md5:2f31c74a8186899f33cb5114491dae2b', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 1291, + 'age_limit': 12, + 'release_year': 2010, + 'series': 'Ugly Americans', + 'season_number': 1, + 'episode': 'Date in der Hölle', + 'episode_number': 1, + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, + 'expected_warnings': ['Failed to download m3u8 information'], + }, { + 'url': 'https://www.watchbox.de/serien/ugly-americans-12231/staffel-2/der-ring-des-powers-328270', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + kind, video_id = mobj.group('kind', 'id') + + webpage = self._download_webpage(url, video_id) + + player_config = self._parse_json( + self._search_regex( + r'data-player-conf=(["\'])(?P<data>{.+?})\1', webpage, + 'player config', default='{}', group='data'), + video_id, transform_source=unescapeHTML, fatal=False) + + if not player_config: + player_config = self._parse_json( + self._search_regex( + r'playerConf\s*=\s*({.+?})\s*;', webpage, 'player config', + default='{}'), + video_id, transform_source=js_to_json, fatal=False) or {} + + source = player_config.get('source') or {} + + video_id = compat_str(source.get('videoId') or video_id) + + devapi = self._download_json( + 'http://api.watchbox.de/devapi/id/%s' % video_id, video_id, query={ + 'format': 'json', + 'apikey': 'hbbtv', + }, fatal=False) + + item = try_get(devapi, lambda x: x['items'][0], dict) or {} + + title = item.get('title') or try_get( + item, lambda x: x['movie']['headline_movie'], + compat_str) or source['title'] + + formats = [] + hls_url = item.get('media_videourl_hls') or source.get('hls') + if hls_url: + formats.extend(self._extract_m3u8_formats( + hls_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + dash_url = item.get('media_videourl_wv') or source.get('dash') + if dash_url: + formats.extend(self._extract_mpd_formats( + dash_url, 
video_id, mpd_id='dash', fatal=False)) + mp4_url = item.get('media_videourl') + if mp4_url: + formats.append({ + 'url': mp4_url, + 'format_id': 'mp4', + 'width': int_or_none(item.get('width')), + 'height': int_or_none(item.get('height')), + 'tbr': int_or_none(item.get('bitrate')), + }) + self._sort_formats(formats) + + description = strip_or_none(item.get('descr')) + thumbnail = item.get('media_content_thumbnail_large') or source.get('poster') or item.get('media_thumbnail') + duration = int_or_none(item.get('media_length') or source.get('length')) + timestamp = unified_timestamp(item.get('pubDate')) + view_count = int_or_none(item.get('media_views')) + age_limit = int_or_none(try_get(item, lambda x: x['movie']['fsk'])) + release_year = int_or_none(try_get(item, lambda x: x['movie']['rel_year'])) + + info = { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': timestamp, + 'view_count': view_count, + 'age_limit': age_limit, + 'release_year': release_year, + 'formats': formats, + } + + if kind.lower() == 'serien': + series = try_get( + item, lambda x: x['special']['title'], + compat_str) or source.get('format') + season_number = int_or_none(self._search_regex( + r'^S(\d{1,2})\s*E\d{1,2}', title, 'season number', + default=None) or self._search_regex( + r'/staffel-(\d+)/', url, 'season number', default=None)) + episode = source.get('title') + episode_number = int_or_none(self._search_regex( + r'^S\d{1,2}\s*E(\d{1,2})', title, 'episode number', + default=None)) + info.update({ + 'series': series, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, + }) + + return info diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/watchindianporn.py youtube-dl-2019.05.20/youtube_dl/extractor/watchindianporn.py --- youtube-dl-2016.02.22/youtube_dl/extractor/watchindianporn.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/watchindianporn.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,68 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import parse_duration + + +class WatchIndianPornIE(InfoExtractor): + IE_DESC = 'Watch Indian Porn' + _VALID_URL = r'https?://(?:www\.)?watchindianporn\.net/(?:[^/]+/)*video/(?P<display_id>[^/]+)-(?P<id>[a-zA-Z0-9]+)\.html' + _TEST = { + 'url': 'http://www.watchindianporn.net/video/hot-milf-from-kerala-shows-off-her-gorgeous-large-breasts-on-camera-RZa2avywNPa.html', + 'md5': '249589a164dde236ec65832bfce17440', + 'info_dict': { + 'id': 'RZa2avywNPa', + 'display_id': 'hot-milf-from-kerala-shows-off-her-gorgeous-large-breasts-on-camera', + 'ext': 'mp4', + 'title': 'Hot milf from kerala shows off her gorgeous large breasts on camera', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 226, + 'view_count': int, + 'categories': list, + 'age_limit': 18, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') + + webpage = self._download_webpage(url, display_id) + + info_dict = self._parse_html5_media_entries(url, webpage, video_id)[0] + + title = self._html_search_regex(( + r'<title>(.+?)\s*-\s*Indian\s+Porn', + r'
<h4>(.+?)</h4>
    ' + ), webpage, 'title') + + duration = parse_duration(self._search_regex( + r'Time:\s*\s*(.+?)\s*', + webpage, 'duration', fatal=False)) + + view_count = int(self._search_regex( + r'(?s)Time:\s*.*?.*?\s*(\d+)\s*', + webpage, 'view count', fatal=False)) + + categories = re.findall( + r']+class=[\'"]categories[\'"][^>]*>\s*([^<]+)\s*', + webpage) + + info_dict.update({ + 'id': video_id, + 'display_id': display_id, + 'http_headers': { + 'Referer': url, + }, + 'title': title, + 'duration': duration, + 'view_count': view_count, + 'categories': categories, + 'age_limit': 18, + }) + + return info_dict diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/wat.py youtube-dl-2019.05.20/youtube_dl/extractor/wat.py --- youtube-dl-2016.02.22/youtube_dl/extractor/wat.py 2016-02-22 10:57:19.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/wat.py 2019-06-07 18:49:07.000000000 +0000 @@ -2,138 +2,156 @@ from __future__ import unicode_literals import re -import hashlib from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( ExtractorError, unified_strdate, + HEADRequest, + int_or_none, ) class WatIE(InfoExtractor): - _VALID_URL = r'(?:wat:(?P\d{8})|http://www\.wat\.tv/video/(?P.*)-(?P.*?)_.*?\.html)' + _VALID_URL = r'(?:wat:|https?://(?:www\.)?wat\.tv/video/.*-)(?P[0-9a-z]+)' IE_NAME = 'wat.tv' _TESTS = [ { 'url': 'http://www.wat.tv/video/soupe-figues-l-orange-aux-epices-6z1uz_2hvf7_.html', - 'md5': 'ce70e9223945ed26a8056d413ca55dc9', 'info_dict': { 'id': '11713067', - 'display_id': 'soupe-figues-l-orange-aux-epices', 'ext': 'mp4', 'title': 'Soupe de figues à l\'orange et aux épices', 'description': 'Retrouvez l\'émission "Petits plats en équilibre", diffusée le 18 août 2014.', 'upload_date': '20140819', 'duration': 120, }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'expected_warnings': ['HTTP Error 404'], }, { 'url': 'http://www.wat.tv/video/gregory-lemarchal-voix-ange-6z1v7_6ygkj_.html', - 'md5': 'fbc84e4378165278e743956d9c1bf16b', + 'md5': 'b16574df2c3cd1a36ca0098f2a791925', 'info_dict': { 'id': '11713075', - 'display_id': 'gregory-lemarchal-voix-ange', 'ext': 'mp4', 'title': 'Grégory Lemarchal, une voix d\'ange depuis 10 ans (1/3)', - 'description': 'md5:b7a849cf16a2b733d9cd10c52906dee3', 'upload_date': '20140816', - 'duration': 2910, }, - 'skip': "Ce contenu n'est pas disponible pour l'instant.", + 'expected_warnings': ["Ce contenu n'est pas disponible pour l'instant."], }, ] - def download_video_info(self, real_id): - # 'contentv4' is used in the website, but it also returns the related - # videos, we don't need them - info = self._download_json('http://www.wat.tv/interface/contentv3/' + real_id, real_id) - return info['media'] + _FORMATS = ( + (200, 416, 234), + (400, 480, 270), + (600, 640, 360), + (1200, 640, 360), + (1800, 960, 540), + (2500, 1280, 720), + ) def _real_extract(self, url): - def real_id_for_chapter(chapter): - return chapter['tc_start'].split('-')[0] - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('display_id') - real_id = mobj.group('real_id') - if not real_id: - short_id = mobj.group('short_id') - webpage = self._download_webpage(url, display_id or short_id) - real_id = self._search_regex(r'xtpage = ".*-(.*?)";', webpage, 'real id') + video_id = self._match_id(url) + video_id = video_id if video_id.isdigit() and len(video_id) > 6 else compat_str(int(video_id, 36)) - video_info = self.download_video_info(real_id) + # 'contentv4' is used in the website, but it also returns the related + # 
videos, we don't need them + video_data = self._download_json( + 'http://www.wat.tv/interface/contentv4s/' + video_id, video_id) + video_info = video_data['media'] error_desc = video_info.get('error_desc') if error_desc: - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, error_desc), expected=True) - - geo_list = video_info.get('geoList') - country = geo_list[0] if geo_list else '' + self.report_warning( + '%s returned error: %s' % (self.IE_NAME, error_desc)) chapters = video_info['chapters'] - first_chapter = chapters[0] - files = video_info['files'] - first_file = files[0] + if chapters: + first_chapter = chapters[0] - if real_id_for_chapter(first_chapter) != real_id: - self.to_screen('Multipart video detected') - chapter_urls = [] - for chapter in chapters: - chapter_id = real_id_for_chapter(chapter) - # Yes, when we this chapter is processed by WatIE, - # it will download the info again - chapter_info = self.download_video_info(chapter_id) - chapter_urls.append(chapter_info['url']) - entries = [self.url_result(chapter_url) for chapter_url in chapter_urls] - return self.playlist_result(entries, real_id, video_info['title']) - - upload_date = None - if 'date_diffusion' in first_chapter: - upload_date = unified_strdate(first_chapter['date_diffusion']) - # Otherwise we can continue and extract just one part, we have to use - # the short id for getting the video url - - formats = [{ - 'url': 'http://wat.tv/get/android5/%s.mp4' % real_id, - 'format_id': 'Mobile', - }] - - fmts = [('SD', 'web')] - if first_file.get('hasHD'): - fmts.append(('HD', 'webhd')) - - def compute_token(param): - timestamp = '%08x' % int(self._download_webpage( - 'http://www.wat.tv/servertime', real_id, - 'Downloading server time').split('|')[0]) - magic = '9b673b13fa4682ed14c3cfa5af5310274b514c4133e9b3a81e6e3aba009l2564' - return '%s/%s' % (hashlib.md5((magic + param + timestamp).encode('ascii')).hexdigest(), timestamp) - - for fmt in fmts: - webid = '/%s/%s' % (fmt[1], real_id) - video_url = self._download_webpage( - 'http://www.wat.tv/get%s?token=%s&getURL=1&country=%s' % (webid, compute_token(webid), country), - real_id, - 'Downloading %s video URL' % fmt[0], - 'Failed to download %s video URL' % fmt[0], - False) - if not video_url: - continue - formats.append({ - 'url': video_url, - 'ext': 'mp4', - 'format_id': fmt[0], - }) + def video_id_for_chapter(chapter): + return chapter['tc_start'].split('-')[0] + + if video_id_for_chapter(first_chapter) != video_id: + self.to_screen('Multipart video detected') + entries = [self.url_result('wat:%s' % video_id_for_chapter(chapter)) for chapter in chapters] + return self.playlist_result(entries, video_id, video_info['title']) + # Otherwise we can continue and extract just one part, we have to use + # the video id for getting the video url + else: + first_chapter = video_info + + title = first_chapter['title'] + + def extract_url(path_template, url_type): + req_url = 'http://www.wat.tv/get/%s' % (path_template % video_id) + head = self._request_webpage(HEADRequest(req_url), video_id, 'Extracting %s url' % url_type, fatal=False) + if head: + red_url = head.geturl() + if req_url != red_url: + return red_url + return None + + def remove_bitrate_limit(manifest_url): + return re.sub(r'(?:max|min)_bitrate=\d+&?', '', manifest_url) + + formats = [] + try: + alt_urls = lambda manifest_url: [re.sub(r'(?:wdv|ssm)?\.ism/', repl + '.ism/', manifest_url) for repl in ('', 'ssm')] + manifest_urls = self._download_json( + 'http://www.wat.tv/get/webhtml/' + video_id, video_id) 
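The id normalization at the top of _real_extract above accepts both forms found in wat.tv URLs: a long all-digit id is kept as is, while the short token is interpreted as base 36. The test URL's '6z1uz' maps to the expected id:

def normalize_wat_id(video_id):
    # Long numeric ids are canonical; short tokens are base-36 encodings
    if video_id.isdigit() and len(video_id) > 6:
        return video_id
    return str(int(video_id, 36))

# normalize_wat_id('6z1uz') == '11713067'
# normalize_wat_id('11713067') == '11713067'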
+ m3u8_url = manifest_urls.get('hls') + if m3u8_url: + m3u8_url = remove_bitrate_limit(m3u8_url) + for m3u8_alt_url in alt_urls(m3u8_url): + formats.extend(self._extract_m3u8_formats( + m3u8_alt_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + formats.extend(self._extract_f4m_formats( + m3u8_alt_url.replace('ios', 'web').replace('.m3u8', '.f4m'), + video_id, f4m_id='hds', fatal=False)) + mpd_url = manifest_urls.get('mpd') + if mpd_url: + mpd_url = remove_bitrate_limit(mpd_url) + for mpd_alt_url in alt_urls(mpd_url): + formats.extend(self._extract_mpd_formats( + mpd_alt_url, video_id, mpd_id='dash', fatal=False)) + self._sort_formats(formats) + except ExtractorError: + abr = 64 + for vbr, width, height in self._FORMATS: + tbr = vbr + abr + format_id = 'http-%s' % tbr + fmt_url = 'http://dnl.adv.tf1.fr/2/USP-0x0/%s/%s/%s/ssm/%s-%s-64k.mp4' % (video_id[-4:-2], video_id[-2:], video_id, video_id, vbr) + if self._is_valid_url(fmt_url, video_id, format_id): + formats.append({ + 'format_id': format_id, + 'url': fmt_url, + 'vbr': vbr, + 'abr': abr, + 'width': width, + 'height': height, + }) + + date_diffusion = first_chapter.get('date_diffusion') or video_data.get('configv4', {}).get('estatS4') + upload_date = unified_strdate(date_diffusion) if date_diffusion else None + duration = None + files = video_info['files'] + if files: + duration = int_or_none(files[0].get('duration')) return { - 'id': real_id, - 'display_id': display_id, - 'title': first_chapter['title'], - 'thumbnail': first_chapter['preview'], - 'description': first_chapter['description'], - 'view_count': video_info['views'], + 'id': video_id, + 'title': title, + 'thumbnail': first_chapter.get('preview'), + 'description': first_chapter.get('description'), + 'view_count': int_or_none(video_info.get('views')), 'upload_date': upload_date, - 'duration': first_file['duration'], + 'duration': duration, 'formats': formats, } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/wayofthemaster.py youtube-dl-2019.05.20/youtube_dl/extractor/wayofthemaster.py --- youtube-dl-2016.02.22/youtube_dl/extractor/wayofthemaster.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/wayofthemaster.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,52 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - - -class WayOfTheMasterIE(InfoExtractor): - _VALID_URL = r'https?://www\.wayofthemaster\.com/([^/?#]*/)*(?P[^/?#]+)\.s?html(?:$|[?#])' - - _TEST = { - 'url': 'http://www.wayofthemaster.com/hbks.shtml', - 'md5': '5316b57487ada8480606a93cb3d18d24', - 'info_dict': { - 'id': 'hbks', - 'ext': 'mp4', - 'title': 'Intelligent Design vs. 
Evolution', - }, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - webpage = self._download_webpage(url, video_id) - - title = self._search_regex( - r'(.*?)', webpage, 'page title') - - url_base = self._search_regex( - r'https?://www\d?\.(?:wdr\d?|funkhauseuropa)\.de/)(?P.+?)(?P%s)?\.html' % _PLAYER_REGEX + _VALID_URL = r'https?://deviceids-medp\.wdr\.de/ondemand/\d+/(?P\d+)\.js' + _GEO_COUNTRIES = ['DE'] + _TEST = { + 'url': 'http://deviceids-medp.wdr.de/ondemand/155/1557833.js', + 'info_dict': { + 'id': 'mdb-1557833', + 'ext': 'mp4', + 'title': 'Biathlon-Staffel verpasst Podest bei Olympia-Generalprobe', + 'upload_date': '20180112', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + metadata = self._download_json( + url, video_id, transform_source=strip_jsonp) + + is_live = metadata.get('mediaType') == 'live' + + tracker_data = metadata['trackerData'] + media_resource = metadata['mediaResource'] + + formats = [] + + # check if the metadata contains a direct URL to a file + for kind, media_resource in media_resource.items(): + if kind not in ('dflt', 'alt'): + continue + + for tag_name, medium_url in media_resource.items(): + if tag_name not in ('videoURL', 'audioURL'): + continue + + ext = determine_ext(medium_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + medium_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls')) + elif ext == 'f4m': + manifest_url = update_url_query( + medium_url, {'hdcore': '3.2.0', 'plugin': 'aasp-3.2.0.77.18'}) + formats.extend(self._extract_f4m_formats( + manifest_url, video_id, f4m_id='hds', fatal=False)) + elif ext == 'smil': + formats.extend(self._extract_smil_formats( + medium_url, 'stream', fatal=False)) + else: + a_format = { + 'url': medium_url + } + if ext == 'unknown_video': + urlh = self._request_webpage( + medium_url, video_id, note='Determining extension') + ext = urlhandle_detect_ext(urlh) + a_format['ext'] = ext + formats.append(a_format) + + self._sort_formats(formats) + + subtitles = {} + caption_url = media_resource.get('captionURL') + if caption_url: + subtitles['de'] = [{ + 'url': caption_url, + 'ext': 'ttml', + }] + + title = tracker_data['trackerClipTitle'] + + return { + 'id': tracker_data.get('trackerClipId', video_id), + 'title': self._live_title(title) if is_live else title, + 'alt_title': tracker_data.get('trackerClipSubcategory'), + 'formats': formats, + 'subtitles': subtitles, + 'upload_date': unified_strdate(tracker_data.get('trackerClipAirTime')), + 'is_live': is_live, + } + + +class WDRPageIE(InfoExtractor): + _CURRENT_MAUS_URL = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/){1,2}[^/?#]+\.php5' + _PAGE_REGEX = r'/(?:mediathek/)?(?:[^/]+/)*(?P[^/]+)\.html' + _VALID_URL = r'https?://(?:www\d?\.)?(?:wdr\d?|sportschau)\.de' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL _TESTS = [ { - 'url': 'http://www1.wdr.de/mediathek/video/sendungen/servicezeit/videoservicezeit560-videoplayer_size-L.html', + 'url': 'http://www1.wdr.de/mediathek/video/sendungen/doku-am-freitag/video-geheimnis-aachener-dom-100.html', + # HDS download, MD5 is unstable 'info_dict': { - 'id': 'mdb-362427', + 'id': 'mdb-1058683', 'ext': 'flv', - 'title': 'Servicezeit', - 'description': 'md5:c8f43e5e815eeb54d0b96df2fba906cb', - 'upload_date': '20140310', - 'is_live': False + 'display_id': 'doku-am-freitag/video-geheimnis-aachener-dom-100', + 'title': 'Geheimnis Aachener Dom', + 'alt_title': 'Doku am Freitag', + 'upload_date': '20160304', + 'description': 
'md5:87be8ff14d8dfd7a7ee46f0299b52318', + 'is_live': False, + 'subtitles': {'de': [{ + 'url': 'http://ondemand-ww.wdr.de/medp/fsk0/105/1058683/1058683_12220974.xml', + 'ext': 'ttml', + }]}, }, - 'params': { - 'skip_download': True, + 'skip': 'HTTP Error 404: Not Found', + }, + { + 'url': 'http://www1.wdr.de/mediathek/audio/wdr3/wdr3-gespraech-am-samstag/audio-schriftstellerin-juli-zeh-100.html', + 'md5': 'f4c1f96d01cf285240f53ea4309663d8', + 'info_dict': { + 'id': 'mdb-1072000', + 'ext': 'mp3', + 'display_id': 'wdr3-gespraech-am-samstag/audio-schriftstellerin-juli-zeh-100', + 'title': 'Schriftstellerin Juli Zeh', + 'alt_title': 'WDR 3 Gespräch am Samstag', + 'upload_date': '20160312', + 'description': 'md5:e127d320bc2b1f149be697ce044a3dd7', + 'is_live': False, + 'subtitles': {} }, - 'skip': 'Page Not Found', + 'skip': 'HTTP Error 404: Not Found', }, { - 'url': 'http://www1.wdr.de/themen/av/videomargaspiegelisttot101-videoplayer.html', + 'url': 'http://www1.wdr.de/mediathek/video/live/index.html', 'info_dict': { - 'id': 'mdb-363194', - 'ext': 'flv', - 'title': 'Marga Spiegel ist tot', - 'description': 'md5:2309992a6716c347891c045be50992e4', - 'upload_date': '20140311', - 'is_live': False + 'id': 'mdb-1406149', + 'ext': 'mp4', + 'title': r're:^WDR Fernsehen im Livestream \(nur in Deutschland erreichbar\) [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'alt_title': 'WDR Fernsehen Live', + 'upload_date': '20150101', + 'is_live': True, }, 'params': { - 'skip_download': True, + 'skip_download': True, # m3u8 download }, - 'skip': 'Page Not Found', }, { - 'url': 'http://www1.wdr.de/themen/kultur/audioerlebtegeschichtenmargaspiegel100-audioplayer.html', - 'md5': '83e9e8fefad36f357278759870805898', + 'url': 'http://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html', + 'playlist_mincount': 7, 'info_dict': { - 'id': 'mdb-194332', - 'ext': 'mp3', - 'title': 'Erlebte Geschichten: Marga Spiegel (29.11.2009)', - 'description': 'md5:2309992a6716c347891c045be50992e4', - 'upload_date': '20091129', - 'is_live': False + 'id': 'aktuelle-stunde-120', }, }, { - 'url': 'http://www.funkhauseuropa.de/av/audioflaviacoelhoamaramar100-audioplayer.html', - 'md5': '99a1443ff29af19f6c52cf6f4dc1f4aa', + 'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5', 'info_dict': { - 'id': 'mdb-478135', - 'ext': 'mp3', - 'title': 'Flavia Coelho: Amar é Amar', - 'description': 'md5:7b29e97e10dfb6e265238b32fa35b23a', - 'upload_date': '20140717', - 'is_live': False + 'id': 'mdb-1552552', + 'ext': 'mp4', + 'upload_date': 're:^[0-9]{8}$', + 'title': 're:^Die Sendung mit der Maus vom [0-9.]{10}$', }, - 'skip': 'Page Not Found', + 'skip': 'The id changes from week to week because of the new episode' }, { - 'url': 'http://www1.wdr.de/mediathek/video/sendungen/quarks_und_co/filterseite-quarks-und-co100.html', - 'playlist_mincount': 146, + 'url': 'http://www.wdrmaus.de/filme/sachgeschichten/achterbahn.php5', + 'md5': '803138901f6368ee497b4d195bb164f2', 'info_dict': { - 'id': 'mediathek/video/sendungen/quarks_und_co/filterseite-quarks-und-co100', + 'id': 'mdb-186083', + 'ext': 'mp4', + 'upload_date': '20130919', + 'title': 'Sachgeschichte - Achterbahn ', + }, + }, + { + 'url': 'http://www1.wdr.de/radio/player/radioplayer116~_layout-popupVersion.html', + # Live stream, MD5 unstable + 'info_dict': { + 'id': 'mdb-869971', + 'ext': 'mp4', + 'title': r're:^COSMO Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'upload_date': '20160101', + }, + 'params': { + 'skip_download': True, # m3u8 download } }, 
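The deviceids-medp.wdr.de documents consumed by WDRIE above are JSONP, hence transform_source=strip_jsonp before parsing. A simplified stand-in for that unwrapping (the callback name in the sample is illustrative):

import json
import re

def strip_jsonp_simple(code):
    # Drop the 'callback( ... );' wrapper around the JSON payload
    return re.sub(r'(?s)^[^(]+\((.*)\)\s*;?\s*$', r'\1', code)

# json.loads(strip_jsonp_simple('$mediaObject.jsonpHelper.storeAndPlay({"mediaType": "live"});'))
# -> {'mediaType': 'live'}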
{ - 'url': 'http://www1.wdr.de/mediathek/video/livestream/index.html', + 'url': 'http://www.sportschau.de/handballem2018/handball-nationalmannschaft-em-stolperstein-vorrunde-100.html', 'info_dict': { - 'id': 'mdb-103364', - 'title': 're:^WDR Fernsehen Live [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'description': 'md5:ae2ff888510623bf8d4b115f95a9b7c9', - 'ext': 'flv', - 'upload_date': '20150101', - 'is_live': True + 'id': 'mdb-1556012', + 'ext': 'mp4', + 'title': 'DHB-Vizepräsident Bob Hanning - "Die Weltspitze ist extrem breit"', + 'upload_date': '20180111', }, 'params': { 'skip_download': True, }, + }, + { + 'url': 'http://www.sportschau.de/handballem2018/audio-vorschau---die-handball-em-startet-mit-grossem-favoritenfeld-100.html', + 'only_matching': True, } ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - page_url = mobj.group('url') - page_id = mobj.group('id') + display_id = mobj.group('display_id') + webpage = self._download_webpage(url, display_id) - webpage = self._download_webpage(url, page_id) + entries = [] - if mobj.group('player') is None: - entries = [ - self.url_result(page_url + href, 'WDR') - for href in re.findall( - r'\s*]*>\s*\s*]+| + (["\'])videoLink\b.*?\2[\s]*>\n[^\n]* + )data-extension=(["\'])(?P(?:(?!\3).)+)\3 + ''', webpage): + media_link_obj = self._parse_json( + mobj.group('data'), display_id, transform_source=js_to_json, + fatal=False) + if not media_link_obj: + continue + jsonp_url = try_get( + media_link_obj, lambda x: x['mediaObj']['url'], compat_str) + if jsonp_url: + entries.append(self.url_result(jsonp_url, ie=WDRIE.ie_key())) - if upload_date: - upload_date = unified_strdate(upload_date) + # Playlist (e.g. https://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html) + if not entries: + entries = [ + self.url_result( + compat_urlparse.urljoin(url, mobj.group('href')), + ie=WDRPageIE.ie_key()) + for mobj in re.finditer( + r']+\bhref=(["\'])(?P(?:(?!\1).)+)\1[^>]+\bdata-extension=', + webpage) if re.match(self._PAGE_REGEX, mobj.group('href')) + ] - formats = [] - preference = qualities(['S', 'M', 'L', 'XL']) + return self.playlist_result(entries, playlist_id=display_id) - if video_url.endswith('.f4m'): - formats.extend(self._extract_f4m_formats( - video_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18', page_id, - f4m_id='hds', fatal=False)) - elif video_url.endswith('.smil'): - formats.extend(self._extract_smil_formats( - video_url, page_id, False, { - 'hdcore': '3.3.0', - 'plugin': 'aasp-3.3.0.99.43', - })) - else: - formats.append({ - 'url': video_url, - 'http_headers': { - 'User-Agent': 'mobile', - }, - }) - - m3u8_url = self._search_regex( - r'rel="adaptiv"[^>]+href="([^"]+)"', - webpage, 'm3u8 url', default=None) - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, page_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - - direct_urls = re.findall( - r'rel="web(S|M|L|XL)"[^>]+href="([^"]+)"', webpage) - if direct_urls: - for quality, video_url in direct_urls: - formats.append({ - 'url': video_url, - 'preference': preference(quality), - 'http_headers': { - 'User-Agent': 'mobile', - }, - }) - self._sort_formats(formats) +class WDRElefantIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)wdrmaus\.de/elefantenseite/#(?P.+)' + _TEST = { + 'url': 'http://www.wdrmaus.de/elefantenseite/#folge_ostern_2015', + 'info_dict': { + 'title': 'Folge Oster-Spezial 2015', + 'id': 'mdb-1088195', + 'ext': 'mp4', + 'age_limit': None, + 'upload_date': '20150406' + }, + 'params': { + 
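WDRPageIE above now drives everything through data-extension attributes: each one is relaxed from JavaScript object syntax via js_to_json, and its mediaObj.url (a deviceids-medp.wdr.de JSONP document) is handed to WDRIE. A minimal sketch of that harvesting, using plain json.loads on an already strict sample:

import json
import re

def extract_media_obj_urls(webpage):
    # Collect data-extension attributes and follow mediaObj.url
    urls = []
    for m in re.finditer(r'data-extension=(["\'])(?P<data>(?:(?!\1).)+)\1', webpage):
        try:
            data = json.loads(m.group('data'))
        except ValueError:
            continue
        url = (data.get('mediaObj') or {}).get('url')
        if url:
            urls.append(url)
    return urls

# extract_media_obj_urls("<a data-extension='{\"mediaObj\": {\"url\": \"https://deviceids-medp.wdr.de/ondemand/155/1557833.js\"}}'></a>")
# -> ['https://deviceids-medp.wdr.de/ondemand/155/1557833.js']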
'skip_download': True, + }, + } - description = self._html_search_meta('Description', webpage, 'description') + def _real_extract(self, url): + display_id = self._match_id(url) - return { - 'id': page_id, - 'formats': formats, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'upload_date': upload_date, - 'is_live': is_live - } + # Table of Contents seems to always be at this address, so fetch it directly. + # The website fetches configurationJS.php5, which links to tableOfContentsJS.php5. + table_of_contents = self._download_json( + 'https://www.wdrmaus.de/elefantenseite/data/tableOfContentsJS.php5', + display_id) + if display_id not in table_of_contents: + raise ExtractorError( + 'No entry in site\'s table of contents for this URL. ' + 'Is the fragment part of the URL (after the #) correct?', + expected=True) + xml_metadata_path = table_of_contents[display_id]['xmlPath'] + xml_metadata = self._download_xml( + 'https://www.wdrmaus.de/elefantenseite/' + xml_metadata_path, + display_id) + zmdb_url_element = xml_metadata.find('./movie/zmdb_url') + if zmdb_url_element is None: + raise ExtractorError( + '%s is not a video' % display_id, expected=True) + return self.url_result(zmdb_url_element.text, ie=WDRIE.ie_key()) class WDRMobileIE(InfoExtractor): @@ -241,81 +328,3 @@ 'User-Agent': 'mobile', }, } - - -class WDRMausIE(InfoExtractor): - _VALID_URL = 'http://(?:www\.)?wdrmaus\.de/(?:[^/]+/){,2}(?P[^/?#]+)(?:/index\.php5|(?
<!index)\.php5|/(?:$|[?#]))' - IE_DESC = 'Sendung mit der Maus' - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - param_code = self._html_search_regex( - r'<a href="\?startVideo=1&amp;([^"]+)"', webpage, 'parameters') - - title_date = self._search_regex( - r'<div class="sendedatum"><p>Sendedatum:\s*([0-9\.]+)</p>', - webpage, 'air date') - title_str = self._html_search_regex( - r'<h1>(.*?)</h1>', webpage, 'title') - title = '%s - %s' % (title_date, title_str) - upload_date = unified_strdate( - self._html_search_meta('dc.date', webpage)) - - fields = compat_parse_qs(param_code) - video_url = fields['firstVideo'][0] - thumbnail = compat_urlparse.urljoin(url, fields['startPicture'][0]) - - formats = [{ - 'format_id': 'rtmp', - 'url': video_url, - }] - - jscode = self._download_webpage( - 'http://www.wdrmaus.de/codebase/js/extended-medien.min.js', - video_id, fatal=False, - note='Downloading URL translation table', - errnote='Could not download URL translation table') - if jscode: - for m in re.finditer( - r"stream:\s*'dslSrc=(?P<stream>[^']+)',\s*download:\s*'(?P<dl>
    [^']+)'\s*\}", - jscode): - if video_url.startswith(m.group('stream')): - http_url = video_url.replace( - m.group('stream'), m.group('dl')) - formats.append({ - 'format_id': 'http', - 'url': http_url, - }) - break - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'thumbnail': thumbnail, - 'upload_date': upload_date, - } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/webcaster.py youtube-dl-2019.05.20/youtube_dl/extractor/webcaster.py --- youtube-dl-2016.02.22/youtube_dl/extractor/webcaster.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/webcaster.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,102 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + xpath_text, +) + + +class WebcasterIE(InfoExtractor): + _VALID_URL = r'https?://bl\.webcaster\.pro/(?:quote|media)/start/free_(?P[^/]+)' + _TESTS = [{ + # http://video.khl.ru/quotes/393859 + 'url': 'http://bl.webcaster.pro/quote/start/free_c8cefd240aa593681c8d068cff59f407_hd/q393859/eb173f99dd5f558674dae55f4ba6806d/1480289104?sr%3D105%26fa%3D1%26type_id%3D18', + 'md5': '0c162f67443f30916ff1c89425dcd4cd', + 'info_dict': { + 'id': 'c8cefd240aa593681c8d068cff59f407_hd', + 'ext': 'mp4', + 'title': 'Сибирь - Нефтехимик. Лучшие моменты первого периода', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + }, { + 'url': 'http://bl.webcaster.pro/media/start/free_6246c7a4453ac4c42b4398f840d13100_hd/2_2991109016/e8d0d82587ef435480118f9f9c41db41/4635726126', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + video = self._download_xml(url, video_id) + + title = xpath_text(video, './/event_name', 'event name', fatal=True) + + def make_id(parts, separator): + return separator.join(filter(None, parts)) + + formats = [] + for format_id in (None, 'noise'): + track_tag = make_id(('track', format_id), '_') + for track in video.findall('.//iphone/%s' % track_tag): + track_url = track.text + if not track_url: + continue + if determine_ext(track_url) == 'm3u8': + m3u8_formats = self._extract_m3u8_formats( + track_url, video_id, 'mp4', + entry_protocol='m3u8_native', + m3u8_id=make_id(('hls', format_id), '-'), fatal=False) + for f in m3u8_formats: + f.update({ + 'source_preference': 0 if format_id == 'noise' else 1, + 'format_note': track.get('title'), + }) + formats.extend(m3u8_formats) + self._sort_formats(formats) + + thumbnail = xpath_text(video, './/image', 'thumbnail') + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'formats': formats, + } + + +class WebcasterFeedIE(InfoExtractor): + _VALID_URL = r'https?://bl\.webcaster\.pro/feed/start/free_(?P[^/]+)' + _TEST = { + 'url': 'http://bl.webcaster.pro/feed/start/free_c8cefd240aa593681c8d068cff59f407_hd/q393859/eb173f99dd5f558674dae55f4ba6806d/1480289104', + 'only_matching': True, + } + + @staticmethod + def _extract_url(ie, webpage): + mobj = re.search( + r'<(?:object|a[^>]+class=["\']webcaster-player["\'])[^>]+data(?:-config)?=(["\']).*?config=(?Phttps?://bl\.webcaster\.pro/feed/start/free_.*?)(?:[?&]|\1)', + webpage) + if mobj: + return mobj.group('url') + for secure in (True, False): + video_url = ie._og_search_video_url( + webpage, secure=secure, default=None) + if video_url: + mobj = re.search( + r'config=(?Phttps?://bl\.webcaster\.pro/feed/start/free_[^?&=]+)', + video_url) + if mobj: + return mobj.group('url') + + def 
_real_extract(self, url): + video_id = self._match_id(url) + + feed = self._download_xml(url, video_id) + + video_url = xpath_text( + feed, ('video_hd', 'video'), 'video url', fatal=True) + + return self.url_result(video_url, WebcasterIE.ie_key()) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/webofstories.py youtube-dl-2019.05.20/youtube_dl/extractor/webofstories.py --- youtube-dl-2016.02.22/youtube_dl/extractor/webofstories.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/webofstories.py 2019-06-07 18:49:07.000000000 +0000 @@ -4,7 +4,10 @@ import re from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( + int_or_none, + orderedSet, +) class WebOfStoriesIE(InfoExtractor): @@ -12,38 +15,52 @@ _VIDEO_DOMAIN = 'http://eu-mobile.webofstories.com/' _GREAT_LIFE_STREAMER = 'rtmp://eu-cdn1.webofstories.com/cfx/st/' _USER_STREAMER = 'rtmp://eu-users.webofstories.com/cfx/st/' - _TESTS = [ - { - 'url': 'http://www.webofstories.com/play/hans.bethe/71', - 'md5': '373e4dd915f60cfe3116322642ddf364', - 'info_dict': { - 'id': '4536', - 'ext': 'mp4', - 'title': 'The temperature of the sun', - 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 'Hans Bethe talks about calculating the temperature of the sun', - 'duration': 238, - } + _TESTS = [{ + 'url': 'http://www.webofstories.com/play/hans.bethe/71', + 'md5': '373e4dd915f60cfe3116322642ddf364', + 'info_dict': { + 'id': '4536', + 'ext': 'mp4', + 'title': 'The temperature of the sun', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'Hans Bethe talks about calculating the temperature of the sun', + 'duration': 238, + } + }, { + 'url': 'http://www.webofstories.com/play/55908', + 'md5': '2985a698e1fe3211022422c4b5ed962c', + 'info_dict': { + 'id': '55908', + 'ext': 'mp4', + 'title': 'The story of Gemmata obscuriglobus', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'Planctomycete talks about The story of Gemmata obscuriglobus', + 'duration': 169, + }, + 'skip': 'notfound', + }, { + # malformed og:title meta + 'url': 'http://www.webofstories.com/play/54215?o=MS', + 'info_dict': { + 'id': '54215', + 'ext': 'mp4', + 'title': '"A Leg to Stand On"', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'Oliver Sacks talks about the death and resurrection of a limb', + 'duration': 97, }, - { - 'url': 'http://www.webofstories.com/play/55908', - 'md5': '2985a698e1fe3211022422c4b5ed962c', - 'info_dict': { - 'id': '55908', - 'ext': 'mp4', - 'title': 'The story of Gemmata obscuriglobus', - 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 'Planctomycete talks about The story of Gemmata obscuriglobus', - 'duration': 169, - } + 'params': { + 'skip_download': True, }, - ] + }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._og_search_title(webpage) + # Sometimes og:title meta is malformed + title = self._og_search_title(webpage, default=None) or self._html_search_regex( + r'(?s)Title:\s*(.+?)<', webpage, 'title') description = self._html_search_meta('description', webpage) thumbnail = self._og_search_thumbnail(webpage) @@ -119,8 +136,10 @@ webpage = self._download_webpage(url, playlist_id) entries = [ - self.url_result('http://www.webofstories.com/play/%s' % video_number, 'WebOfStories') - for video_number in set(re.findall('href="/playAll/%s\?sId=(\d+)"' % playlist_id, webpage)) + self.url_result( + 'http://www.webofstories.com/play/%s' % video_id, + 'WebOfStories', video_id=video_id) + 
for video_id in orderedSet(re.findall(r'\bid=["\']td_(\d+)', webpage)) ] title = self._search_regex( diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/weibo.py youtube-dl-2019.05.20/youtube_dl/extractor/weibo.py --- youtube-dl-2016.02.22/youtube_dl/extractor/weibo.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/weibo.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,49 +1,140 @@ # coding: utf-8 from __future__ import unicode_literals +from .common import InfoExtractor + +import json +import random import re -from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_str, +) +from ..utils import ( + js_to_json, + strip_jsonp, + urlencode_postdata, +) class WeiboIE(InfoExtractor): - """ - The videos in Weibo come from different sites, this IE just finds the link - to the external video and returns it. - """ - _VALID_URL = r'https?://video\.weibo\.com/v/weishipin/t_(?P.+?)\.htm' - + _VALID_URL = r'https?://(?:www\.)?weibo\.com/[0-9]+/(?P[a-zA-Z0-9]+)' _TEST = { - 'url': 'http://video.weibo.com/v/weishipin/t_zjUw2kZ.htm', + 'url': 'https://weibo.com/6275294458/Fp6RGfbff?type=comment', 'info_dict': { - 'id': '98322879', - 'ext': 'flv', - 'title': '魔声耳机最新广告“All Eyes On Us”', - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': ['Sina'], + 'id': 'Fp6RGfbff', + 'ext': 'mp4', + 'title': 'You should have servants to massage you,... 来自Hosico_猫 - 微博', + } } - # Additional example videos from different sites - # Youku: http://video.weibo.com/v/weishipin/t_zQGDWQ8.htm - # 56.com: http://video.weibo.com/v/weishipin/t_zQ44HxN.htm + def _real_extract(self, url): + video_id = self._match_id(url) + # to get Referer url for genvisitor + webpage, urlh = self._download_webpage_handle(url, video_id) + + visitor_url = urlh.geturl() + + if 'passport.weibo.com' in visitor_url: + # first visit + visitor_data = self._download_json( + 'https://passport.weibo.com/visitor/genvisitor', video_id, + note='Generating first-visit data', + transform_source=strip_jsonp, + headers={'Referer': visitor_url}, + data=urlencode_postdata({ + 'cb': 'gen_callback', + 'fp': json.dumps({ + 'os': '2', + 'browser': 'Gecko57,0,0,0', + 'fonts': 'undefined', + 'screenInfo': '1440*900*24', + 'plugins': '', + }), + })) + + tid = visitor_data['data']['tid'] + cnfd = '%03d' % visitor_data['data']['confidence'] + + self._download_webpage( + 'https://passport.weibo.com/visitor/visitor', video_id, + note='Running first-visit callback', + query={ + 'a': 'incarnate', + 't': tid, + 'w': 2, + 'c': cnfd, + 'cb': 'cross_domain', + 'from': 'weibo', + '_rand': random.random(), + }) + + webpage = self._download_webpage( + url, video_id, note='Revisiting webpage') + + title = self._html_search_regex( + r'(.+?)', webpage, 'title') + + video_formats = compat_parse_qs(self._search_regex( + r'video-sources=\\\"(.+?)\"', webpage, 'video_sources')) + + formats = [] + supported_resolutions = (480, 720) + for res in supported_resolutions: + vid_urls = video_formats.get(compat_str(res)) + if not vid_urls or not isinstance(vid_urls, list): + continue + + vid_url = vid_urls[0] + formats.append({ + 'url': vid_url, + 'height': res, + }) + + self._sort_formats(formats) + + uploader = self._og_search_property( + 'nick-name', webpage, 'uploader', default=None) + + return { + 'id': video_id, + 'title': title, + 'uploader': uploader, + 'formats': formats + } + + +class WeiboMobileIE(InfoExtractor): + _VALID_URL = r'https?://m\.weibo\.cn/status/(?P[0-9]+)(\?.+)?' 
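The first-visit handling in WeiboIE._real_extract above is a two-request handshake against passport.weibo.com: genvisitor registers a synthetic browser fingerprint and returns a tid/confidence pair wrapped in JSONP, and a follow-up "incarnate" request turns that pair into the visitor cookies that let the video page render. A minimal standalone sketch of the same exchange, assuming the JSONP wrapper and field names shown in the diff, and using the third-party requests library (not something youtube-dl itself depends on):

import json
import random

import requests  # assumption: third-party HTTP client, not a youtube-dl dependency


def fetch_weibo_visitor_cookies(referer):
    # Sketch of the passport.weibo.com handshake performed by WeiboIE above.
    session = requests.Session()
    # Step 1: register a synthetic fingerprint; the JSONP response carries
    # a tid/confidence pair under its 'data' key.
    text = session.post(
        'https://passport.weibo.com/visitor/genvisitor',
        headers={'Referer': referer},
        data={
            'cb': 'gen_callback',
            'fp': json.dumps({
                'os': '2',
                'browser': 'Gecko57,0,0,0',
                'fonts': 'undefined',
                'screenInfo': '1440*900*24',
                'plugins': '',
            }),
        }).text
    # Strip the gen_callback(...) JSONP wrapper (youtube-dl uses strip_jsonp).
    visitor = json.loads(text[text.index('(') + 1:text.rindex(')')])
    # Step 2: "incarnate" the visitor; the server answers with the session
    # cookies that allow the video page to render on a first visit.
    session.get(
        'https://passport.weibo.com/visitor/visitor',
        params={
            'a': 'incarnate',
            't': visitor['data']['tid'],
            'w': 2,
            'c': '%03d' % visitor['data']['confidence'],
            'cb': 'cross_domain',
            'from': 'weibo',
            '_rand': random.random(),
        })
    return session.cookies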
+ _TEST = { + 'url': 'https://m.weibo.cn/status/4189191225395228?wm=3333_2001&sourcetype=weixin&featurecode=newtitle&from=singlemessage&isappinstalled=0', + 'info_dict': { + 'id': '4189191225395228', + 'ext': 'mp4', + 'title': '午睡当然是要甜甜蜜蜜的啦', + 'uploader': '柴犬柴犬' + } + } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE) - video_id = mobj.group('id') - info_url = 'http://video.weibo.com/?s=v&a=play_list&format=json&mix_video_id=t_%s' % video_id - info = self._download_json(info_url, video_id) - - videos_urls = map(lambda v: v['play_page_url'], info['result']['data']) - # Prefer sina video since they have thumbnails - videos_urls = sorted(videos_urls, key=lambda u: 'video.sina.com' in u) - player_url = videos_urls[-1] - m_sina = re.match(r'https?://video\.sina\.com\.cn/v/b/(\d+)-\d+\.html', - player_url) - if m_sina is not None: - self.to_screen('Sina video detected') - sina_id = m_sina.group(1) - player_url = 'http://you.video.sina.com.cn/swf/quotePlayer.swf?vid=%s' % sina_id - return self.url_result(player_url) + video_id = self._match_id(url) + # to get Referer url for genvisitor + webpage = self._download_webpage(url, video_id, note='visit the page') + + weibo_info = self._parse_json(self._search_regex( + r'var\s+\$render_data\s*=\s*\[({.*})\]\[0\]\s*\|\|\s*{};', + webpage, 'js_code', flags=re.DOTALL), + video_id, transform_source=js_to_json) + + status_data = weibo_info.get('status', {}) + page_info = status_data.get('page_info') + title = status_data['status_title'] + uploader = status_data.get('user', {}).get('screen_name') + + return { + 'id': video_id, + 'title': title, + 'uploader': uploader, + 'url': page_info['media_info']['stream_url'] + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/weiqitv.py youtube-dl-2019.05.20/youtube_dl/extractor/weiqitv.py --- youtube-dl-2016.02.22/youtube_dl/extractor/weiqitv.py 2016-01-23 10:08:46.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/weiqitv.py 2019-06-07 18:49:07.000000000 +0000 @@ -6,7 +6,7 @@ class WeiqiTVIE(InfoExtractor): IE_DESC = 'WQTV' - _VALID_URL = r'http://www\.weiqitv\.com/index/video_play\?videoId=(?P[A-Za-z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?weiqitv\.com/index/video_play\?videoId=(?P[A-Za-z0-9]+)' _TESTS = [{ 'url': 'http://www.weiqitv.com/index/video_play?videoId=53c744f09874f0e76a8b46f3', @@ -37,11 +37,11 @@ page = self._download_webpage(url, media_id) info_json_str = self._search_regex( - 'var\s+video\s*=\s*(.+});', page, 'info json str') + r'var\s+video\s*=\s*(.+});', page, 'info json str') info_json = self._parse_json(info_json_str, media_id) letvcloud_url = self._search_regex( - 'var\s+letvurl\s*=\s*"([^"]+)', page, 'letvcloud url') + r'var\s+letvurl\s*=\s*"([^"]+)', page, 'letvcloud url') return { '_type': 'url_transparent', diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/wimp.py youtube-dl-2019.05.20/youtube_dl/extractor/wimp.py --- youtube-dl-2016.02.22/youtube_dl/extractor/wimp.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/wimp.py 2019-06-07 18:49:07.000000000 +0000 @@ -5,25 +5,29 @@ class WimpIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?wimp\.com/(?P[^/]+)' + _VALID_URL = r'https?://(?:www\.)?wimp\.com/(?P[^/]+)' _TESTS = [{ - 'url': 'http://www.wimp.com/maruexhausted/', + 'url': 'http://www.wimp.com/maru-is-exhausted/', 'md5': 'ee21217ffd66d058e8b16be340b74883', 'info_dict': { - 'id': 'maruexhausted', + 'id': 'maru-is-exhausted', 'ext': 'mp4', 'title': 'Maru is exhausted.', 'description': 
'md5:57e099e857c0a4ea312542b684a869b8', } }, { 'url': 'http://www.wimp.com/clowncar/', - 'md5': '4e2986c793694b55b37cf92521d12bb4', + 'md5': '5c31ad862a90dc5b1f023956faec13fe', 'info_dict': { - 'id': 'clowncar', - 'ext': 'mp4', - 'title': 'It\'s like a clown car.', - 'description': 'md5:0e56db1370a6e49c5c1d19124c0d2fb2', + 'id': 'cG4CEr2aiSg', + 'ext': 'webm', + 'title': 'Basset hound clown car...incredible!', + 'description': '5 of my Bassets crawled in this dog loo! www.bellinghambassets.com\n\nFor licensing/usage please contact: licensing(at)jukinmediadotcom', + 'upload_date': '20140303', + 'uploader': 'Gretchen Hoey', + 'uploader_id': 'gretchenandjeff1', }, + 'add_ie': ['Youtube'], }] def _real_extract(self, url): @@ -32,23 +36,19 @@ webpage = self._download_webpage(url, video_id) youtube_id = self._search_regex( - r"videoId\s*:\s*[\"']([0-9A-Za-z_-]{11})[\"']", + (r"videoId\s*:\s*[\"']([0-9A-Za-z_-]{11})[\"']", + r'data-id=["\']([0-9A-Za-z_-]{11})'), webpage, 'video URL', default=None) if youtube_id: - return { - '_type': 'url', - 'url': youtube_id, - 'ie_key': YoutubeIE.ie_key(), - } - - video_url = self._search_regex( - r']+>\s*]+src=(["\'])(?P.+?)\1', - webpage, 'video URL', group='url') + return self.url_result(youtube_id, YoutubeIE.ie_key()) + + info_dict = self._extract_jwplayer_data( + webpage, video_id, require_title=False) - return { + info_dict.update({ 'id': video_id, - 'url': video_url, 'title': self._og_search_title(webpage), - 'thumbnail': self._og_search_thumbnail(webpage), 'description': self._og_search_description(webpage), - } + }) + + return info_dict diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/wistia.py youtube-dl-2019.05.20/youtube_dl/extractor/wistia.py --- youtube-dl-2016.02.22/youtube_dl/extractor/wistia.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/wistia.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,65 +1,126 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( ExtractorError, - sanitized_Request, + int_or_none, + float_or_none, + unescapeHTML, ) class WistiaIE(InfoExtractor): - _VALID_URL = r'https?://(?:fast\.)?wistia\.net/embed/iframe/(?P[a-z0-9]+)' - _API_URL = 'http://fast.wistia.com/embed/medias/{0:}.json' + _VALID_URL = r'(?:wistia:|https?://(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/)(?P[a-z0-9]+)' + _API_URL = 'http://fast.wistia.com/embed/medias/%s.json' + _IFRAME_URL = 'http://fast.wistia.net/embed/iframe/%s' - _TEST = { + _TESTS = [{ 'url': 'http://fast.wistia.net/embed/iframe/sh7fpupwlt', 'md5': 'cafeb56ec0c53c18c97405eecb3133df', 'info_dict': { 'id': 'sh7fpupwlt', 'ext': 'mov', 'title': 'Being Resourceful', + 'description': 'a Clients From Hell Video Series video from worldwidewebhosting', + 'upload_date': '20131204', + 'timestamp': 1386185018, 'duration': 117, }, - } + }, { + 'url': 'wistia:sh7fpupwlt', + 'only_matching': True, + }, { + # with hls video + 'url': 'wistia:807fafadvk', + 'only_matching': True, + }, { + 'url': 'http://fast.wistia.com/embed/iframe/sh7fpupwlt', + 'only_matching': True, + }, { + 'url': 'http://fast.wistia.net/embed/medias/sh7fpupwlt.json', + 'only_matching': True, + }] + + @staticmethod + def _extract_url(webpage): + match = re.search( + r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/iframe/.+?)\1', webpage) + if match: + return unescapeHTML(match.group('url')) + + match = 
re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P[^"\']+)', webpage) + if match: + return 'wistia:%s' % match.group('id') + + match = re.search( + r'''(?sx) + ]+src=(["'])(?:https?:)?//fast\.wistia\.com/assets/external/E-v1\.js\1[^>]*>.*? + ]+class=(["']).*?\bwistia_async_(?P[a-z0-9]+)\b.*?\2 + ''', webpage) + if match: + return 'wistia:%s' % match.group('id') def _real_extract(self, url): video_id = self._match_id(url) - request = sanitized_Request(self._API_URL.format(video_id)) - request.add_header('Referer', url) # Some videos require this. - data_json = self._download_json(request, video_id) + data_json = self._download_json( + self._API_URL % video_id, video_id, + # Some videos require this. + headers={ + 'Referer': url if url.startswith('http') else self._IFRAME_URL % video_id, + }) + if data_json.get('error'): - raise ExtractorError('Error while getting the playlist', - expected=True) + raise ExtractorError( + 'Error while getting the playlist', expected=True) + data = data_json['media'] + title = data['name'] formats = [] thumbnails = [] - for atype, a in data['assets'].items(): - if atype == 'still': - thumbnails.append({ - 'url': a['url'], - 'resolution': '%dx%d' % (a['width'], a['height']), - }) + for a in data['assets']: + aurl = a.get('url') + if not aurl: continue - if atype == 'preview': + astatus = a.get('status') + atype = a.get('type') + if (astatus is not None and astatus != 2) or atype in ('preview', 'storyboard'): continue - formats.append({ - 'format_id': atype, - 'url': a['url'], - 'width': a['width'], - 'height': a['height'], - 'filesize': a['size'], - 'ext': a['ext'], - 'preference': 1 if atype == 'original' else None, - }) + elif atype in ('still', 'still_image'): + thumbnails.append({ + 'url': aurl, + 'width': int_or_none(a.get('width')), + 'height': int_or_none(a.get('height')), + }) + else: + aext = a.get('ext') + is_m3u8 = a.get('container') == 'm3u8' or aext == 'm3u8' + formats.append({ + 'format_id': atype, + 'url': aurl, + 'tbr': int_or_none(a.get('bitrate')), + 'vbr': int_or_none(a.get('opt_vbitrate')), + 'width': int_or_none(a.get('width')), + 'height': int_or_none(a.get('height')), + 'filesize': int_or_none(a.get('size')), + 'vcodec': a.get('codec'), + 'container': a.get('container'), + 'ext': 'mp4' if is_m3u8 else aext, + 'protocol': 'm3u8' if is_m3u8 else None, + 'preference': 1 if atype == 'original' else None, + }) self._sort_formats(formats) return { 'id': video_id, - 'title': data['name'], + 'title': title, + 'description': data.get('seoDescription'), 'formats': formats, 'thumbnails': thumbnails, - 'duration': data.get('duration'), + 'duration': float_or_none(data.get('duration')), + 'timestamp': int_or_none(data.get('createdAt')), } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/worldstarhiphop.py youtube-dl-2019.05.20/youtube_dl/extractor/worldstarhiphop.py --- youtube-dl-2016.02.22/youtube_dl/extractor/worldstarhiphop.py 2016-02-14 22:26:17.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/worldstarhiphop.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,12 +1,10 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor class WorldStarHipHopIE(InfoExtractor): - _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/(?:videos|android)/video\.php\?v=(?P.*)' + _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/(?:videos|android)/video\.php\?.*?\bv=(?P[^&]+)' _TESTS = [{ 'url': 'http://www.worldstarhiphop.com/videos/video.php?v=wshh6a7q1ny0G34ZwuIO', 
'md5': '9d04de741161603bf7071bbf4e883186', @@ -17,48 +15,26 @@ } }, { 'url': 'http://m.worldstarhiphop.com/android/video.php?v=wshh6a7q1ny0G34ZwuIO', - 'md5': 'dc1c76c83ecc4190bb1eb143899b87d3', - 'info_dict': { - 'id': 'wshh6a7q1ny0G34ZwuIO', - 'ext': 'mp4', - 'title': 'KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!' - } + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - m_vevo_id = re.search(r'videoId=(.*?)&?', webpage) - if m_vevo_id is not None: - return self.url_result('vevo:%s' % m_vevo_id.group(1), ie='Vevo') - - video_url = self._search_regex( - [r'so\.addVariable\("file","(.*?)"\)', - r'
<div class="artlist">\s*<a[^>]+href="([^"]+)">'], - webpage, 'video URL') + entries = self._parse_html5_media_entries(url, webpage, video_id) - if 'youtube' in video_url: - return self.url_result(video_url, ie='Youtube') + if not entries: + return self.url_result(url, 'Generic') - video_title = self._html_search_regex( + title = self._html_search_regex( [r'(?s)<div class="content-heading">\s*<h1>(.*?)</h1>', r'<span[^>]+class="tc-sp-pinned-title">(.*)'], webpage, 'title') - # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video. - thumbnail = self._html_search_regex( - r'rel="image_src" href="(.*)" />', webpage, 'thumbnail', - default=None) - if not thumbnail: - _title = r'candytitles.*>(.*)' - mobj = re.search(_title, webpage) - if mobj is not None: - video_title = mobj.group(1) - - return { + info = entries[0] + info.update({ 'id': video_id, - 'url': video_url, - 'title': video_title, - 'thumbnail': thumbnail, - } + 'title': title, + }) + return info diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/wrzuta.py youtube-dl-2019.05.20/youtube_dl/extractor/wrzuta.py --- youtube-dl-2016.02.22/youtube_dl/extractor/wrzuta.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/wrzuta.py 2019-06-07 18:47:18.000000000 +0000 @@ -1,12 +1,14 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( + ExtractorError, int_or_none, qualities, + remove_start, ) @@ -26,16 +28,17 @@ 'uploader_id': 'laboratoriumdextera', 'description': 'md5:7fb5ef3c21c5893375fda51d9b15d9cd', }, + 'skip': 'Redirected to wrzuta.pl', }, { - 'url': 'http://jolka85.wrzuta.pl/audio/063jOPX5ue2/liber_natalia_szroeder_-_teraz_ty', - 'md5': 'bc78077859bea7bcfe4295d7d7fc9025', + 'url': 'http://vexling.wrzuta.pl/audio/01xBFabGXu6/james_horner_-_into_the_na_39_vi_world_bonus', + 'md5': 'f80564fb5a2ec6ec59705ae2bf2ba56d', 'info_dict': { - 'id': '063jOPX5ue2', - 'ext': 'ogg', - 'title': 'Liber & Natalia Szroeder - Teraz Ty', - 'duration': 203, - 'uploader_id': 'jolka85', - 'description': 'md5:2d2b6340f9188c8c4cd891580e481096', + 'id': '01xBFabGXu6', + 'ext': 'mp3', + 'title': 'James Horner - Into The Na\'vi World [Bonus]', + 'description': 'md5:30a70718b2cd9df3120fce4445b0263b', + 'duration': 95, + 'uploader_id': 'vexling', }, }] @@ -45,7 +48,10 @@ typ = mobj.group('typ') uploader = mobj.group('uploader') - webpage = self._download_webpage(url, video_id) + webpage, urlh = self._download_webpage_handle(url, video_id) + + if urlh.geturl() == 'http://www.wrzuta.pl/': + raise ExtractorError('Video removed', expected=True) quality = qualities(['SD', 'MQ', 'HQ', 'HD']) @@ -80,3 +86,73 @@ 'description': self._og_search_description(webpage), 'age_limit': embedpage.get('minimalAge', 0), } + + +class WrzutaPlaylistIE(InfoExtractor): + """ + Extracts entries from a wrzuta.pl playlist. The extraction proceeds in these steps: + * collect the playlist size + * extract all entries listed on the playlist webpage (the playlist is split across two pages: the first is reached directly from the webpage, the second is fetched on demand by an AJAX call and rendered from its response) + * if fewer entries were extracted than the total, use the AJAX call to collect the remaining ones + """ + + IE_NAME = 'wrzuta.pl:playlist' + _VALID_URL = r'https?://(?P<uploader>[0-9a-zA-Z]+)\.wrzuta\.pl/playlista/(?P<id>[0-9a-zA-Z]+)' + _TESTS = [{ + 'url': 'http://miromak71.wrzuta.pl/playlista/7XfO4vE84iR/moja_muza', + 'playlist_mincount': 14, + 'info_dict': { + 'id': '7XfO4vE84iR', + 'title': 'Moja muza', + }, + }, { + 'url': 'http://heroesf70.wrzuta.pl/playlista/6Nj3wQHx756/lipiec_-_lato_2015_muzyka_swiata', + 'playlist_mincount': 144, + 'info_dict': { + 'id': '6Nj3wQHx756', + 'title': 'Lipiec - Lato 2015 Muzyka Świata', + }, + }, { + 'url': 
'http://miromak71.wrzuta.pl/playlista/7XfO4vE84iR', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + playlist_id = mobj.group('id') + uploader = mobj.group('uploader') + + webpage = self._download_webpage(url, playlist_id) + + playlist_size = int_or_none(self._html_search_regex( + (r'<div[^>]+class=["\']playlist-counter["\'][^>]*>\d+/(\d+)', + r'<div[^>]+class=["\']all-counter["\'][^>]*>(.+?)</div>'), + webpage, 'playlist size', default=None)) + + playlist_title = remove_start( + self._og_search_title(webpage), 'Playlista: ') + + entries = [] + if playlist_size: + entries = [ + self.url_result(entry_url) + for _, entry_url in re.findall( + r'<a[^>]+href=(["\'])(http.+?)\1[^>]+class=["\']playlist-file-page', + webpage)] + if playlist_size > len(entries): + playlist_content = self._download_json( + 'http://%s.wrzuta.pl/xhr/get_playlist_offset/%s' % (uploader, playlist_id), + playlist_id, + 'Downloading playlist JSON', + 'Unable to download playlist JSON') + entries.extend([ + self.url_result(entry['filelink']) + for entry in playlist_content.get('files', []) if entry.get('filelink')]) + + return self.playlist_result(entries, playlist_id, playlist_title) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/wsj.py youtube-dl-2019.05.20/youtube_dl/extractor/wsj.py --- youtube-dl-2016.02.22/youtube_dl/extractor/wsj.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/wsj.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,19 +1,27 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( int_or_none, + float_or_none, unified_strdate, ) class WSJIE(InfoExtractor): - _VALID_URL = r'https?://video-api\.wsj\.com/api-video/player/iframe\.html\?guid=(?P<id>[a-zA-Z0-9-]+)' + _VALID_URL = r'''(?x) + (?: + https?://video-api\.wsj\.com/api-video/player/iframe\.html\?.*?\bguid=| + https?://(?:www\.)?(?:wsj|barrons)\.com/video/(?:[^/]+/)+| + wsj: + ) + (?P<id>[a-fA-F0-9-]{36}) + ''' IE_DESC = 'Wall Street Journal' - _TEST = { + _TESTS = [{ 'url': 'http://video-api.wsj.com/api-video/player/iframe.html?guid=1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A', - 'md5': '9747d7a6ebc2f4df64b981e1dde9efa9', + 'md5': 'e230a5bb249075e40793b655a54a02e4', 'info_dict': { 'id': '1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A', 'ext': 'mp4', @@ -24,65 +32,92 @@ 'duration': 90, 'title': 'Bills Coach Rex Ryan Updates His Old Jets Tattoo', }, - } + }, { + 'url': 'http://www.wsj.com/video/can-alphabet-build-a-smarter-city/359DDAA8-9AC1-489C-82E6-0429C1E430E0.html', + 'only_matching': True, + }, { + 'url': 'http://www.barrons.com/video/capitalism-deserves-more-respect-from-millennials/F301217E-6F46-43AE-B8D2-B7180D642EE9.html', + 'only_matching': True, + }, { + 'url': 'https://www.wsj.com/video/series/a-brief-history-of/the-modern-cell-carrier-how-we-got-here/980E2187-401D-48A1-B82B-1486CEE06CB9', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - bitrates = [128, 174, 264, 320, 464, 664, 1264] - api_url = ( - 'http://video-api.wsj.com/api-video/find_all_videos.asp?' 
- 'type=guid&count=1&query=%s&' - 'fields=hls,adZone,thumbnailList,guid,state,secondsUntilStartTime,' - 'author,description,name,linkURL,videoStillURL,duration,videoURL,' - 'adCategory,catastrophic,linkShortURL,doctypeID,youtubeID,' - 'titletag,rssURL,wsj-section,wsj-subsection,allthingsd-section,' - 'allthingsd-subsection,sm-section,sm-subsection,provider,' - 'formattedCreationDate,keywords,keywordsOmniture,column,editor,' - 'emailURL,emailPartnerID,showName,omnitureProgramName,' - 'omnitureVideoFormat,linkRelativeURL,touchCastID,' - 'omniturePublishDate,%s') % ( - video_id, ','.join('video%dkMP4Url' % br for br in bitrates)) - info = self._download_json(api_url, video_id)['items'][0] - - # Thumbnails are conveniently in the correct format already - thumbnails = info.get('thumbnailList') - creator = info.get('author') - uploader_id = info.get('editor') - categories = info.get('keywords') - duration = int_or_none(info.get('duration')) - upload_date = unified_strdate( - info.get('formattedCreationDate'), day_first=False) + info = self._download_json( + 'http://video-api.wsj.com/api-video/find_all_videos.asp', video_id, + query={ + 'type': 'guid', + 'count': 1, + 'query': video_id, + 'fields': ','.join(( + 'type', 'hls', 'videoMP4List', 'thumbnailList', 'author', + 'description', 'name', 'duration', 'videoURL', 'titletag', + 'formattedCreationDate', 'keywords', 'editor')), + })['items'][0] title = info.get('name', info.get('titletag')) - formats = [{ - 'format_id': 'f4m', - 'format_note': 'f4m (meta URL)', - 'url': info['videoURL'], - }] - if info.get('hls'): + formats = [] + + f4m_url = info.get('videoURL') + if f4m_url: + formats.extend(self._extract_f4m_formats( + f4m_url, video_id, f4m_id='hds', fatal=False)) + + m3u8_url = info.get('hls') + if m3u8_url: formats.extend(self._extract_m3u8_formats( info['hls'], video_id, ext='mp4', - preference=0, entry_protocol='m3u8_native')) - for br in bitrates: - field = 'video%dkMP4Url' % br - if info.get(field): - formats.append({ - 'format_id': 'mp4-%d' % br, - 'container': 'mp4', - 'tbr': br, - 'url': info[field], - }) + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + + for v in info.get('videoMP4List', []): + mp4_url = v.get('url') + if not mp4_url: + continue + tbr = int_or_none(v.get('bitrate')) + formats.append({ + 'url': mp4_url, + 'format_id': 'http' + ('-%d' % tbr if tbr else ''), + 'tbr': tbr, + 'width': int_or_none(v.get('width')), + 'height': int_or_none(v.get('height')), + 'fps': float_or_none(v.get('fps')), + }) self._sort_formats(formats) return { 'id': video_id, 'formats': formats, - 'thumbnails': thumbnails, - 'creator': creator, - 'uploader_id': uploader_id, - 'duration': duration, - 'upload_date': upload_date, + # Thumbnails are conveniently in the correct format already + 'thumbnails': info.get('thumbnailList'), + 'creator': info.get('author'), + 'uploader_id': info.get('editor'), + 'duration': int_or_none(info.get('duration')), + 'upload_date': unified_strdate(info.get( + 'formattedCreationDate'), day_first=False), 'title': title, - 'categories': categories, + 'categories': info.get('keywords'), + } + + +class WSJArticleIE(InfoExtractor): + _VALID_URL = r'(?i)https?://(?:www\.)?wsj\.com/articles/(?P[^/?#&]+)' + _TEST = { + 'url': 'https://www.wsj.com/articles/dont-like-china-no-pandas-for-you-1490366939?', + 'info_dict': { + 'id': '4B13FA62-1D8C-45DB-8EA1-4105CB20B362', + 'ext': 'mp4', + 'upload_date': '20170221', + 'uploader_id': 'ralcaraz', + 'title': 'Bao Bao the Panda Leaves for China', } + } + + def 
_real_extract(self, url): + article_id = self._match_id(url) + webpage = self._download_webpage(url, article_id) + video_id = self._search_regex( + r'data-src=["\']([a-fA-F0-9-]{36})', webpage, 'video id') + return self.url_result('wsj:%s' % video_id, WSJIE.ie_key(), video_id) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/wwe.py youtube-dl-2019.05.20/youtube_dl/extractor/wwe.py --- youtube-dl-2016.02.22/youtube_dl/extractor/wwe.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/wwe.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,140 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + try_get, + unescapeHTML, + url_or_none, + urljoin, +) + + +class WWEBaseIE(InfoExtractor): + _SUBTITLE_LANGS = { + 'English': 'en', + 'Deutsch': 'de', + } + + def _extract_entry(self, data, url, video_id=None): + video_id = compat_str(video_id or data['nid']) + title = data['title'] + + formats = self._extract_m3u8_formats( + data['file'], video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + + description = data.get('description') + thumbnail = urljoin(url, data.get('image')) + series = data.get('show_name') + episode = data.get('episode_name') + + subtitles = {} + tracks = data.get('tracks') + if isinstance(tracks, list): + for track in tracks: + if not isinstance(track, dict): + continue + if track.get('kind') != 'captions': + continue + track_file = url_or_none(track.get('file')) + if not track_file: + continue + label = track.get('label') + lang = self._SUBTITLE_LANGS.get(label, label) or 'en' + subtitles.setdefault(lang, []).append({ + 'url': track_file, + }) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'series': series, + 'episode': episode, + 'formats': formats, + 'subtitles': subtitles, + } + + +class WWEIE(WWEBaseIE): + _VALID_URL = r'https?://(?:[^/]+\.)?wwe\.com/(?:[^/]+/)*videos/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.wwe.com/videos/daniel-bryan-vs-andrade-cien-almas-smackdown-live-sept-4-2018', + 'md5': '92811c6a14bfc206f7a6a9c5d9140184', + 'info_dict': { + 'id': '40048199', + 'ext': 'mp4', + 'title': 'Daniel Bryan vs. Andrade "Cien" Almas: SmackDown LIVE, Sept. 
4, 2018', + 'description': 'md5:2d7424dbc6755c61a0e649d2a8677f67', + 'thumbnail': r're:^https?://.*\.jpg$', + } + }, { + 'url': 'https://de.wwe.com/videos/gran-metalik-vs-tony-nese-wwe-205-live-sept-4-2018', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + landing = self._parse_json( + self._html_search_regex( + r'(?s)Drupal\.settings\s*,\s*({.+?})\s*\)\s*;', + webpage, 'drupal settings'), + display_id)['WWEVideoLanding'] + + data = landing['initialVideo']['playlist'][0] + video_id = landing.get('initialVideoId') + + info = self._extract_entry(data, url, video_id) + info['display_id'] = display_id + return info + + +class WWEPlaylistIE(WWEBaseIE): + _VALID_URL = r'https?://(?:[^/]+\.)?wwe\.com/(?:[^/]+/)*(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.wwe.com/shows/raw/2018-11-12', + 'info_dict': { + 'id': '2018-11-12', + }, + 'playlist_mincount': 11, + }, { + 'url': 'http://www.wwe.com/article/walk-the-prank-wwe-edition', + 'only_matching': True, + }, { + 'url': 'https://www.wwe.com/shows/wwenxt/article/matt-riddle-interview', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if WWEIE.suitable(url) else super(WWEPlaylistIE, cls).suitable(url) + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + entries = [] + for mobj in re.finditer( + r'data-video\s*=\s*(["\'])(?P{.+?})\1', webpage): + video = self._parse_json( + mobj.group('data'), display_id, transform_source=unescapeHTML, + fatal=False) + if not video: + continue + data = try_get(video, lambda x: x['playlist'][0], dict) + if not data: + continue + try: + entry = self._extract_entry(data, url) + except Exception: + continue + entry['extractor_key'] = WWEIE.ie_key() + entries.append(entry) + + return self.playlist_result(entries, display_id) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/xbef.py youtube-dl-2019.05.20/youtube_dl/extractor/xbef.py --- youtube-dl-2016.02.22/youtube_dl/extractor/xbef.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/xbef.py 2019-06-07 18:49:07.000000000 +0000 @@ -5,7 +5,7 @@ class XBefIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?xbef\.com/video/(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?xbef\.com/video/(?P[0-9]+)' _TEST = { 'url': 'http://xbef.com/video/5119-glamourous-lesbians-smoking-drinking-and-fucking', 'md5': 'a478b565baff61634a98f5e5338be995', @@ -14,7 +14,7 @@ 'ext': 'mp4', 'title': 'md5:7358a9faef8b7b57acda7c04816f170e', 'age_limit': 18, - 'thumbnail': 're:^http://.*\.jpg', + 'thumbnail': r're:^http://.*\.jpg', } } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/xboxclips.py youtube-dl-2019.05.20/youtube_dl/extractor/xboxclips.py --- youtube-dl-2016.02.22/youtube_dl/extractor/xboxclips.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/xboxclips.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor @@ -12,7 +12,7 @@ class XboxClipsIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?xboxclips\.com/(?:video\.php\?.*vid=|[^/]+/)(?P[\w-]{36})' _TEST = { - 'url': 'https://xboxclips.com/video.php?uid=2533274823424419&gamertag=Iabdulelah&vid=074a69a9-5faf-46aa-b93b-9909c1720325', + 'url': 
'http://xboxclips.com/video.php?uid=2533274823424419&gamertag=Iabdulelah&vid=074a69a9-5faf-46aa-b93b-9909c1720325', 'md5': 'fbe1ec805e920aeb8eced3c3e657df5d', 'info_dict': { 'id': '074a69a9-5faf-46aa-b93b-9909c1720325', diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/xfileshare.py youtube-dl-2019.05.20/youtube_dl/extractor/xfileshare.py --- youtube-dl-2016.02.22/youtube_dl/extractor/xfileshare.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/xfileshare.py 2019-06-07 18:49:07.000000000 +0000 @@ -4,33 +4,52 @@ import re from .common import InfoExtractor -from ..compat import compat_urllib_parse from ..utils import ( + decode_packed_codes, + determine_ext, ExtractorError, - encode_dict, int_or_none, - sanitized_Request, + NO_DEFAULT, + urlencode_postdata, ) class XFileShareIE(InfoExtractor): - IE_DESC = 'XFileShare based sites: GorillaVid.in, daclips.in, movpod.in, fastvideo.in, realvid.net, filehoot.com and vidto.me' - _VALID_URL = r'''(?x) - https?://(?P(?:www\.)? - (?:daclips\.in|gorillavid\.in|movpod\.in|fastvideo\.in|realvid\.net|filehoot\.com|vidto\.me))/ - (?:embed-)?(?P[0-9a-zA-Z]+)(?:-[0-9]+x[0-9]+\.html)? - ''' - - _FILE_NOT_FOUND_REGEX = r'>(?:404 - )?File Not Found<' + _SITES = ( + (r'daclips\.(?:in|com)', 'DaClips'), + (r'filehoot\.com', 'FileHoot'), + (r'gorillavid\.(?:in|com)', 'GorillaVid'), + (r'movpod\.in', 'MovPod'), + (r'powerwatch\.pw', 'PowerWatch'), + (r'rapidvideo\.ws', 'Rapidvideo.ws'), + (r'thevideobee\.to', 'TheVideoBee'), + (r'vidto\.(?:me|se)', 'Vidto'), + (r'streamin\.to', 'Streamin.To'), + (r'xvidstage\.com', 'XVIDSTAGE'), + (r'vidabc\.com', 'Vid ABC'), + (r'vidbom\.com', 'VidBom'), + (r'vidlo\.us', 'vidlo'), + (r'rapidvideo\.(?:cool|org)', 'RapidVideo.TV'), + (r'fastvideo\.me', 'FastVideo.me'), + ) + + IE_DESC = 'XFileShare based sites: %s' % ', '.join(list(zip(*_SITES))[1]) + _VALID_URL = (r'https?://(?P(?:www\.)?(?:%s))/(?:embed-)?(?P[0-9a-zA-Z]+)' + % '|'.join(site for site in list(zip(*_SITES))[0])) + + _FILE_NOT_FOUND_REGEXES = ( + r'>(?:404 - )?File Not Found<', + r'>The file was removed by administrator<', + ) _TESTS = [{ 'url': 'http://gorillavid.in/06y9juieqpmi', 'md5': '5ae4a3580620380619678ee4875893ba', 'info_dict': { 'id': '06y9juieqpmi', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Rebecca Black My Moment Official Music Video Reaction-6GK87Rc8bzQ', - 'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', }, }, { 'url': 'http://gorillavid.in/embed-z08zf8le23c6-960x480.html', @@ -42,28 +61,9 @@ 'id': '3rso4kdn6f9m', 'ext': 'mp4', 'title': 'Micro Pig piglets ready on 16th July 2009-bG0PdrCdxUc', - 'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', } }, { - # video with countdown timeout - 'url': 'http://fastvideo.in/1qmdn1lmsmbw', - 'md5': '8b87ec3f6564a3108a0e8e66594842ba', - 'info_dict': { - 'id': '1qmdn1lmsmbw', - 'ext': 'mp4', - 'title': 'Man of Steel - Trailer', - 'thumbnail': 're:http://.*\.jpg', - }, - }, { - 'url': 'http://realvid.net/ctn2y6p2eviw', - 'md5': 'b2166d2cf192efd6b6d764c18fd3710e', - 'info_dict': { - 'id': 'ctn2y6p2eviw', - 'ext': 'flv', - 'title': 'rdx 1955', - 'thumbnail': 're:http://.*\.jpg', - }, - }, { 'url': 'http://movpod.in/0wguyyxi1yca', 'only_matching': True, }, { @@ -72,8 +72,9 @@ 'id': '3ivfabn7573c', 'ext': 'mp4', 'title': 'youtube-dl test video \'äBaW_jenozKc.mp4.mp4', - 'thumbnail': 're:http://.*\.jpg', - } + 'thumbnail': r're:http://.*\.jpg', + }, + 'skip': 'Video removed', }, { 'url': 'http://vidto.me/ku5glz52nqe1.html', 'info_dict': { @@ 
-81,8 +82,54 @@ 'ext': 'mp4', 'title': 'test' } + }, { + 'url': 'http://powerwatch.pw/duecjibvicbu', + 'info_dict': { + 'id': 'duecjibvicbu', + 'ext': 'mp4', + 'title': 'Big Buck Bunny trailer', + }, + }, { + 'url': 'http://xvidstage.com/e0qcnl03co6z', + 'info_dict': { + 'id': 'e0qcnl03co6z', + 'ext': 'mp4', + 'title': 'Chucky Prank 2015.mp4', + }, + }, { + # removed by administrator + 'url': 'http://xvidstage.com/amfy7atlkx25', + 'only_matching': True, + }, { + 'url': 'http://vidabc.com/i8ybqscrphfv', + 'info_dict': { + 'id': 'i8ybqscrphfv', + 'ext': 'mp4', + 'title': 're:Beauty and the Beast 2017', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.rapidvideo.cool/b667kprndr8w', + 'only_matching': True, + }, { + 'url': 'http://www.fastvideo.me/k8604r8nk8sn/FAST_FURIOUS_8_-_Trailer_italiano_ufficiale.mp4.html', + 'only_matching': True, + }, { + 'url': 'http://vidto.se/1tx1pf6t12cg.html', + 'only_matching': True, }] + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r']+\bsrc=(["\'])(?P(?:https?:)?//(?:%s)/embed-[0-9a-zA-Z]+.*?)\1' + % '|'.join(site for site in list(zip(*XFileShareIE._SITES))[0]), + webpage)] + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') @@ -90,7 +137,7 @@ url = 'http://%s/%s' % (mobj.group('host'), video_id) webpage = self._download_webpage(url, video_id) - if re.search(self._FILE_NOT_FOUND_REGEX, webpage) is not None: + if any(re.search(p, webpage) for p in self._FILE_NOT_FOUND_REGEXES): raise ExtractorError('Video %s does not exist' % video_id, expected=True) fields = self._hidden_inputs(webpage) @@ -102,32 +149,62 @@ if countdown: self._sleep(countdown, video_id) - post = compat_urllib_parse.urlencode(encode_dict(fields)) - - req = sanitized_Request(url, post) - req.add_header('Content-type', 'application/x-www-form-urlencoded') - - webpage = self._download_webpage(req, video_id, 'Downloading video page') + webpage = self._download_webpage( + url, video_id, 'Downloading video page', + data=urlencode_postdata(fields), headers={ + 'Referer': url, + 'Content-type': 'application/x-www-form-urlencoded', + }) title = (self._search_regex( - [r'style="z-index: [0-9]+;">([^<]+)', + (r'style="z-index: [0-9]+;">([^<]+)', r'([^<]+)', + r'h4-fine[^>]*>([^<]+)<', r'>Watch (.+) ', - r'
<h2 class="video-page-head">([^<]+)</h2>'], - webpage, 'title', default=None) or self._og_search_title(webpage)).strip() - video_url = self._search_regex( - [r'file\s*:\s*["\'](http[^"\']+)["\'],', - r'file_link\s*=\s*\'(https?:\/\/[0-9a-zA-z.\/\-_]+)'], - webpage, 'file url') + r'<h2 class="video-page-head">([^<]+)</h2>', + r'<h2\s[^>
    ]*>([^<]+)<'), # streamin.to + webpage, 'title', default=None) or self._og_search_title( + webpage, default=None) or video_id).strip() + + def extract_formats(default=NO_DEFAULT): + urls = [] + for regex in ( + r'(?:file|src)\s*:\s*(["\'])(?Phttp(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1', + r'file_link\s*=\s*(["\'])(?Phttp(?:(?!\1).)+)\1', + r'addVariable\((\\?["\'])file\1\s*,\s*(\\?["\'])(?Phttp(?:(?!\2).)+)\2\)', + r']+src=(["\'])(?Phttp(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1'): + for mobj in re.finditer(regex, webpage): + video_url = mobj.group('url') + if video_url not in urls: + urls.append(video_url) + formats = [] + for video_url in urls: + if determine_ext(video_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) + else: + formats.append({ + 'url': video_url, + 'format_id': 'sd', + }) + if not formats and default is not NO_DEFAULT: + return default + self._sort_formats(formats) + return formats + + formats = extract_formats(default=None) + + if not formats: + webpage = decode_packed_codes(self._search_regex( + r"(}\('(.+)',(\d+),(\d+),'[^']*\b(?:file|embed)\b[^']*'\.split\('\|'\))", + webpage, 'packed code')) + formats = extract_formats() + thumbnail = self._search_regex( r'image\s*:\s*["\'](http[^"\']+)["\'],', webpage, 'thumbnail', default=None) - formats = [{ - 'format_id': 'sd', - 'url': video_url, - 'quality': 1, - }] - return { 'id': video_id, 'title': title, diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/xhamster.py youtube-dl-2019.05.20/youtube_dl/extractor/xhamster.py --- youtube-dl-2016.02.22/youtube_dl/extractor/xhamster.py 2016-01-14 09:35:12.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/xhamster.py 2019-06-07 18:49:07.000000000 +0000 @@ -3,64 +3,190 @@ import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( - float_or_none, + clean_html, + determine_ext, + dict_get, + ExtractorError, int_or_none, + parse_duration, + try_get, unified_strdate, + url_or_none, ) class XHamsterIE(InfoExtractor): - _VALID_URL = r'(?Phttps?)://(?:.+?\.)?xhamster\.com/movies/(?P[0-9]+)/(?P.+?)\.html(?:\?.*)?' 
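When none of its URL regexes match, XFileShareIE above runs the page through decode_packed_codes (from youtube_dl/utils.py) and retries. That helper reverses Dean Edwards-style "P.A.C.K.E.R." eval obfuscation: the payload is a source string whose base-N word tokens index into a '|'-separated symbol table, so unpacking is a token-by-token substitution. A rough self-contained sketch of the idea — the payload-matching regex is simplified here, and the real helper tolerates more variants:

import re
import string


def unpack_packed_codes(code):
    # Locate the packed payload: source p, radix a, token count c, table k,
    # as in eval(function(p,a,c,k,e,d){...}('<p>',<a>,<c>,'<k>'.split('|')...)).
    mobj = re.search(
        r"}\('(.+)',(\d+),(\d+),'([^']*)'\.split\('\|'\)", code)
    if not mobj:
        raise ValueError('no packed code found')
    obfuscated, base, count, symbols = mobj.groups()
    base, count = int(base), int(count)
    symbols = symbols.split('|')
    # The packer's token alphabet: 0-9a-z for radix 36, plus A-Z for radix 62.
    alphabet = string.digits + string.ascii_lowercase + string.ascii_uppercase

    def encode(num):
        # Positional base-N rendering of num in the alphabet above.
        if num == 0:
            return alphabet[0]
        out = ''
        while num:
            out = alphabet[num % base] + out
            num //= base
        return out

    # Map each base-N token back to its identifier; an empty table entry
    # means the token already spells the identifier itself.
    lookup = {encode(i): symbols[i] or encode(i) for i in range(count)}
    return re.sub(
        r'\b(\w+)\b', lambda m: lookup.get(m.group(1), m.group(1)), obfuscated)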
- _TESTS = [ - { - 'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html', - 'info_dict': { - 'id': '1509445', - 'ext': 'mp4', - 'title': 'FemaleAgent Shy beauty takes the bait', - 'upload_date': '20121014', - 'uploader': 'Ruseful2011', - 'duration': 893.52, - 'age_limit': 18, - } + _VALID_URL = r'''(?x) + https?:// + (?:.+?\.)?xhamster\.(?:com|one)/ + (?: + movies/(?P\d+)/(?P[^/]*)\.html| + videos/(?P[^/]*)-(?P\d+) + ) + ''' + + _TESTS = [{ + 'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html', + 'md5': '8281348b8d3c53d39fffb377d24eac4e', + 'info_dict': { + 'id': '1509445', + 'display_id': 'femaleagent_shy_beauty_takes_the_bait', + 'ext': 'mp4', + 'title': 'FemaleAgent Shy beauty takes the bait', + 'timestamp': 1350194821, + 'upload_date': '20121014', + 'uploader': 'Ruseful2011', + 'duration': 893, + 'age_limit': 18, + 'categories': ['Fake Hub', 'Amateur', 'MILFs', 'POV', 'Beauti', 'Beauties', 'Beautiful', 'Boss', 'Office', 'Oral', 'Reality', 'Sexy', 'Taking'], }, - { - 'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd', - 'info_dict': { - 'id': '2221348', - 'ext': 'mp4', - 'title': 'Britney Spears Sexy Booty', - 'upload_date': '20130914', - 'uploader': 'jojo747400', - 'duration': 200.48, - 'age_limit': 18, - } + }, { + 'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd', + 'info_dict': { + 'id': '2221348', + 'display_id': 'britney_spears_sexy_booty', + 'ext': 'mp4', + 'title': 'Britney Spears Sexy Booty', + 'timestamp': 1379123460, + 'upload_date': '20130914', + 'uploader': 'jojo747400', + 'duration': 200, + 'age_limit': 18, + 'categories': ['Britney Spears', 'Celebrities', 'HD Videos', 'Sexy', 'Sexy Booty'], }, - { - 'url': 'https://xhamster.com/movies/2272726/amber_slayed_by_the_knight.html', - 'only_matching': True, + 'params': { + 'skip_download': True, }, - ] + }, { + # empty seo + 'url': 'http://xhamster.com/movies/5667973/.html', + 'info_dict': { + 'id': '5667973', + 'ext': 'mp4', + 'title': '....', + 'timestamp': 1454948101, + 'upload_date': '20160208', + 'uploader': 'parejafree', + 'duration': 72, + 'age_limit': 18, + 'categories': ['Amateur', 'Blowjobs'], + }, + 'params': { + 'skip_download': True, + }, + }, { + # mobile site + 'url': 'https://m.xhamster.com/videos/cute-teen-jacqueline-solo-masturbation-8559111', + 'only_matching': True, + }, { + 'url': 'https://xhamster.com/movies/2272726/amber_slayed_by_the_knight.html', + 'only_matching': True, + }, { + # This video is visible for marcoalfa123456's friends only + 'url': 'https://it.xhamster.com/movies/7263980/la_mia_vicina.html', + 'only_matching': True, + }, { + # new URL schema + 'url': 'https://pt.xhamster.com/videos/euro-pedal-pumping-7937821', + 'only_matching': True, + }, { + 'url': 'https://xhamster.one/videos/femaleagent-shy-beauty-takes-the-bait-1509445', + 'only_matching': True, + }] def _real_extract(self, url): - def extract_video_url(webpage, name): - return self._search_regex( - [r'''file\s*:\s*(?P["'])(?P.+?)(?P=q)''', - r'''["'])(?P.+?)(?P=q)\s+class=["']mp4Thumb''', - r''']+file=(?P["'])(?P.+?)(?P=q)[^>]*>'''], - webpage, name, group='mp4') + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') or mobj.group('id_2') + display_id = mobj.group('display_id') or mobj.group('display_id_2') - def is_hd(webpage): - return '
<div class=\'icon iconHD\'' in webpage + + error = self._html_search_regex( + r'<div[^>]+id=["\']videoClosed["\'][^>]*>(.+?)</div>
    ', + webpage, 'error', default=None) + if error: + raise ExtractorError(error, expected=True) + + age_limit = self._rta_search(webpage) + + def get_height(s): + return int_or_none(self._search_regex( + r'^(\d+)[pP]', s, 'height', default=None)) + + initials = self._parse_json( + self._search_regex( + r'window\.initials\s*=\s*({.+?})\s*;\s*\n', webpage, 'initials', + default='{}'), + video_id, fatal=False) + if initials: + video = initials['videoModel'] + title = video['title'] + formats = [] + for format_id, formats_dict in video['sources'].items(): + if not isinstance(formats_dict, dict): + continue + for quality, format_item in formats_dict.items(): + if format_id == 'download': + # Download link takes some time to be generated, + # skipping for now + continue + if not isinstance(format_item, dict): + continue + format_url = format_item.get('link') + filesize = int_or_none( + format_item.get('size'), invscale=1000000) + else: + format_url = format_item + filesize = None + format_url = url_or_none(format_url) + if not format_url: + continue + formats.append({ + 'format_id': '%s-%s' % (format_id, quality), + 'url': format_url, + 'ext': determine_ext(format_url, 'mp4'), + 'height': get_height(quality), + 'filesize': filesize, + }) + self._sort_formats(formats) + + categories_list = video.get('categories') + if isinstance(categories_list, list): + categories = [] + for c in categories_list: + if not isinstance(c, dict): + continue + c_name = c.get('name') + if isinstance(c_name, compat_str): + categories.append(c_name) + else: + categories = None + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': video.get('description'), + 'timestamp': int_or_none(video.get('created')), + 'uploader': try_get( + video, lambda x: x['author']['name'], compat_str), + 'thumbnail': video.get('thumbURL'), + 'duration': int_or_none(video.get('duration')), + 'view_count': int_or_none(video.get('views')), + 'like_count': int_or_none(try_get( + video, lambda x: x['rating']['likes'], int)), + 'dislike_count': int_or_none(try_get( + video, lambda x: x['rating']['dislikes'], int)), + 'comment_count': int_or_none(video.get('views')), + 'age_limit': age_limit, + 'categories': categories, + 'formats': formats, + } - video_id = mobj.group('id') - seo = mobj.group('seo') - proto = mobj.group('proto') - mrss_url = '%s://xhamster.com/movies/%s/%s.html' % (proto, video_id, seo) - webpage = self._download_webpage(mrss_url, video_id) + # Old layout fallback title = self._html_search_regex( [r']*>([^<]+)
</h1>', @@ -68,6 +194,39 @@ r'<title[^>]*>(.+?)(?:,\s*[^,]*?\s*Porn\s*[^,]*?:\s*xHamster[^<]*| - xHamster\.com)</title>'], webpage, 'title') + formats = [] + format_urls = set() + + sources = self._parse_json( + self._search_regex( + r'sources\s*:\s*({.+?})\s*,?\s*\n', webpage, 'sources', + default='{}'), + video_id, fatal=False) + for format_id, format_url in sources.items(): + format_url = url_or_none(format_url) + if not format_url: + continue + if format_url in format_urls: + continue + format_urls.add(format_url) + formats.append({ + 'format_id': format_id, + 'url': format_url, + 'height': get_height(format_id), + }) + + video_url = self._search_regex( + [r'''file\s*:\s*(?P<q>["'])(?P<mp4>.+?)(?P=q)''', + r'''<a\s+href=(?P<q>["'])(?P<mp4>.+?)(?P=q)\s+class=["']mp4Thumb''', + r'''<video[^>]+file=(?P<q>["'])(?P<mp4>.+?)(?P=q)[^>]*>'''], + webpage, 'video url', group='mp4', default=None) + if video_url and video_url not in format_urls: + formats.append({ + 'url': video_url, + }) + + self._sort_formats(formats) + # Only a few videos have a description mobj = re.search(r'<span>Description: </span>([^<]+)', webpage) description = mobj.group(1) if mobj else None @@ -77,56 +236,38 @@ webpage, 'upload date', fatal=False)) uploader = self._html_search_regex( - r'<span[^>]+itemprop=["\']author[^>]+><a[^>]+href=["\'].+?xhamster\.com/user/[^>]+>(?P<uploader>.+?)</a>
    ', + r']+itemprop=["\']author[^>]+>]+>]+>([^<]+)', webpage, 'uploader', default='anonymous') thumbnail = self._search_regex( - [r'''thumb\s*:\s*(?P["'])(?P.+?)(?P=q)''', - r''']+poster=(?P["'])(?P.+?)(?P=q)[^>]*>'''], + [r'''["']thumbUrl["']\s*:\s*(?P["'])(?P.+?)(?P=q)''', + r''']+"poster"=(?P["'])(?P.+?)(?P=q)[^>]*>'''], webpage, 'thumbnail', fatal=False, group='thumbnail') - duration = float_or_none(self._search_regex( - r'(["\'])duration\1\s*:\s*(["\'])(?P.+?)\2', - webpage, 'duration', fatal=False, group='duration')) + duration = parse_duration(self._search_regex( + [r'<[^<]+\bitemprop=["\']duration["\'][^<]+\bcontent=["\'](.+?)["\']', + r'Runtime:\s*\s*([\d:]+)'], webpage, + 'duration', fatal=False)) view_count = int_or_none(self._search_regex( r'content=["\']User(?:View|Play)s:(\d+)', webpage, 'view count', fatal=False)) - mobj = re.search(r"hint='(?P\d+) Likes / (?P\d+) Dislikes'", webpage) + mobj = re.search(r'hint=[\'"](?P\d+) Likes / (?P\d+) Dislikes', webpage) (like_count, dislike_count) = (mobj.group('likecount'), mobj.group('dislikecount')) if mobj else (None, None) mobj = re.search(r'Comments \((?P\d+)\)
    ', webpage) comment_count = mobj.group('commentcount') if mobj else 0 - age_limit = self._rta_search(webpage) - - hd = is_hd(webpage) - - format_id = 'hd' if hd else 'sd' - - video_url = extract_video_url(webpage, format_id) - formats = [{ - 'url': video_url, - 'format_id': 'hd' if hd else 'sd', - 'preference': 1, - }] - - if not hd: - mrss_url = self._search_regex(r'Categories:.+?)', webpage, + 'categories', default=None) + categories = [clean_html(category) for category in re.findall( + r']+>(.+?)', categories_html)] if categories_html else None return { 'id': video_id, + 'display_id': display_id, 'title': title, 'description': description, 'upload_date': upload_date, @@ -138,20 +279,22 @@ 'dislike_count': int_or_none(dislike_count), 'comment_count': int_or_none(comment_count), 'age_limit': age_limit, + 'categories': categories, 'formats': formats, } class XHamsterEmbedIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?xhamster\.com/xembed\.php\?video=(?P\d+)' + _VALID_URL = r'https?://(?:.+?\.)?xhamster\.com/xembed\.php\?video=(?P\d+)' _TEST = { 'url': 'http://xhamster.com/xembed.php?video=3328539', 'info_dict': { 'id': '3328539', 'ext': 'mp4', 'title': 'Pen Masturbation', + 'timestamp': 1406581861, 'upload_date': '20140728', - 'uploader_id': 'anonymous', + 'uploader': 'ManyakisArt', 'duration': 5, 'age_limit': 18, } @@ -169,7 +312,13 @@ webpage = self._download_webpage(url, video_id) video_url = self._search_regex( - r'href="(https?://xhamster\.com/movies/%s/[^"]+\.html[^"]*)"' % video_id, - webpage, 'xhamster url') + r'href="(https?://xhamster\.com/(?:movies/{0}/[^"]*\.html|videos/[^/]*-{0})[^"]*)"'.format(video_id), + webpage, 'xhamster url', default=None) + + if not video_url: + vars = self._parse_json( + self._search_regex(r'vars\s*:\s*({.+?})\s*,\s*\n', webpage, 'vars'), + video_id) + video_url = dict_get(vars, ('downloadLink', 'homepageLink', 'commentsLink', 'shareUrl')) return self.url_result(video_url, 'XHamster') diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/xiami.py youtube-dl-2019.05.20/youtube_dl/extractor/xiami.py --- youtube-dl-2016.02.22/youtube_dl/extractor/xiami.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/xiami.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,201 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote +from ..utils import int_or_none + + +class XiamiBaseIE(InfoExtractor): + _API_BASE_URL = 'http://www.xiami.com/song/playlist/cat/json/id' + + def _download_webpage_handle(self, *args, **kwargs): + webpage = super(XiamiBaseIE, self)._download_webpage_handle(*args, **kwargs) + if '>Xiami is currently not available in your country.<' in webpage: + self.raise_geo_restricted('Xiami is currently not available in your country') + return webpage + + def _extract_track(self, track, track_id=None): + track_name = track.get('songName') or track.get('name') or track['subName'] + artist = track.get('artist') or track.get('artist_name') or track.get('singers') + title = '%s - %s' % (artist, track_name) if artist else track_name + track_url = self._decrypt(track['location']) + + subtitles = {} + lyrics_url = track.get('lyric_url') or track.get('lyric') + if lyrics_url and lyrics_url.startswith('http'): + subtitles['origin'] = [{'url': lyrics_url}] + + return { + 'id': track.get('song_id') or track_id, + 'url': track_url, + 'title': title, + 'thumbnail': track.get('pic') or track.get('album_pic'), + 'duration': 
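
For reference, the XHamsterEmbedIE fallback above just parses the player's inline "vars" object and takes the first usable link. A minimal standalone sketch of that logic, using only the standard library and an invented page snippet (youtube-dl's _parse_json and dict_get are more forgiving than plain json.loads):

    import json
    import re

    LINK_KEYS = ('downloadLink', 'homepageLink', 'commentsLink', 'shareUrl')

    def embed_video_url(webpage):
        # Same regex as in the diff: grab the JS object following "vars:".
        mobj = re.search(r'vars\s*:\s*({.+?})\s*,\s*\n', webpage)
        if not mobj:
            return None
        player_vars = json.loads(mobj.group(1))
        # dict_get semantics: the first key with a truthy value wins.
        for key in LINK_KEYS:
            if player_vars.get(key):
                return player_vars[key]
        return None

    page = 'var player = {\n  vars: {"shareUrl": "https://xhamster.com/videos/x-123"}\n  ,\n};'
    print(embed_video_url(page))  # https://xhamster.com/videos/x-123
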
int_or_none(track.get('length')), + 'creator': track.get('artist', '').split(';')[0], + 'track': track_name, + 'track_number': int_or_none(track.get('track')), + 'album': track.get('album_name') or track.get('title'), + 'artist': artist, + 'subtitles': subtitles, + } + + def _extract_tracks(self, item_id, referer, typ=None): + playlist = self._download_json( + '%s/%s%s' % (self._API_BASE_URL, item_id, '/type/%s' % typ if typ else ''), + item_id, headers={ + 'Referer': referer, + }) + return [ + self._extract_track(track, item_id) + for track in playlist['data']['trackList']] + + @staticmethod + def _decrypt(origin): + n = int(origin[0]) + origin = origin[1:] + short_lenth = len(origin) // n + long_num = len(origin) - short_lenth * n + l = tuple() + for i in range(0, n): + length = short_lenth + if i < long_num: + length += 1 + l += (origin[0:length], ) + origin = origin[length:] + ans = '' + for i in range(0, short_lenth + 1): + for j in range(0, n): + if len(l[j]) > i: + ans += l[j][i] + return compat_urllib_parse_unquote(ans).replace('^', '0') + + +class XiamiSongIE(XiamiBaseIE): + IE_NAME = 'xiami:song' + IE_DESC = '虾米音乐' + _VALID_URL = r'https?://(?:www\.)?xiami\.com/song/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'http://www.xiami.com/song/1775610518', + 'md5': '521dd6bea40fd5c9c69f913c232cb57e', + 'info_dict': { + 'id': '1775610518', + 'ext': 'mp3', + 'title': 'HONNE - Woman', + 'thumbnail': r're:http://img\.xiami\.net/images/album/.*\.jpg', + 'duration': 265, + 'creator': 'HONNE', + 'track': 'Woman', + 'album': 'Woman', + 'artist': 'HONNE', + 'subtitles': { + 'origin': [{ + 'ext': 'lrc', + }], + }, + }, + 'skip': 'Georestricted', + }, { + 'url': 'http://www.xiami.com/song/1775256504', + 'md5': '932a3abd45c6aa2b1fdbe028fcb4c4fc', + 'info_dict': { + 'id': '1775256504', + 'ext': 'mp3', + 'title': '戴荃 - 悟空', + 'thumbnail': r're:http://img\.xiami\.net/images/album/.*\.jpg', + 'duration': 200, + 'creator': '戴荃', + 'track': '悟空', + 'album': '悟空', + 'artist': '戴荃', + 'subtitles': { + 'origin': [{ + 'ext': 'lrc', + }], + }, + }, + 'skip': 'Georestricted', + }, { + 'url': 'http://www.xiami.com/song/1775953850', + 'info_dict': { + 'id': '1775953850', + 'ext': 'mp3', + 'title': 'До Скону - Чума Пожирает Землю', + 'thumbnail': r're:http://img\.xiami\.net/images/album/.*\.jpg', + 'duration': 683, + 'creator': 'До Скону', + 'track': 'Чума Пожирает Землю', + 'track_number': 7, + 'album': 'Ад', + 'artist': 'До Скону', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.xiami.com/song/xLHGwgd07a1', + 'only_matching': True, + }] + + def _real_extract(self, url): + return self._extract_tracks(self._match_id(url), url)[0] + + +class XiamiPlaylistBaseIE(XiamiBaseIE): + def _real_extract(self, url): + item_id = self._match_id(url) + return self.playlist_result(self._extract_tracks(item_id, url, self._TYPE), item_id) + + +class XiamiAlbumIE(XiamiPlaylistBaseIE): + IE_NAME = 'xiami:album' + IE_DESC = '虾米音乐 - 专辑' + _VALID_URL = r'https?://(?:www\.)?xiami\.com/album/(?P[^/?#&]+)' + _TYPE = '1' + _TESTS = [{ + 'url': 'http://www.xiami.com/album/2100300444', + 'info_dict': { + 'id': '2100300444', + }, + 'playlist_count': 10, + 'skip': 'Georestricted', + }, { + 'url': 'http://www.xiami.com/album/512288?spm=a1z1s.6843761.1110925389.6.hhE9p9', + 'only_matching': True, + }, { + 'url': 'http://www.xiami.com/album/URVDji2a506', + 'only_matching': True, + }] + + +class XiamiArtistIE(XiamiPlaylistBaseIE): + IE_NAME = 'xiami:artist' + IE_DESC = '虾米音乐 - 歌手' + _VALID_URL = 
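
The _decrypt helper above undoes a simple transposition: the first character gives a row count n, the payload is cut into n rows (the first rows one character longer), and reading the rows column by column yields a percent-encoded URL in which every '0' was written as '^'. A round-trip sketch in Python 3; the encoder is reconstructed here purely to demonstrate the layout and is not part of the extractor:

    from urllib.parse import unquote

    def xiami_decrypt(origin):
        n = int(origin[0])            # first char: number of rows
        payload = origin[1:]
        short_len = len(payload) // n
        long_rows = len(payload) - short_len * n
        rows = []
        for i in range(n):
            length = short_len + 1 if i < long_rows else short_len
            rows.append(payload[:length])
            payload = payload[length:]
        # Read column by column to undo the row-wise split.
        out = ''.join(rows[j][i]
                      for i in range(short_len + 1)
                      for j in range(n)
                      if len(rows[j]) > i)
        return unquote(out).replace('^', '0')

    def xiami_encrypt(plain, n):
        # Hypothetical inverse, for illustration: deal characters
        # round-robin over n rows, then concatenate the rows.
        return str(n) + ''.join(plain[i::n] for i in range(n))

    url = 'http://example.com/audio/1042.mp3'
    assert xiami_decrypt(xiami_encrypt(url.replace('0', '^'), 3)) == url
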
r'https?://(?:www\.)?xiami\.com/artist/(?P[^/?#&]+)' + _TYPE = '2' + _TESTS = [{ + 'url': 'http://www.xiami.com/artist/2132?spm=0.0.0.0.dKaScp', + 'info_dict': { + 'id': '2132', + }, + 'playlist_count': 20, + 'skip': 'Georestricted', + }, { + 'url': 'http://www.xiami.com/artist/bC5Tk2K6eb99', + 'only_matching': True, + }] + + +class XiamiCollectionIE(XiamiPlaylistBaseIE): + IE_NAME = 'xiami:collection' + IE_DESC = '虾米音乐 - 精选集' + _VALID_URL = r'https?://(?:www\.)?xiami\.com/collect/(?P[^/?#&]+)' + _TYPE = '3' + _TEST = { + 'url': 'http://www.xiami.com/collect/156527391?spm=a1z1s.2943601.6856193.12.4jpBnr', + 'info_dict': { + 'id': '156527391', + }, + 'playlist_mincount': 29, + 'skip': 'Georestricted', + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/ximalaya.py youtube-dl-2019.05.20/youtube_dl/extractor/ximalaya.py --- youtube-dl-2016.02.22/youtube_dl/extractor/ximalaya.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/ximalaya.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,233 @@ +# coding: utf-8 + +from __future__ import unicode_literals + +import itertools +import re + +from .common import InfoExtractor + + +class XimalayaBaseIE(InfoExtractor): + _GEO_COUNTRIES = ['CN'] + + +class XimalayaIE(XimalayaBaseIE): + IE_NAME = 'ximalaya' + IE_DESC = '喜马拉雅FM' + _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/(?P[0-9]+)/sound/(?P[0-9]+)' + _USER_URL_FORMAT = '%s://www.ximalaya.com/zhubo/%i/' + _TESTS = [ + { + 'url': 'http://www.ximalaya.com/61425525/sound/47740352/', + 'info_dict': { + 'id': '47740352', + 'ext': 'm4a', + 'uploader': '小彬彬爱听书', + 'uploader_id': 61425525, + 'uploader_url': 'http://www.ximalaya.com/zhubo/61425525/', + 'title': '261.唐诗三百首.卷八.送孟浩然之广陵.李白', + 'description': "contains:《送孟浩然之广陵》\n作者:李白\n故人西辞黄鹤楼,烟花三月下扬州。\n孤帆远影碧空尽,惟见长江天际流。", + 'thumbnails': [ + { + 'name': 'cover_url', + 'url': r're:^https?://.*\.jpg$', + }, + { + 'name': 'cover_url_142', + 'url': r're:^https?://.*\.jpg$', + 'width': 180, + 'height': 180 + } + ], + 'categories': ['renwen', '人文'], + 'duration': 93, + 'view_count': int, + 'like_count': int, + } + }, + { + 'url': 'http://m.ximalaya.com/61425525/sound/47740352/', + 'info_dict': { + 'id': '47740352', + 'ext': 'm4a', + 'uploader': '小彬彬爱听书', + 'uploader_id': 61425525, + 'uploader_url': 'http://www.ximalaya.com/zhubo/61425525/', + 'title': '261.唐诗三百首.卷八.送孟浩然之广陵.李白', + 'description': "contains:《送孟浩然之广陵》\n作者:李白\n故人西辞黄鹤楼,烟花三月下扬州。\n孤帆远影碧空尽,惟见长江天际流。", + 'thumbnails': [ + { + 'name': 'cover_url', + 'url': r're:^https?://.*\.jpg$', + }, + { + 'name': 'cover_url_142', + 'url': r're:^https?://.*\.jpg$', + 'width': 180, + 'height': 180 + } + ], + 'categories': ['renwen', '人文'], + 'duration': 93, + 'view_count': int, + 'like_count': int, + } + }, + { + 'url': 'https://www.ximalaya.com/11045267/sound/15705996/', + 'info_dict': { + 'id': '15705996', + 'ext': 'm4a', + 'uploader': '李延隆老师', + 'uploader_id': 11045267, + 'uploader_url': 'https://www.ximalaya.com/zhubo/11045267/', + 'title': 'Lesson 1 Excuse me!', + 'description': "contains:Listen to the tape then answer\xa0this question. 
Whose handbag is it?\n" + "听录音,然后回答问题,这是谁的手袋?", + 'thumbnails': [ + { + 'name': 'cover_url', + 'url': r're:^https?://.*\.jpg$', + }, + { + 'name': 'cover_url_142', + 'url': r're:^https?://.*\.jpg$', + 'width': 180, + 'height': 180 + } + ], + 'categories': ['train', '外语'], + 'duration': 40, + 'view_count': int, + 'like_count': int, + } + }, + ] + + def _real_extract(self, url): + + is_m = 'm.ximalaya' in url + scheme = 'https' if url.startswith('https') else 'http' + + audio_id = self._match_id(url) + webpage = self._download_webpage(url, audio_id, + note='Download sound page for %s' % audio_id, + errnote='Unable to get sound page') + + audio_info_file = '%s://m.ximalaya.com/tracks/%s.json' % (scheme, audio_id) + audio_info = self._download_json(audio_info_file, audio_id, + 'Downloading info json %s' % audio_info_file, + 'Unable to download info file') + + formats = [] + for bps, k in (('24k', 'play_path_32'), ('64k', 'play_path_64')): + if audio_info.get(k): + formats.append({ + 'format_id': bps, + 'url': audio_info[k], + }) + + thumbnails = [] + for k in audio_info.keys(): + # cover pics kyes like: cover_url', 'cover_url_142' + if k.startswith('cover_url'): + thumbnail = {'name': k, 'url': audio_info[k]} + if k == 'cover_url_142': + thumbnail['width'] = 180 + thumbnail['height'] = 180 + thumbnails.append(thumbnail) + + audio_uploader_id = audio_info.get('uid') + + if is_m: + audio_description = self._html_search_regex(r'(?s)]+>(.+?)', + webpage, 'audio_description', fatal=False) + else: + audio_description = self._html_search_regex(r'(?s)]*>(.+?)', + webpage, 'audio_description', fatal=False) + + if not audio_description: + audio_description_file = '%s://www.ximalaya.com/sounds/%s/rich_intro' % (scheme, audio_id) + audio_description = self._download_webpage(audio_description_file, audio_id, + note='Downloading description file %s' % audio_description_file, + errnote='Unable to download descrip file', + fatal=False) + audio_description = audio_description.strip() if audio_description else None + + return { + 'id': audio_id, + 'uploader': audio_info.get('nickname'), + 'uploader_id': audio_uploader_id, + 'uploader_url': self._USER_URL_FORMAT % (scheme, audio_uploader_id) if audio_uploader_id else None, + 'title': audio_info['title'], + 'thumbnails': thumbnails, + 'description': audio_description, + 'categories': list(filter(None, (audio_info.get('category_name'), audio_info.get('category_title')))), + 'duration': audio_info.get('duration'), + 'view_count': audio_info.get('play_count'), + 'like_count': audio_info.get('favorites_count'), + 'formats': formats, + } + + +class XimalayaAlbumIE(XimalayaBaseIE): + IE_NAME = 'ximalaya:album' + IE_DESC = '喜马拉雅FM 专辑' + _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/(?P[0-9]+)/album/(?P[0-9]+)' + _TEMPLATE_URL = '%s://www.ximalaya.com/%s/album/%s/' + _BASE_URL_TEMPL = '%s://www.ximalaya.com%s' + _LIST_VIDEO_RE = r']+?href="(?P/%s/sound/(?P\d+)/?)"[^>]+?title="(?P[^>]+)">' + _TESTS = [{ + 'url': 'http://www.ximalaya.com/61425525/album/5534601/', + 'info_dict': { + 'title': '唐诗三百首(含赏析)', + 'id': '5534601', + }, + 'playlist_count': 312, + }, { + 'url': 'http://m.ximalaya.com/61425525/album/5534601', + 'info_dict': { + 'title': '唐诗三百首(含赏析)', + 'id': '5534601', + }, + 'playlist_count': 312, + }, + ] + + def _real_extract(self, url): + self.scheme = scheme = 'https' if url.startswith('https') else 'http' + + mobj = re.match(self._VALID_URL, url) + uid, playlist_id = mobj.group('uid'), mobj.group('id') + + webpage = 
self._download_webpage(self._TEMPLATE_URL % (scheme, uid, playlist_id), playlist_id, + note='Download album page for %s' % playlist_id, + errnote='Unable to get album info') + + title = self._html_search_regex(r'detailContent_title[^>]*><h1(?:[^>]+)?>([^<]+)</h1>', + webpage, 'title', fatal=False) + + return self.playlist_result(self._entries(webpage, playlist_id, uid), playlist_id, title) + + def _entries(self, page, playlist_id, uid): + html = page + for page_num in itertools.count(1): + for entry in self._process_page(html, uid): + yield entry + + next_url = self._search_regex(r'<a\s+href=(["\'])(?P<more>[\S]+)\1[^>]+rel=(["\'])next\3', + html, 'list_next_url', default=None, group='more') + if not next_url: + break + + next_full_url = self._BASE_URL_TEMPL % (self.scheme, next_url) + html = self._download_webpage(next_full_url, playlist_id) + + def _process_page(self, html, uid): + find_from = html.index('album_soundlist') + for mobj in re.finditer(self._LIST_VIDEO_RE % uid, html[find_from:]): + yield self.url_result(self._BASE_URL_TEMPL % (self.scheme, mobj.group('url')), + XimalayaIE.ie_key(), + mobj.group('id'), + mobj.group('title')) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/xminus.py youtube-dl-2019.05.20/youtube_dl/extractor/xminus.py --- youtube-dl-2016.02.22/youtube_dl/extractor/xminus.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/xminus.py 2019-06-07 18:49:07.000000000 +0000 @@ -2,15 +2,15 @@ from __future__ import unicode_literals import re +import time from .common import InfoExtractor from ..compat import ( - compat_chr, compat_ord, ) from ..utils import ( int_or_none, - parse_filesize, + parse_duration, ) @@ -22,7 +22,7 @@ 'info_dict': { 'id': '4542', 'ext': 'mp3', - 'title': 'Леонид Агутин-Песенка шофера', + 'title': 'Леонид Агутин-Песенка шофёра', 'duration': 156, 'tbr': 320, 'filesize_approx': 5900000, @@ -36,38 +36,41 @@ webpage = self._download_webpage(url, video_id) artist = self._html_search_regex( - r'minus_track\.artist="(.+?)"', webpage, 'artist') + r'<a[^>]+href="/artist/\d+">([^<]+)</a>', webpage, 'artist') title = artist + '-' + self._html_search_regex( - r'minus_track\.title="(.+?)"', webpage, 'title') - duration = int_or_none(self._html_search_regex( - r'minus_track\.dur_sec=\'([0-9]*?)\'', + r'<span[^>]+class="minustrack-full-title(?:\s+[^"]+)?"[^>]*>([^<]+)', webpage, 'title') + duration = parse_duration(self._html_search_regex( + r'<span[^>]+class="player-duration(?:\s+[^"]+)?"[^>]*>([^<]+)', webpage, 'duration', fatal=False)) - filesize_approx = parse_filesize(self._html_search_regex( - r'<div id="finfo"[^>]*>\s*↓\s*([0-9.]+\s*[a-zA-Z][bB])', - webpage, 'approximate filesize', fatal=False)) - tbr = int_or_none(self._html_search_regex( - r'<div class="quality[^"]*"></div>\s*([0-9]+)\s*kbps', - webpage, 'bitrate', fatal=False)) + mobj = re.search( + r'<div[^>]+class="dw-info(?:\s+[^"]+)?"[^>]*>(?P<tbr>\d+)\s*кбит/c\s+(?P<filesize>[0-9.]+)\s*мб</div>', + webpage) + tbr = filesize_approx = None + if mobj: + filesize_approx = float(mobj.group('filesize')) * 1000000 + tbr = float(mobj.group('tbr')) view_count = int_or_none(self._html_search_regex( - r'<div class="quality.*?► ([0-9]+)', + r'<span><[^>]+class="icon-chart-bar".*?>(\d+)</span>', webpage, 'view count', fatal=False)) description = self._html_search_regex( - r'(?s)<div id="song_texts">(.*?)</div><br', + r'(?s)<pre[^>]+id="lyrics-original"[^>]*>(.*?)</pre>', webpage, 'song lyrics', fatal=False) if description: description = re.sub(' *\r *', '\n', 
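
The ximalaya album extractor above walks an unbounded sequence of pages with itertools.count, yielding entries until no rel="next" link turns up. A self-contained sketch of that loop over canned HTML; the real code fetches each page over HTTP and also filters the sound links by uploader id:

    import itertools
    import re

    NEXT_RE = r'<a\s+href=(["\'])(?P<more>[\S]+)\1[^>]+rel=(["\'])next\3'

    def album_entries(pages, start='/album/1'):
        html = pages[start]            # stands in for _download_webpage()
        for page_num in itertools.count(1):
            for sound_id in re.findall(r'href="/\d+/sound/(\d+)/?"', html):
                yield sound_id
            mobj = re.search(NEXT_RE, html)
            if not mobj:
                break
            html = pages[mobj.group('more')]

    pages = {
        '/album/1': '<a href="/61425525/sound/100/" title="s1">s1</a>'
                    '<a href="/album/1?page=2" data-page="2" rel="next">more</a>',
        '/album/1?page=2': '<a href="/61425525/sound/101/" title="s2">s2</a>',
    }
    print(list(album_entries(pages)))  # ['100', '101']
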
description) - enc_token = self._html_search_regex( - r'minus_track\.s?tkn="(.+?)"', webpage, 'enc_token') - token = ''.join( - c if pos == 3 else compat_chr(compat_ord(c) - 1) - for pos, c in enumerate(reversed(enc_token))) - video_url = 'http://x-minus.org/dwlf/%s/%s.mp3' % (video_id, token) + k = self._search_regex( + r'<div[^>]+id="player-bottom"[^>]+data-k="([^"]+)">', webpage, + 'encoded data') + h = time.time() / 3600 + a = sum(map(int, [compat_ord(c) for c in k])) + int(video_id) + h + video_url = 'http://x-minus.me/dl/minus?id=%s&tkn2=%df%d' % (video_id, a, h) return { 'id': video_id, 'title': title, 'url': video_url, + # The extension is unknown until actual downloading + 'ext': 'mp3', 'duration': duration, 'filesize_approx': filesize_approx, 'tbr': tbr, diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/xnxx.py youtube-dl-2019.05.20/youtube_dl/extractor/xnxx.py --- youtube-dl-2016.02.22/youtube_dl/extractor/xnxx.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/xnxx.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,42 +1,84 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote +from ..utils import ( + determine_ext, + int_or_none, + NO_DEFAULT, + str_to_int, +) class XNXXIE(InfoExtractor): - _VALID_URL = r'^https?://(?:video|www)\.xnxx\.com/video(?P<id>[0-9]+)/(.*)' - _TEST = { - 'url': 'http://video.xnxx.com/video1135332/lida_naked_funny_actress_5_', - 'md5': '0831677e2b4761795f68d417e0b7b445', + _VALID_URL = r'https?://(?:video|www)\.xnxx\.com/video-?(?P<id>[0-9a-z]+)/' + _TESTS = [{ + 'url': 'http://www.xnxx.com/video-55awb78/skyrim_test_video', + 'md5': '7583e96c15c0f21e9da3453d9920fbba', 'info_dict': { - 'id': '1135332', - 'ext': 'flv', - 'title': 'lida » Naked Funny Actress (5)', + 'id': '55awb78', + 'ext': 'mp4', + 'title': 'Skyrim Test Video', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 469, + 'view_count': int, 'age_limit': 18, - } - } + }, + }, { + 'url': 'http://video.xnxx.com/video1135332/lida_naked_funny_actress_5_', + 'only_matching': True, + }, { + 'url': 'http://www.xnxx.com/video-55awb78/', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - video_url = self._search_regex(r'flv_url=(.*?)&', - webpage, 'video URL') - video_url = compat_urllib_parse_unquote(video_url) - video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XNXX.COM', - webpage, 'title') + webpage = self._download_webpage(url, video_id) - video_thumbnail = self._search_regex(r'url_bigthumb=(.*?)&', - webpage, 'thumbnail', fatal=False) + def get(meta, default=NO_DEFAULT, fatal=True): + return self._search_regex( + r'set%s\s*\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % meta, + webpage, meta, default=default, fatal=fatal, group='value') + + title = self._og_search_title( + webpage, default=None) or get('VideoTitle') + + formats = [] + for mobj in re.finditer( + r'setVideo(?:Url(?P<id>Low|High)|HLS)\s*\(\s*(?P<q>["\'])(?P<url>(?:https?:)?//.+?)(?P=q)', webpage): + format_url = mobj.group('url') + if determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + preference=1, m3u8_id='hls', fatal=False)) + else: + format_id = mobj.group('id') + if format_id: + format_id = format_id.lower() + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'quality': 
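
The new x-minus token scheme above reduces to two numbers: the sum of the character codes of the page's data-k attribute plus the numeric track id, and the current time in hours. A sketch of the URL builder, using the x-minus.me endpoint exactly as in the diff (the server presumably checks the token against its own clock, so the data-k value here is a made-up placeholder):

    import time

    def xminus_download_url(video_id, data_k):
        h = time.time() / 3600          # hours since the Unix epoch
        a = sum(ord(c) for c in data_k) + int(video_id) + h
        # '%d' truncates both floats, e.g. ...&tkn2=443652f443210
        return 'http://x-minus.me/dl/minus?id=%s&tkn2=%df%d' % (video_id, a, h)

    print(xminus_download_url('4542', 'abc123'))
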
-1 if format_id == 'low' else 0, + }) + self._sort_formats(formats) + + thumbnail = self._og_search_thumbnail(webpage, default=None) or get( + 'ThumbUrl', fatal=False) or get('ThumbUrl169', fatal=False) + duration = int_or_none(self._og_search_property('duration', webpage)) + view_count = str_to_int(self._search_regex( + r'id=["\']nb-views-number[^>]+>([\d,.]+)', webpage, 'view count', + default=None)) return { 'id': video_id, - 'url': video_url, - 'title': video_title, - 'ext': 'flv', - 'thumbnail': video_thumbnail, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'view_count': view_count, 'age_limit': 18, + 'formats': formats, } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/xtube.py youtube-dl-2019.05.20/youtube_dl/extractor/xtube.py --- youtube-dl-2016.02.22/youtube_dl/extractor/xtube.py 2016-02-20 13:02:05.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/xtube.py 2019-06-07 18:49:07.000000000 +0000 @@ -4,17 +4,24 @@ import re from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote from ..utils import ( int_or_none, + js_to_json, orderedSet, + parse_duration, sanitized_Request, str_to_int, ) class XTubeIE(InfoExtractor): - _VALID_URL = r'(?:xtube:|https?://(?:www\.)?xtube\.com/(?:watch\.php\?.*\bv=|video-watch/(?P<display_id>[^/]+)-))(?P<id>[^/?&#]+)' + _VALID_URL = r'''(?x) + (?: + xtube:| + https?://(?:www\.)?xtube\.com/(?:watch\.php\?.*\bv=|video-watch/(?:embedded/)?(?P<display_id>[^/]+)-) + ) + (?P<id>[^/?&#]+) + ''' _TESTS = [{ # old URL schema @@ -27,15 +34,39 @@ 'description': 'contains:an ET kind of thing', 'uploader': 'greenshowers', 'duration': 450, + 'view_count': int, + 'comment_count': int, 'age_limit': 18, } }, { + # FLV videos with duplicated formats + 'url': 'http://www.xtube.com/video-watch/A-Super-Run-Part-1-YT-9299752', + 'md5': 'a406963eb349dd43692ec54631efd88b', + 'info_dict': { + 'id': '9299752', + 'display_id': 'A-Super-Run-Part-1-YT', + 'ext': 'flv', + 'title': 'A Super Run - Part 1 (YT)', + 'description': 'md5:ca0d47afff4a9b2942e4b41aa970fd93', + 'uploader': 'tshirtguy59', + 'duration': 579, + 'view_count': int, + 'comment_count': int, + 'age_limit': 18, + }, + }, { # new URL schema 'url': 'http://www.xtube.com/video-watch/strange-erotica-625837', 'only_matching': True, }, { 'url': 'xtube:625837', 'only_matching': True, + }, { + 'url': 'xtube:kVTUy_G222_', + 'only_matching': True, + }, { + 'url': 'https://www.xtube.com/video-watch/embedded/milf-tara-and-teen-shared-and-cum-covered-extreme-bukkake-32203482?embedsize=big', + 'only_matching': True, }] def _real_extract(self, url): @@ -45,29 +76,46 @@ if not display_id: display_id = video_id - url = 'http://www.xtube.com/watch.php?v=%s' % video_id - - req = sanitized_Request(url) - req.add_header('Cookie', 'age_verified=1; cookiesAccepted=1') - webpage = self._download_webpage(req, display_id) - - flashvars = self._parse_json( - self._search_regex( - r'xt\.playerOps\s*=\s*({.+?});', webpage, 'player ops'), - video_id)['flashvars'] - - title = flashvars.get('title') or self._search_regex( - r'<h1>([^<]+)</h1>', webpage, 'title') - video_url = compat_urllib_parse_unquote(flashvars['video_url']) - duration = int_or_none(flashvars.get('video_duration')) - uploader = self._search_regex( - r'<input[^>]+name="contentOwnerId"[^>]+value="([^"]+)"', - webpage, 'uploader', fatal=False) + if video_id.isdigit() and len(video_id) < 11: + url_pattern = 'http://www.xtube.com/video-watch/-%s' + else: + url_pattern = 'http://www.xtube.com/watch.php?v=%s' + + 
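
The xnxx rewrite above scrapes the HTML5 player configuration, which is just a series of set*(...) calls, and its get() helper generalizes over the setting name. A standalone equivalent; the page snippet is invented:

    import re

    def player_setting(webpage, name):
        # (?:(?!\1).)+ consumes everything up to the first closing quote
        # matching the opening one.
        mobj = re.search(
            r'set%s\s*\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % name, webpage)
        return mobj.group('value') if mobj else None

    page = """
    html5player.setVideoTitle('Skyrim Test Video');
    html5player.setThumbUrl('https://img.example.com/thumb.jpg');
    """
    print(player_setting(page, 'VideoTitle'))  # Skyrim Test Video
    print(player_setting(page, 'ThumbUrl'))    # https://img.example.com/thumb.jpg
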
webpage = self._download_webpage( + url_pattern % video_id, display_id, headers={ + 'Cookie': 'age_verified=1; cookiesAccepted=1', + }) + + sources = self._parse_json(self._search_regex( + r'(["\'])?sources\1?\s*:\s*(?P<sources>{.+?}),', + webpage, 'sources', group='sources'), video_id, + transform_source=js_to_json) + + formats = [] + for format_id, format_url in sources.items(): + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'height': int_or_none(format_id), + }) + self._remove_duplicate_formats(formats) + self._sort_formats(formats) + + title = self._search_regex( + (r'<h1>\s*(?P<title>[^<]+?)\s*</h1>', r'videoTitle\s*:\s*(["\'])(?P<title>.+?)\1'), + webpage, 'title', group='title') description = self._search_regex( r'</h1>\s*<p>([^<]+)', webpage, 'description', fatal=False) + uploader = self._search_regex( + (r'<input[^>]+name="contentOwnerId"[^>]+value="([^"]+)"', + r'<span[^>]+class="nickname"[^>]*>([^<]+)'), + webpage, 'uploader', fatal=False) + duration = parse_duration(self._search_regex( + r'<dt>Runtime:?</dt>\s*<dd>([^<]+)</dd>', + webpage, 'duration', fatal=False)) view_count = str_to_int(self._search_regex( - r'<dt>Views:</dt>\s*<dd>([\d,\.]+)</dd>', + r'<dt>Views:?</dt>\s*<dd>([\d,\.]+)</dd>', webpage, 'view count', fatal=False)) comment_count = str_to_int(self._html_search_regex( r'>Comments? \(([\d,\.]+)\)<', @@ -76,7 +124,6 @@ return { 'id': video_id, 'display_id': display_id, - 'url': video_url, 'title': title, 'description': description, 'uploader': uploader, @@ -84,6 +131,7 @@ 'view_count': view_count, 'comment_count': comment_count, 'age_limit': 18, + 'formats': formats, } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/xuite.py youtube-dl-2019.05.20/youtube_dl/extractor/xuite.py --- youtube-dl-2016.02.22/youtube_dl/extractor/xuite.py 2016-02-01 10:41:30.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/xuite.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,14 +1,13 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals -import base64 - from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote from ..utils import ( ExtractorError, + float_or_none, + get_element_by_attribute, parse_iso8601, - parse_duration, + remove_end, ) @@ -24,7 +23,8 @@ 'id': '3860914', 'ext': 'mp3', 'title': '孤單南半球-歐德陽', - 'thumbnail': 're:^https?://.*\.jpg$', + 'description': '孤單南半球-歐德陽', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 247.246, 'timestamp': 1314932940, 'upload_date': '20110902', @@ -40,11 +40,11 @@ 'id': '25925099', 'ext': 'mp4', 'title': 'BigBuckBunny_320x180', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 596.458, 'timestamp': 1454242500, 'upload_date': '20160131', - 'uploader': 'yan12125', + 'uploader': '屁姥', 'uploader_id': '12158353', 'categories': ['個人短片'], 'description': 'http://download.blender.org/peach/bigbuckbunny_movies/BigBuckBunny_320x180.mp4', @@ -58,7 +58,7 @@ 'ext': 'mp4', 'title': '暗殺教室 02', 'description': '字幕:【極影字幕社】', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 1384.907, 'timestamp': 1421481240, 'upload_date': '20150117', @@ -66,35 +66,29 @@ 'uploader_id': '242127761', 'categories': ['電玩動漫'], }, + 'skip': 'Video removed', + }, { + # Video with encoded media id + # from http://forgetfulbc.blogspot.com/2016/06/date.html + 'url': 'http://vlog.xuite.net/embed/cE1xbENoLTI3NDQ3MzM2LmZsdg==?ar=0&as=0', + 'info_dict': { + 'id': '27447336', + 'ext': 'mp4', + 'title': 
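
In the xtube changes above, the single flashvars video_url gives way to a sources object whose keys double as pixel heights, with _remove_duplicate_formats covering FLV pages that repeat one URL under several qualities. A rough standalone version of that loop; it requires strict JSON and a fabricated page, whereas the extractor first runs the snippet through js_to_json:

    import json
    import re

    def extract_formats(webpage):
        sources = json.loads(re.search(
            r'sources\s*:\s*({.+?}),', webpage, re.DOTALL).group(1))
        formats, seen = [], set()
        for format_id, format_url in sources.items():
            if format_url in seen:      # duplicate-format removal
                continue
            seen.add(format_url)
            formats.append({
                'url': format_url,
                'format_id': format_id,
                'height': int(format_id) if format_id.isdigit() else None,
            })
        return sorted(formats, key=lambda f: f['height'] or 0)

    page = ('playerConf = {sources: {"240": "https://cdn.example.com/v.mp4",'
            ' "720": "https://cdn.example.com/v.mp4"},};')
    print(extract_formats(page))  # one entry: both qualities share one URL
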
'男女平權只是口號?專家解釋約會時男生是否該幫女生付錢 (中字)', + 'description': 'md5:1223810fa123b179083a3aed53574706', + 'timestamp': 1466160960, + 'upload_date': '20160617', + 'uploader': 'B.C. & Lowy', + 'uploader_id': '232279340', + }, }, { 'url': 'http://vlog.xuite.net/play/S1dDUjdyLTMyOTc3NjcuZmx2/%E5%AD%AB%E7%87%95%E5%A7%BF-%E7%9C%BC%E6%B7%9A%E6%88%90%E8%A9%A9', 'only_matching': True, }] - @staticmethod - def base64_decode_utf8(data): - return base64.b64decode(data.encode('utf-8')).decode('utf-8') - - @staticmethod - def base64_encode_utf8(data): - return base64.b64encode(data.encode('utf-8')).decode('utf-8') - - def _extract_flv_config(self, media_id): - base64_media_id = self.base64_encode_utf8(media_id) - flv_config = self._download_xml( - 'http://vlog.xuite.net/flash/player?media=%s' % base64_media_id, - 'flv config') - prop_dict = {} - for prop in flv_config.findall('./property'): - prop_id = self.base64_decode_utf8(prop.attrib['id']) - # CDATA may be empty in flv config - if not prop.text: - continue - encoded_content = self.base64_decode_utf8(prop.text) - prop_dict[prop_id] = compat_urllib_parse_unquote(encoded_content) - return prop_dict - def _real_extract(self, url): + # /play/ URLs provide embedded video URL and more metadata + url = url.replace('/embed/', '/play/') video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) @@ -107,46 +101,53 @@ '%s returned error: %s' % (self.IE_NAME, error_msg), expected=True) - video_id = self._html_search_regex( - r'data-mediaid="(\d+)"', webpage, 'media id') - flv_config = self._extract_flv_config(video_id) - - FORMATS = { - 'audio': 'mp3', - 'video': 'mp4', - } + media_info = self._parse_json(self._search_regex( + r'var\s+mediaInfo\s*=\s*({.*});', webpage, 'media info'), video_id) + + video_id = media_info['MEDIA_ID'] formats = [] - for format_tag in ('src', 'hq_src'): - video_url = flv_config.get(format_tag) + for key in ('html5Url', 'html5HQUrl'): + video_url = media_info.get(key) if not video_url: continue format_id = self._search_regex( - r'\bq=(.+?)\b', video_url, 'format id', default=format_tag) + r'\bq=(.+?)\b', video_url, 'format id', default=None) formats.append({ 'url': video_url, - 'ext': FORMATS.get(flv_config['type'], 'mp4'), + 'ext': 'mp4' if format_id.isnumeric() else format_id, 'format_id': format_id, 'height': int(format_id) if format_id.isnumeric() else None, }) self._sort_formats(formats) - timestamp = flv_config.get('publish_datetime') + timestamp = media_info.get('PUBLISH_DATETIME') if timestamp: timestamp = parse_iso8601(timestamp + ' +0800', ' ') - category = flv_config.get('category') + category = media_info.get('catName') categories = [category] if category else [] + uploader = media_info.get('NICKNAME') + uploader_url = None + + author_div = get_element_by_attribute('itemprop', 'author', webpage) + if author_div: + uploader = uploader or self._html_search_meta('name', author_div) + uploader_url = self._html_search_regex( + r'<link[^>]+itemprop="url"[^>]+href="([^"]+)"', author_div, + 'uploader URL', fatal=False) + return { 'id': video_id, - 'title': flv_config['title'], - 'description': flv_config.get('description'), - 'thumbnail': flv_config.get('thumb'), + 'title': media_info['TITLE'], + 'description': remove_end(media_info.get('metaDesc'), ' (Xuite 影音)'), + 'thumbnail': media_info.get('ogImageUrl'), 'timestamp': timestamp, - 'uploader': flv_config.get('author_name'), - 'uploader_id': flv_config.get('author_id'), - 'duration': parse_duration(flv_config.get('duration')), + 'uploader': uploader, + 'uploader_id': 
media_info.get('MEMBER_ID'), + 'uploader_url': uploader_url, + 'duration': float_or_none(media_info.get('MEDIA_DURATION'), 1000000), 'categories': categories, 'formats': formats, } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/xvideos.py youtube-dl-2019.05.20/youtube_dl/extractor/xvideos.py --- youtube-dl-2016.02.22/youtube_dl/extractor/xvideos.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/xvideos.py 2019-06-07 18:49:07.000000000 +0000 @@ -6,68 +6,105 @@ from ..compat import compat_urllib_parse_unquote from ..utils import ( clean_html, - ExtractorError, determine_ext, - sanitized_Request, + ExtractorError, + int_or_none, + parse_duration, ) class XVideosIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?xvideos\.com/video(?P<id>[0-9]+)(?:.*)' - _TEST = { + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?xvideos\.com/video| + flashservice\.xvideos\.com/embedframe/| + static-hw\.xvideos\.com/swf/xv-player\.swf\?.*?\bid_video= + ) + (?P<id>[0-9]+) + ''' + _TESTS = [{ 'url': 'http://www.xvideos.com/video4588838/biker_takes_his_girl', - 'md5': '4b46ae6ea5e6e9086e714d883313c0c9', + 'md5': '14cea69fcb84db54293b1e971466c2e1', 'info_dict': { 'id': '4588838', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Biker Takes his Girl', + 'duration': 108, 'age_limit': 18, } - } - - _ANDROID_USER_AGENT = 'Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19' + }, { + 'url': 'https://flashservice.xvideos.com/embedframe/4588838', + 'only_matching': True, + }, { + 'url': 'http://static-hw.xvideos.com/swf/xv-player.swf?id_video=4588838', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + + webpage = self._download_webpage( + 'https://www.xvideos.com/video%s/' % video_id, video_id) mobj = re.search(r'<h1 class="inlineError">(.+?)</h1>', webpage) if mobj: raise ExtractorError('%s said: %s' % (self.IE_NAME, clean_html(mobj.group(1))), expected=True) - video_url = compat_urllib_parse_unquote( - self._search_regex(r'flv_url=(.+?)&', webpage, 'video URL')) - video_title = self._html_search_regex( - r'<title>(.*?)\s+-\s+XVID', webpage, 'title') - video_thumbnail = self._search_regex( - r'url_bigthumb=(.+?)&', webpage, 'thumbnail', fatal=False) - - formats = [{ - 'url': video_url, - }] - - android_req = sanitized_Request(url) - android_req.add_header('User-Agent', self._ANDROID_USER_AGENT) - android_webpage = self._download_webpage(android_req, video_id, fatal=False) - - if android_webpage is not None: - player_params_str = self._search_regex( - 'mobileReplacePlayerDivTwoQual\(([^)]+)\)', - android_webpage, 'player parameters', default='') - player_params = list(map(lambda s: s.strip(' \''), player_params_str.split(','))) - if player_params: - formats.extend([{ - 'url': param, - 'preference': -10, - } for param in player_params if determine_ext(param) == 'mp4']) + title = self._html_search_regex( + (r'<title>(?P<title>.+?)\s+-\s+XVID', + r'setVideoTitle\s*\(\s*(["\'])(?P<title>(?:(?!\1).)+)\1'), + webpage, 'title', default=None, + group='title') or self._og_search_title(webpage) + + thumbnails = [] + for preference, thumbnail in enumerate(('', '169')): + thumbnail_url = self._search_regex( + r'setThumbUrl%s\(\s*(["\'])(?P<thumbnail>(?:(?!\1).)+)\1' % thumbnail, + webpage, 'thumbnail', default=None, group='thumbnail') + if thumbnail_url: + thumbnails.append({ + 'url': thumbnail_url, + 
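
A pattern shared by several rewrites in this batch (xhamster's window.initials, xtube's sources, xuite's mediaInfo just above): locate an inline JavaScript object, parse it, and map plain keys onto formats. A compressed illustration with a fabricated mediaInfo; the q= quality parameter mirrors what the diff's format-id regex expects:

    import json
    import re

    page = ('var mediaInfo = {"MEDIA_ID": "25925099",'
            ' "html5Url": "https://cdn.example.net/v.mp4?q=360",'
            ' "html5HQUrl": "https://cdn.example.net/v.mp4?q=720"};')

    media_info = json.loads(
        re.search(r'var\s+mediaInfo\s*=\s*({.*});', page).group(1))

    formats = []
    for key in ('html5Url', 'html5HQUrl'):
        video_url = media_info.get(key)
        if not video_url:
            continue
        format_id = re.search(r'\bq=(.+?)\b', video_url).group(1)
        formats.append({
            'url': video_url,
            'format_id': format_id,
            'height': int(format_id) if format_id.isdigit() else None,
        })
    print([f['format_id'] for f in formats])  # ['360', '720']
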
'preference': preference,
+                })
+
+        duration = int_or_none(self._og_search_property(
+            'duration', webpage, default=None)) or parse_duration(
+            self._search_regex(
+                r'<span[^>]+class=["\']duration["\'][^>]*>.*?(\d[^<]+)',
+                webpage, 'duration', fatal=False))
+
+        formats = []
+
+        video_url = compat_urllib_parse_unquote(self._search_regex(
+            r'flv_url=(.+?)&', webpage, 'video URL', default=''))
+        if video_url:
+            formats.append({
+                'url': video_url,
+                'format_id': 'flv',
+            })
+
+        for kind, _, format_url in re.findall(
+                r'setVideo([^(]+)\((["\'])(http.+?)\2\)', webpage):
+            format_id = kind.lower()
+            if format_id == 'hls':
+                formats.extend(self._extract_m3u8_formats(
+                    format_url, video_id, 'mp4',
+                    entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
+            elif format_id in ('urllow', 'urlhigh'):
+                formats.append({
+                    'url': format_url,
+                    'format_id': '%s-%s' % (determine_ext(format_url, 'mp4'), format_id[3:]),
+                    'quality': -2 if format_id.endswith('low') else None,
+                })
 
         self._sort_formats(formats)
 
         return {
             'id': video_id,
             'formats': formats,
-            'title': video_title,
-            'ext': 'flv',
-            'thumbnail': video_thumbnail,
+            'title': title,
+            'duration': duration,
+            'thumbnails': thumbnails,
+            'age_limit': 18,
         }
diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/xxxymovies.py youtube-dl-2019.05.20/youtube_dl/extractor/xxxymovies.py
--- youtube-dl-2016.02.22/youtube_dl/extractor/xxxymovies.py	2015-12-30 19:30:33.000000000 +0000
+++ youtube-dl-2019.05.20/youtube_dl/extractor/xxxymovies.py	2019-06-07 18:49:07.000000000 +0000
@@ -39,8 +39,8 @@
             r"video_url\s*:\s*'([^']+)'", webpage, 'video URL')
 
         title = self._html_search_regex(
-            [r'<div class="block_header">\s*<h1>([^<]+)</h1>',
-             r'<title>(.*?)\s*-\s*XXXYMovies\.com'],
+            [r'<div[^>]+\bclass="block_header"[^>]*>\s*<h1>
    ([^<]+)<', + r'(.*?)\s*-\s*(?:XXXYMovies\.com|XXX\s+Movies)'], webpage, 'title') thumbnail = self._search_regex( diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/yahoo.py youtube-dl-2019.05.20/youtube_dl/extractor/yahoo.py --- youtube-dl-2016.02.22/youtube_dl/extractor/yahoo.py 2016-01-23 10:08:46.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/yahoo.py 2019-06-07 18:49:07.000000000 +0000 @@ -12,18 +12,25 @@ ) from ..utils import ( clean_html, - unescapeHTML, + determine_ext, ExtractorError, + extract_attributes, int_or_none, mimetype2ext, + smuggle_url, + unescapeHTML, ) +from .brightcove import ( + BrightcoveLegacyIE, + BrightcoveNewIE, +) from .nbc import NBCSportsVPlayerIE class YahooIE(InfoExtractor): IE_DESC = 'Yahoo screen and movies' - _VALID_URL = r'(?P(?Phttps?://(?:[a-zA-Z]{2}\.)?[\da-zA-Z_-]+\.yahoo\.com)/(?:[^/]+/)*(?P.+)?-(?P[0-9]+)(?:-[a-z]+)?\.html)' + _VALID_URL = r'(?Phttps?://(?:(?P[a-zA-Z]{2})\.)?[\da-zA-Z_-]+\.yahoo\.com)/(?:[^/]+/)*(?:(?P.+)?-)?(?P[0-9]+)(?:-[a-z]+)?(?:\.html)?' _TESTS = [ { 'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', @@ -37,7 +44,7 @@ }, { 'url': 'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html', - 'md5': 'd6e6fc6e1313c608f316ddad7b82b306', + 'md5': '251af144a19ebc4a033e8ba91ac726bb', 'info_dict': { 'id': 'd1dedf8c-d58c-38c3-8963-e899929ae0a9', 'ext': 'mp4', @@ -45,10 +52,11 @@ 'description': 'md5:66b627ab0a282b26352136ca96ce73c1', 'duration': 151, }, + 'skip': 'HTTP Error 404', }, { 'url': 'https://screen.yahoo.com/community/community-sizzle-reel-203225340.html?format=embed', - 'md5': '60e8ac193d8fb71997caa8fce54c6460', + 'md5': '7993e572fac98e044588d0b5260f4352', 'info_dict': { 'id': '4fe78544-8d48-39d8-97cd-13f205d9fcdb', 'ext': 'mp4', @@ -58,22 +66,22 @@ } }, { - 'url': 'https://tw.screen.yahoo.com/election-2014-askmayor/敢問市長-黃秀霜批賴清德-非常高傲-033009720.html', - 'md5': '3a09cf59349cfaddae1797acc3c087fc', + 'url': 'https://tw.news.yahoo.com/%E6%95%A2%E5%95%8F%E5%B8%82%E9%95%B7%20%E9%BB%83%E7%A7%80%E9%9C%9C%E6%89%B9%E8%B3%B4%E6%B8%85%E5%BE%B7%20%E9%9D%9E%E5%B8%B8%E9%AB%98%E5%82%B2-034024051.html', + 'md5': '45c024bad51e63e9b6f6fad7a43a8c23', 'info_dict': { 'id': 'cac903b3-fcf4-3c14-b632-643ab541712f', 'ext': 'mp4', 'title': '敢問市長/黃秀霜批賴清德「非常高傲」', 'description': '直言台南沒捷運 交通居五都之末', 'duration': 396, - } + }, }, { 'url': 'https://uk.screen.yahoo.com/editor-picks/cute-raccoon-freed-drain-using-091756545.html', - 'md5': '0b51660361f0e27c9789e7037ef76f4b', + 'md5': '71298482f7c64cbb7fa064e4553ff1c1', 'info_dict': { 'id': 'b3affa53-2e14-3590-852b-0e0db6cd1a58', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'Cute Raccoon Freed From Drain\u00a0Using Angle Grinder', 'description': 'md5:f66c890e1490f4910a9953c941dee944', 'duration': 97, @@ -88,17 +96,32 @@ 'title': 'Program that makes hockey more affordable not offered in Manitoba', 'description': 'md5:c54a609f4c078d92b74ffb9bf1f496f4', 'duration': 121, - } + }, + 'skip': 'Video gone', }, { 'url': 'https://ca.finance.yahoo.com/news/hackers-sony-more-trouble-well-154609075.html', - 'md5': '226a895aae7e21b0129e2a2006fe9690', 'info_dict': { - 'id': 'e624c4bc-3389-34de-9dfc-025f74943409', - 'ext': 'mp4', - 'title': '\'The Interview\' TV Spot: War', - 'description': 'The Interview', - 'duration': 30, - } + 'id': '154609075', + }, + 'playlist': [{ + 'md5': '000887d0dc609bc3a47c974151a40fb8', + 'info_dict': { + 'id': 'e624c4bc-3389-34de-9dfc-025f74943409', + 'ext': 'mp4', + 'title': '\'The Interview\' TV Spot: War', + 'description': 'The 
Interview', + 'duration': 30, + }, + }, { + 'md5': '81bc74faf10750fe36e4542f9a184c66', + 'info_dict': { + 'id': '1fc8ada0-718e-3abe-a450-bf31f246d1a9', + 'ext': 'mp4', + 'title': '\'The Interview\' TV Spot: Guys', + 'description': 'The Interview', + 'duration': 30, + }, + }], }, { 'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html', 'md5': '88e209b417f173d86186bef6e4d1f160', @@ -118,7 +141,8 @@ 'title': 'Connect the Dots: Dark Side of Virgo', 'description': 'md5:1428185051cfd1949807ad4ff6d3686a', 'duration': 201, - } + }, + 'skip': 'Domain name in.lifestyle.yahoo.com gone', }, { 'url': 'https://www.yahoo.com/movies/v/true-story-trailer-173000497.html', 'md5': '989396ae73d20c6f057746fb226aa215', @@ -140,6 +164,9 @@ 'ext': 'flv', 'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d', 'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson', + 'upload_date': '20150313', + 'uploader': 'NBCU-SPORTS', + 'timestamp': 1426270238, } }, { 'url': 'https://tw.news.yahoo.com/-100120367.html', @@ -165,31 +192,119 @@ 'description': 'While they play feuding fathers in \'Daddy\'s Home,\' star Will Ferrell & Mark Wahlberg share their true feelings on parenthood.', }, }, + { + # config['models']['applet_model']['data']['sapi'] has no query + 'url': 'https://www.yahoo.com/music/livenation/event/galactic-2016', + 'md5': 'dac0c72d502bc5facda80c9e6d5c98db', + 'info_dict': { + 'id': 'a6015640-e9e5-3efb-bb60-05589a183919', + 'ext': 'mp4', + 'description': 'Galactic', + 'title': 'Dolla Diva (feat. Maggie Koerner)', + }, + 'skip': 'redirect to https://www.yahoo.com/music', + }, + { + # yahoo://article/ + 'url': 'https://www.yahoo.com/movies/video/true-story-trailer-173000497.html', + 'info_dict': { + 'id': '071c4013-ce30-3a93-a5b2-e0413cd4a9d1', + 'ext': 'mp4', + 'title': "'True Story' Trailer", + 'description': 'True Story', + }, + 'params': { + 'skip_download': True, + }, + }, + { + # ytwnews://cavideo/ + 'url': 'https://tw.video.yahoo.com/movie-tw/單車天使-中文版預-092316541.html', + 'info_dict': { + 'id': 'ba133ff2-0793-3510-b636-59dfe9ff6cff', + 'ext': 'mp4', + 'title': '單車天使 - 中文版預', + 'description': '中文版預', + }, + 'params': { + 'skip_download': True, + }, + }, + { + # custom brightcove + 'url': 'https://au.tv.yahoo.com/plus7/sunrise/-/watch/37083565/clown-entertainers-say-it-is-hurting-their-business/', + 'info_dict': { + 'id': '5575377707001', + 'ext': 'mp4', + 'title': "Clown entertainers say 'It' is hurting their business", + 'description': 'Stephen King s horror film has much to answer for. 
Jelby and Mr Loopy the Clowns join us.', + 'timestamp': 1505341164, + 'upload_date': '20170913', + 'uploader_id': '2376984109001', + }, + 'params': { + 'skip_download': True, + }, + }, + { + # custom brightcove, geo-restricted to Australia, bypassable + 'url': 'https://au.tv.yahoo.com/plus7/sunrise/-/watch/37263964/sunrise-episode-wed-27-sep/', + 'only_matching': True, + } ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('display_id') or self._match_id(url) page_id = mobj.group('id') - url = mobj.group('url') + display_id = mobj.group('display_id') or page_id host = mobj.group('host') - webpage = self._download_webpage(url, display_id) + webpage, urlh = self._download_webpage_handle(url, display_id) + if 'err=404' in urlh.geturl(): + raise ExtractorError('Video gone', expected=True) # Look for iframed media first - iframe_m = re.search(r']+src="(/video/.+?-\d+\.html\?format=embed.*?)"', webpage) - if iframe_m: - iframepage = self._download_webpage( - host + iframe_m.group(1), display_id, 'Downloading iframe webpage') - items_json = self._search_regex( - r'mediaItems: (\[.+?\])$', iframepage, 'items', flags=re.MULTILINE, default=None) - if items_json: - items = json.loads(items_json) - video_id = items[0]['id'] - return self._get_info(video_id, display_id, webpage) + entries = [] + iframe_urls = re.findall(r']+src="(/video/.+?-\d+\.html\?format=embed.*?)"', webpage) + for idx, iframe_url in enumerate(iframe_urls): + entries.append(self.url_result(host + iframe_url, 'Yahoo')) + if entries: + return self.playlist_result(entries, page_id) + # Look for NBCSports iframes nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage) if nbc_sports_url: - return self.url_result(nbc_sports_url, 'NBCSportsVPlayer') + return self.url_result(nbc_sports_url, NBCSportsVPlayerIE.ie_key()) + + # Look for Brightcove Legacy Studio embeds + bc_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) + if bc_url: + return self.url_result(bc_url, BrightcoveLegacyIE.ie_key()) + + def brightcove_url_result(bc_url): + return self.url_result( + smuggle_url(bc_url, {'geo_countries': [mobj.group('country')]}), + BrightcoveNewIE.ie_key()) + + # Look for Brightcove New Studio embeds + bc_url = BrightcoveNewIE._extract_url(self, webpage) + if bc_url: + return brightcove_url_result(bc_url) + + brightcove_iframe = self._search_regex( + r'(]+data-video-id=["\']\d+[^>]+>)', webpage, + 'brightcove iframe', default=None) + if brightcove_iframe: + attr = extract_attributes(brightcove_iframe) + src = attr.get('src') + if src: + parsed_src = compat_urlparse.urlparse(src) + qs = compat_urlparse.parse_qs(parsed_src.query) + account_id = qs.get('accountId', ['2376984109001'])[0] + brightcove_id = attr.get('data-video-id') or qs.get('videoId', [None])[0] + if account_id and brightcove_id: + return brightcove_url_result( + 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' + % (account_id, brightcove_id)) # Query result is often embedded in webpage as JSON. 
Sometimes explicit requests # to video API results in a failure with geo restriction reason therefore using @@ -201,8 +316,10 @@ config = self._parse_json(config_json, display_id, fatal=False) if config: sapi = config.get('models', {}).get('applet_model', {}).get('data', {}).get('sapi') - if sapi: - return self._extract_info(display_id, sapi, webpage) + if sapi and 'query' in sapi: + info = self._extract_info(display_id, sapi, webpage) + self._sort_formats(info['formats']) + return info items_json = self._search_regex( r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE, @@ -222,7 +339,8 @@ r'"first_videoid"\s*:\s*"([^"]+)"', r'%s[^}]*"ccm_id"\s*:\s*"([^"]+)"' % re.escape(page_id), r']data-uuid=["\']([^"\']+)', - r'yahoo://article/view\?.*\buuid=([^&"\']+)', + r']+yahoo://article/view\?.*\buuid=([^&"\']+)', + r']+["\']ytwnews://cavideo/(?:[^/]+/)+([\da-fA-F-]+)[&"\']', ] video_id = self._search_regex( CONTENT_ID_REGEXES, webpage, 'content ID') @@ -248,15 +366,17 @@ formats = [] for s in info['streams']: + tbr = int_or_none(s.get('bitrate')) format_info = { 'width': int_or_none(s.get('width')), 'height': int_or_none(s.get('height')), - 'tbr': int_or_none(s.get('bitrate')), + 'tbr': tbr, } host = s['host'] path = s['path'] if host.startswith('rtmp'): + fmt = 'rtmp' format_info.update({ 'url': host, 'play_path': path, @@ -264,14 +384,18 @@ }) else: if s.get('format') == 'm3u8_playlist': - format_info['protocol'] = 'm3u8_native' - format_info['ext'] = 'mp4' + fmt = 'hls' + format_info.update({ + 'protocol': 'm3u8_native', + 'ext': 'mp4', + }) + else: + fmt = format_info['ext'] = determine_ext(path) format_url = compat_urlparse.urljoin(host, path) format_info['url'] = format_url + format_info['format_id'] = fmt + ('-%d' % tbr if tbr else '') formats.append(format_info) - self._sort_formats(formats) - closed_captions = self._html_search_regex( r'"closedcaptions":(\[[^\]]+\])', webpage, 'closed captions', default='[]') @@ -302,17 +426,25 @@ def _get_info(self, video_id, display_id, webpage): region = self._search_regex( r'\\?"region\\?"\s*:\s*\\?"([^"]+?)\\?"', - webpage, 'region', fatal=False, default='US') - data = compat_urllib_parse.urlencode({ - 'protocol': 'http', - 'region': region, - }) - query_url = ( - 'https://video.media.yql.yahoo.com/v1/video/sapi/streams/' - '{id}?{data}'.format(id=video_id, data=data)) - query_result = self._download_json( - query_url, display_id, 'Downloading video info') - return self._extract_info(display_id, query_result, webpage) + webpage, 'region', fatal=False, default='US').upper() + formats = [] + info = {} + for fmt in ('webm', 'mp4'): + query_result = self._download_json( + 'https://video.media.yql.yahoo.com/v1/video/sapi/streams/' + video_id, + display_id, 'Downloading %s video info' % fmt, query={ + 'protocol': 'http', + 'region': region, + 'format': fmt, + }) + info = self._extract_info(display_id, query_result, webpage) + formats.extend(info['formats']) + formats.extend(self._extract_m3u8_formats( + 'http://video.media.yql.yahoo.com/v1/hls/%s?region=%s' % (video_id, region), + video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + self._sort_formats(formats) + info['formats'] = formats + return info class YahooSearchIE(SearchInfoExtractor): @@ -345,3 +477,82 @@ 'id': query, 'entries': entries, } + + +class YahooGyaOPlayerIE(InfoExtractor): + IE_NAME = 'yahoo:gyao:player' + _VALID_URL = 
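
The reworked _get_info above no longer issues a single sapi request: it queries once per container format and then merges a separately probed HLS manifest into the same formats list. Illustrative only, showing the three request URLs for a hypothetical video id and region:

    try:
        from urllib.parse import urlencode  # Python 3
    except ImportError:
        from urllib import urlencode        # Python 2

    SAPI = 'https://video.media.yql.yahoo.com/v1/video/sapi/streams/'

    def stream_info_urls(video_id, region='US'):
        # One sapi query per container format, plus the HLS manifest that
        # gets merged into the same formats list.
        urls = [SAPI + video_id + '?' + urlencode({
            'protocol': 'http', 'region': region, 'format': fmt,
        }) for fmt in ('webm', 'mp4')]
        urls.append('http://video.media.yql.yahoo.com/v1/hls/%s?region=%s'
                    % (video_id, region))
        return urls

    for u in stream_info_urls('e624c4bc-3389-34de-9dfc-025f74943409'):
        print(u)
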
r'https?://(?:gyao\.yahoo\.co\.jp/(?:player|episode/[^/]+)|streaming\.yahoo\.co\.jp/c/y)/(?P\d+/v\d+/v\d+|[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _TESTS = [{ + 'url': 'https://gyao.yahoo.co.jp/player/00998/v00818/v0000000000000008564/', + 'info_dict': { + 'id': '5993125228001', + 'ext': 'mp4', + 'title': 'フューリー 【字幕版】', + 'description': 'md5:21e691c798a15330eda4db17a8fe45a5', + 'uploader_id': '4235717419001', + 'upload_date': '20190124', + 'timestamp': 1548294365, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'https://streaming.yahoo.co.jp/c/y/01034/v00133/v0000000000000000706/', + 'only_matching': True, + }, { + 'url': 'https://gyao.yahoo.co.jp/episode/%E3%81%8D%E3%81%AE%E3%81%86%E4%BD%95%E9%A3%9F%E3%81%B9%E3%81%9F%EF%BC%9F%20%E7%AC%AC2%E8%A9%B1%202019%2F4%2F12%E6%94%BE%E9%80%81%E5%88%86/5cb02352-b725-409e-9f8d-88f947a9f682', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url).replace('/', ':') + video = self._download_json( + 'https://gyao.yahoo.co.jp/dam/v1/videos/' + video_id, + video_id, query={ + 'fields': 'longDescription,title,videoId', + }, headers={ + 'X-User-Agent': 'Unknown Pc GYAO!/2.0.0 Web', + }) + return { + '_type': 'url_transparent', + 'id': video_id, + 'title': video['title'], + 'url': smuggle_url( + 'http://players.brightcove.net/4235717419001/default_default/index.html?videoId=' + video['videoId'], + {'geo_countries': ['JP']}), + 'description': video.get('longDescription'), + 'ie_key': BrightcoveNewIE.ie_key(), + } + + +class YahooGyaOIE(InfoExtractor): + IE_NAME = 'yahoo:gyao' + _VALID_URL = r'https?://(?:gyao\.yahoo\.co\.jp/(?:p|title/[^/]+)|streaming\.yahoo\.co\.jp/p/y)/(?P\d+/v\d+|[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _TESTS = [{ + 'url': 'https://gyao.yahoo.co.jp/p/00449/v03102/', + 'info_dict': { + 'id': '00449:v03102', + }, + 'playlist_count': 2, + }, { + 'url': 'https://streaming.yahoo.co.jp/p/y/01034/v00133/', + 'only_matching': True, + }, { + 'url': 'https://gyao.yahoo.co.jp/title/%E3%81%97%E3%82%83%E3%81%B9%E3%81%8F%E3%82%8A007/5b025a49-b2e5-4dc7-945c-09c6634afacf', + 'only_matching': True, + }] + + def _real_extract(self, url): + program_id = self._match_id(url).replace('/', ':') + videos = self._download_json( + 'https://gyao.yahoo.co.jp/api/programs/%s/videos' % program_id, program_id)['videos'] + entries = [] + for video in videos: + video_id = video.get('id') + if not video_id: + continue + entries.append(self.url_result( + 'https://gyao.yahoo.co.jp/player/%s/' % video_id.replace(':', '/'), + YahooGyaOPlayerIE.ie_key(), video_id)) + return self.playlist_result(entries, program_id) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/yam.py youtube-dl-2019.05.20/youtube_dl/extractor/yam.py --- youtube-dl-2016.02.22/youtube_dl/extractor/yam.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/yam.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,123 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_urlparse -from ..utils import ( - float_or_none, - month_by_abbreviation, - ExtractorError, - get_element_by_attribute, -) - - -class YamIE(InfoExtractor): - IE_DESC = '蕃薯藤yam天空部落' - _VALID_URL = r'http://mymedia.yam.com/m/(?P\d+)' - - _TESTS = [{ - # An audio hosted on Yam - 'url': 'http://mymedia.yam.com/m/2283921', - 'md5': 'c011b8e262a52d5473d9c2e3c9963b9c', - 'info_dict': { - 'id': '2283921', - 'ext': 'mp3', - 
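
YahooGyaOIE above mostly shuffles identifiers: URL path segments become a colon-joined program id for the JSON API, and each returned video id is mapped back to a /player/ URL handled by YahooGyaOPlayerIE. A sketch of that id round-trip; the API payload is invented:

    def gyao_player_urls(program_path, api_payload):
        program_id = program_path.strip('/').replace('/', ':')
        # The API would be queried at:
        # 'https://gyao.yahoo.co.jp/api/programs/%s/videos' % program_id
        urls = []
        for video in api_payload.get('videos', []):
            video_id = video.get('id')
            if not video_id:
                continue
            urls.append('https://gyao.yahoo.co.jp/player/%s/'
                        % video_id.replace(':', '/'))
        return urls

    payload = {'videos': [{'id': '00449:v03102:v0000000000000000001'}]}
    print(gyao_player_urls('00449/v03102', payload))
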
'title': '發現 - 趙薇 京華煙雲主題曲', - 'description': '發現 - 趙薇 京華煙雲主題曲', - 'uploader_id': 'princekt', - 'upload_date': '20080807', - 'duration': 313.0, - } - }, { - # An external video hosted on YouTube - 'url': 'http://mymedia.yam.com/m/3599430', - 'md5': '03127cf10d8f35d120a9e8e52e3b17c6', - 'info_dict': { - 'id': 'CNpEoQlrIgA', - 'ext': 'mp4', - 'upload_date': '20150306', - 'uploader': '新莊社大瑜伽社', - 'description': 'md5:11e2e405311633ace874f2e6226c8b17', - 'uploader_id': '2323agoy', - 'title': '20090412陽明山二子坪-1', - }, - 'skip': 'Video does not exist', - }, { - 'url': 'http://mymedia.yam.com/m/3598173', - 'info_dict': { - 'id': '3598173', - 'ext': 'mp4', - }, - 'skip': 'cause Yam system error', - }, { - 'url': 'http://mymedia.yam.com/m/3599437', - 'info_dict': { - 'id': '3599437', - 'ext': 'mp4', - }, - 'skip': 'invalid YouTube URL', - }, { - 'url': 'http://mymedia.yam.com/m/2373534', - 'md5': '7ff74b91b7a817269d83796f8c5890b1', - 'info_dict': { - 'id': '2373534', - 'ext': 'mp3', - 'title': '林俊傑&蔡卓妍-小酒窩', - 'description': 'md5:904003395a0fcce6cfb25028ff468420', - 'upload_date': '20080928', - 'uploader_id': 'onliner2', - } - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - page = self._download_webpage(url, video_id) - - # Check for errors - system_msg = self._html_search_regex( - r'系統訊息(?:
    |\n|\r)*([^<>]+)
    ', page, 'system message', - default=None) - if system_msg: - raise ExtractorError(system_msg, expected=True) - - # Is it hosted externally on YouTube? - youtube_url = self._html_search_regex( - r']+class="heading"[^>]*>\s*(.+)\s*

    ', page, 'title') - - api_page = self._download_webpage( - 'http://mymedia.yam.com/api/a/?pID=' + video_id, video_id, - note='Downloading API page') - api_result_obj = compat_urlparse.parse_qs(api_page) - - info_table = get_element_by_attribute('class', 'info', page) - uploader_id = self._html_search_regex( - r':[\n ]+(?P[A-Z][a-z]{2})\s+' + - r'(?P\d{1,2}), (?P\d{4})', page) - if mobj: - upload_date = '%s%02d%02d' % ( - mobj.group('year'), - month_by_abbreviation(mobj.group('mon')), - int(mobj.group('day'))) - else: - upload_date = None - duration = float_or_none(api_result_obj['totaltime'][0], scale=1000) - - return { - 'id': video_id, - 'url': api_result_obj['mp3file'][0], - 'title': title, - 'description': self._html_search_meta('description', page), - 'duration': duration, - 'uploader_id': uploader_id, - 'upload_date': upload_date, - } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/yandexdisk.py youtube-dl-2019.05.20/youtube_dl/extractor/yandexdisk.py --- youtube-dl-2016.02.22/youtube_dl/extractor/yandexdisk.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/yandexdisk.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,118 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + float_or_none, + int_or_none, + try_get, + urlencode_postdata, +) + + +class YandexDiskIE(InfoExtractor): + _VALID_URL = r'https?://yadi\.sk/[di]/(?P[^/?#&]+)' + + _TESTS = [{ + 'url': 'https://yadi.sk/i/VdOeDou8eZs6Y', + 'md5': '33955d7ae052f15853dc41f35f17581c', + 'info_dict': { + 'id': 'VdOeDou8eZs6Y', + 'ext': 'mp4', + 'title': '4.mp4', + 'duration': 168.6, + 'uploader': 'y.botova', + 'uploader_id': '300043621', + 'view_count': int, + }, + }, { + 'url': 'https://yadi.sk/d/h3WAXvDS3Li3Ce', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + status = self._download_webpage( + 'https://disk.yandex.com/auth/status', video_id, query={ + 'urlOrigin': url, + 'source': 'public', + 'md5': 'false', + }) + + sk = self._search_regex( + r'(["\'])sk(?:External)?\1\s*:\s*(["\'])(?P(?:(?!\2).)+)\2', + status, 'sk', group='value') + + webpage = self._download_webpage(url, video_id) + + models = self._parse_json( + self._search_regex( + r']+id=["\']models-client[^>]+>\s*(\[.+?\])\s*\d+)/track/(?P\d+)' @@ -27,53 +57,89 @@ 'info_dict': { 'id': '4878838', 'ext': 'mp3', - 'title': 'Carlo Ambrosio - Gypsy Eyes 1', + 'title': 'Carlo Ambrosio, Carlo Ambrosio & Fabio Di Bari - Gypsy Eyes 1', 'filesize': 4628061, 'duration': 193.04, - } + 'track': 'Gypsy Eyes 1', + 'album': 'Gypsy Soul', + 'album_artist': 'Carlo Ambrosio', + 'artist': 'Carlo Ambrosio, Carlo Ambrosio & Fabio Di Bari', + 'release_year': 2009, + }, + 'skip': 'Travis CI servers blocked by YandexMusic', } - def _get_track_url(self, storage_dir, track_id): - data = self._download_json( - 'http://music.yandex.ru/api/v1.5/handlers/api-jsonp.jsx?action=getTrackSrc&p=download-info/%s' - % storage_dir, - track_id, 'Downloading track location JSON') + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + album_id, track_id = mobj.group('album_id'), mobj.group('id') - key = hashlib.md5(('XGRlBW9FXlekgbPrRHuSiA' + data['path'][1:] + data['s']).encode('utf-8')).hexdigest() - storage = storage_dir.split('.') + track = self._download_json( + 'http://music.yandex.ru/handlers/track.jsx?track=%s:%s' % (track_id, album_id), + track_id, 'Downloading track 
- def _get_track_info(self, track): thumbnail = None cover_uri = track.get('albums', [{}])[0].get('coverUri') if cover_uri: thumbnail = cover_uri.replace('%%', 'orig') if not thumbnail.startswith('http'): thumbnail = 'http://' + thumbnail - return { - 'id': track['id'], + + track_info = { + 'id': track_id, 'ext': 'mp3', - 'url': self._get_track_url(track['storageDir'], track['id']), - 'title': '%s - %s' % (track['artists'][0]['name'], track['title']), + 'url': f_url, 'filesize': int_or_none(track.get('fileSize')), 'duration': float_or_none(track.get('durationMs'), 1000), 'thumbnail': thumbnail, + 'track': track_title, + 'acodec': download_data.get('codec'), + 'abr': int_or_none(download_data.get('bitrate')), } - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - album_id, track_id = mobj.group('album_id'), mobj.group('id') - - track = self._download_json( - 'http://music.yandex.ru/handlers/track.jsx?track=%s:%s' % (track_id, album_id), - track_id, 'Downloading track JSON')['track'] + def extract_artist(artist_list): + if artist_list and isinstance(artist_list, list): + artists_names = [a['name'] for a in artist_list if a.get('name')] + if artists_names: + return ', '.join(artists_names) + + albums = track.get('albums') + if albums and isinstance(albums, list): + album = albums[0] + if isinstance(album, dict): + year = album.get('year') + track_info.update({ + 'album': album.get('title'), + 'album_artist': extract_artist(album.get('artists')), + 'release_year': int_or_none(year), + }) + + track_artist = extract_artist(track.get('artists')) + if track_artist: + track_info.update({ + 'artist': track_artist, + 'title': '%s - %s' % (track_artist, track_title), + }) + else: + track_info['title'] = track_title - return self._get_track_info(track) + return track_info
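For illustration, the extract_artist helper above joins whatever artist names are present and returns None otherwise, which is what lets the title fall back to the bare track name; a self-contained copy with hypothetical inputs:

    def extract_artist(artist_list):  # copy of the helper above, for illustration
        if artist_list and isinstance(artist_list, list):
            artists_names = [a['name'] for a in artist_list if a.get('name')]
            if artists_names:
                return ', '.join(artists_names)

    print(extract_artist([{'name': 'Carlo Ambrosio'}, {'name': 'Fabio Di Bari'}]))
    # -> Carlo Ambrosio, Fabio Di Bari
    print(extract_artist([]), extract_artist(None))  # -> None None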
-class YandexMusicPlaylistBaseIE(InfoExtractor): +class YandexMusicPlaylistBaseIE(YandexMusicBaseIE): def _build_playlist(self, tracks): return [ self.url_result( @@ -93,6 +159,7 @@ 'title': 'Carlo Ambrosio - Gypsy Soul (2009)', }, 'playlist_count': 50, + 'skip': 'Travis CI servers blocked by YandexMusic', } def _real_extract(self, url): @@ -115,7 +182,7 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): IE_NAME = 'yandexmusic:playlist' IE_DESC = 'Яндекс.Музыка - Плейлист' - _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/users/[^/]+/playlists/(?P<id>\d+)' + _VALID_URL = r'https?://music\.yandex\.(?P<tld>ru|kz|ua|by)/users/(?P<user>[^/]+)/playlists/(?P<id>\d+)' _TESTS = [{ 'url': 'http://music.yandex.ru/users/music.partners/playlists/1245', @@ -125,54 +192,74 @@ 'info_dict': { 'id': '1245', 'description': 'md5:3b9f27b0efbe53f2ee1e844d07155cc9', }, 'playlist_count': 6, + 'skip': 'Travis CI servers blocked by YandexMusic', }, { # playlist exceeding the limit of 150 tracks shipped with webpage (see - # https://github.com/rg3/youtube-dl/issues/6666) + # https://github.com/ytdl-org/youtube-dl/issues/6666) 'url': 'https://music.yandex.ru/users/ya.playlist/playlists/1036', 'info_dict': { 'id': '1036', 'title': 'Музыка 90-х', }, - 'playlist_count': 310, + 'playlist_mincount': 300, + 'skip': 'Travis CI servers blocked by YandexMusic', }] def _real_extract(self, url): - playlist_id = self._match_id(url) - - webpage = self._download_webpage(url, playlist_id) - - mu = self._parse_json( - self._search_regex( - r'var\s+Mu\s*=\s*({.+?});\s*</script>', webpage, 'player'), - playlist_id) + mobj = re.match(self._VALID_URL, url) + tld = mobj.group('tld') + user = mobj.group('user') + playlist_id = mobj.group('id') + + playlist = self._download_json( + 'https://music.yandex.%s/handlers/playlist.jsx' % tld, + playlist_id, 'Downloading missing tracks JSON', + fatal=False, + headers={ + 'Referer': url, + 'X-Requested-With': 'XMLHttpRequest', + 'X-Retpath-Y': url, + }, + query={ + 'owner': user, + 'kinds': playlist_id, + 'light': 'true', + 'lang': tld, + 'external-domain': 'music.yandex.%s' % tld, + 'overembed': 'false', + })['playlist'] - playlist = mu['pageData']['playlist'] - tracks, track_ids = playlist['tracks'], playlist['trackIds'] + tracks = playlist['tracks'] + track_ids = [compat_str(track_id) for track_id in playlist['trackIds']] - # tracks dictionary shipped with webpage is limited to 150 tracks, + # tracks dictionary shipped with playlist.jsx API is limited to 150 tracks, # missing tracks should be retrieved manually. if len(tracks) < len(track_ids): - present_track_ids = set([compat_str(track['id']) for track in tracks if track.get('id')]) - missing_track_ids = set(map(compat_str, track_ids)) - set(present_track_ids) - request = sanitized_Request( - 'https://music.yandex.ru/handlers/track-entries.jsx', - compat_urllib_parse.urlencode({ + present_track_ids = set([ + compat_str(track['id']) + for track in tracks if track.get('id')]) + missing_track_ids = [ + track_id for track_id in track_ids + if track_id not in present_track_ids] + missing_tracks = self._download_json( + 'https://music.yandex.%s/handlers/track-entries.jsx' % tld, + playlist_id, 'Downloading missing tracks JSON', + fatal=False, + headers={ + 'Referer': url, + 'X-Requested-With': 'XMLHttpRequest', + }, + query={ 'entries': ','.join(missing_track_ids), - 'lang': mu.get('settings', {}).get('lang', 'en'), - 'external-domain': 'music.yandex.ru', + 'lang': tld, + 'external-domain': 'music.yandex.%s' % tld, 'overembed': 'false', - 'sign': mu.get('authData', {}).get('user', {}).get('sign'), 'strict': 'true', - }).encode('utf-8')) - request.add_header('Referer', url) - request.add_header('X-Requested-With', 'XMLHttpRequest') - - missing_tracks = self._download_json( - request, playlist_id, 'Downloading missing tracks JSON', fatal=False) + }) if missing_tracks: tracks.extend(missing_tracks) return self.playlist_result( self._build_playlist(tracks), compat_str(playlist_id), - playlist['title'], playlist.get('description')) + playlist.get('title'), playlist.get('description'))
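Because playlist.jsx ships at most 150 track objects, the code above reconciles trackIds against the embedded tracks and pulls the remainder from track-entries.jsx in one batch. A standalone sketch of that reconciliation, with a hypothetical fetch_entries callable standing in for the JSON request:

    def merge_playlist_tracks(playlist, fetch_entries):
        tracks = playlist['tracks']
        track_ids = [str(track_id) for track_id in playlist['trackIds']]
        present = set(str(t['id']) for t in tracks if t.get('id'))
        missing = [track_id for track_id in track_ids if track_id not in present]
        if missing:
            # a single request carries every missing id, comma-joined as above
            tracks.extend(fetch_entries(','.join(missing)) or [])
        return tracks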
diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/yandexvideo.py youtube-dl-2019.05.20/youtube_dl/extractor/yandexvideo.py --- youtube-dl-2016.02.22/youtube_dl/extractor/yandexvideo.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/yandexvideo.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,90 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + url_or_none, +) + + +class YandexVideoIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?: + yandex\.ru(?:/portal/(?:video|efir))?/?\?.*?stream_id=| + frontend\.vh\.yandex\.ru/player/ + ) + (?P<id>[\da-f]+) + ''' + _TESTS = [{ + 'url': 'https://yandex.ru/portal/video?stream_id=4dbb262b4fe5cf15a215de4f34eee34d', + 'md5': '33955d7ae052f15853dc41f35f17581c', + 'info_dict': { + 'id': '4dbb262b4fe5cf15a215de4f34eee34d', + 'ext': 'mp4', + 'title': 'В Нью-Йорке баржи и теплоход оторвались от причала и расплылись по Гудзону', + 'description': '', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 0, + 'duration': 30, + 'age_limit': 18, + }, + }, { + 'url': 'https://yandex.ru/portal/efir?stream_id=4dbb36ec4e0526d58f9f2dc8f0ecf374&from=morda', + 'only_matching': True, + }, { + 'url': 'https://yandex.ru/?stream_id=4dbb262b4fe5cf15a215de4f34eee34d', + 'only_matching': True, + }, { + 'url': 'https://frontend.vh.yandex.ru/player/4dbb262b4fe5cf15a215de4f34eee34d?from=morda', + 'only_matching': True, + }, { + # vod-episode, series episode + 'url': 'https://yandex.ru/portal/video?stream_id=45b11db6e4b68797919c93751a938cee', + 'only_matching': True, + }, { + # episode, sports + 'url': 'https://yandex.ru/?stream_channel=1538487871&stream_id=4132a07f71fb0396be93d74b3477131d', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + content = self._download_json( + 'https://frontend.vh.yandex.ru/v22/player/%s.json' % video_id, + video_id, query={ + 'stream_options': 'hires', + 'disable_trackings': 1, + })['content'] + + m3u8_url = url_or_none(content.get('content_url')) or url_or_none( + content['streams'][0]['url']) + title = content.get('title') or content.get('computed_title') + + formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + self._sort_formats(formats) + + description = content.get('description') + thumbnail = content.get('thumbnail') + timestamp = (int_or_none(content.get('release_date')) + or int_or_none(content.get('release_date_ut')) + or int_or_none(content.get('start_time'))) + duration = int_or_none(content.get('duration')) + series = content.get('program_title') + age_limit = int_or_none(content.get('restriction_age')) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'series': series, + 'age_limit': age_limit, + 'formats': formats, + }
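The verbose _VALID_URL above accepts both portal pages (stream_id query parameter) and direct player URLs; a quick check of the pattern against two of the test URLs from the hunk:

    import re

    YANDEX_VIDEO_RE = re.compile(r'''(?x)
        https?://
        (?:
            yandex\.ru(?:/portal/(?:video|efir))?/?\?.*?stream_id=|
            frontend\.vh\.yandex\.ru/player/
        )
        (?P<id>[\da-f]+)
    ''')

    for u in (
        'https://yandex.ru/portal/video?stream_id=4dbb262b4fe5cf15a215de4f34eee34d',
        'https://frontend.vh.yandex.ru/player/4dbb262b4fe5cf15a215de4f34eee34d?from=morda',
    ):
        print(YANDEX_VIDEO_RE.match(u).group('id'))  # same hex id both times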
diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/yapfiles.py youtube-dl-2019.05.20/youtube_dl/extractor/yapfiles.py --- youtube-dl-2016.02.22/youtube_dl/extractor/yapfiles.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/yapfiles.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,101 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + qualities, + unescapeHTML, + url_or_none, +) + + +class YapFilesIE(InfoExtractor): + _YAPFILES_URL = r'//(?:(?:www|api)\.)?yapfiles\.ru/get_player/*\?.*?\bv=(?P<id>\w+)' + _VALID_URL = r'https?:%s' % _YAPFILES_URL + _TESTS = [{ + # with hd + 'url': 'http://www.yapfiles.ru/get_player/?v=vMDE1NjcyNDUt0413', + 'md5': '2db19e2bfa2450568868548a1aa1956c', + 'info_dict': { + 'id': 'vMDE1NjcyNDUt0413', + 'ext': 'mp4', + 'title': 'Самый худший пароль WIFI', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 72, + }, + }, { + # without hd + 'url': 'https://api.yapfiles.ru/get_player/?uid=video_player_1872528&plroll=1&adv=1&v=vMDE4NzI1Mjgt690b', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [unescapeHTML(mobj.group('url')) for mobj in re.finditer( + r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?%s.*?)\1' + % YapFilesIE._YAPFILES_URL, webpage)] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id, fatal=False) + + player_url = None + query = {} + if webpage: + player_url = self._search_regex( + r'player\.init\s*\(\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'player url', default=None, group='url') + + if not player_url: + player_url = 'http://api.yapfiles.ru/load/%s/' % video_id + query = { + 'md5': 'ded5f369be61b8ae5f88e2eeb2f3caff', + 'type': 'json', + 'ref': url, + } + + player = self._download_json( + player_url, video_id, query=query)['player'] + + playlist_url = player['playlist'] + title = player['title'] + thumbnail = player.get('poster') + + if title == 'Ролик удален' or 'deleted.jpg' in (thumbnail or ''): + raise ExtractorError( + 'Video %s has been removed' % video_id, expected=True) + + playlist = self._download_json( + playlist_url, video_id)['player']['main'] + + hd_height = int_or_none(player.get('hd')) + + QUALITIES = ('sd', 'hd') + quality_key = qualities(QUALITIES) + formats = [] + for format_id in QUALITIES: + is_hd = format_id == 'hd' + format_url = url_or_none(playlist.get( + 'file%s' % ('_hd' if is_hd else ''))) + if not format_url: + continue + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'quality': quality_key(format_id), + 'height': hd_height if is_hd else None, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': int_or_none(player.get('length')), + 'formats': formats, + }
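The format loop above ranks the two variants with youtube-dl's qualities() helper, which maps a name to its index in the preference tuple so that _sort_formats puts 'hd' above 'sd'; unknown names fall back to -1:

    from youtube_dl.utils import qualities

    QUALITIES = ('sd', 'hd')
    quality_key = qualities(QUALITIES)
    print(quality_key('sd'), quality_key('hd'))  # 0 1 -- higher sorts better
    print(quality_key('4k'))                     # -1 for anything not listed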
diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/yesjapan.py youtube-dl-2019.05.20/youtube_dl/extractor/yesjapan.py --- youtube-dl-2016.02.22/youtube_dl/extractor/yesjapan.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/yesjapan.py 2019-06-07 18:49:07.000000000 +0000 @@ -21,7 +21,7 @@ 'ext': 'mp4', 'timestamp': 1416391590, 'upload_date': '20141119', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', } } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/yinyuetai.py youtube-dl-2019.05.20/youtube_dl/extractor/yinyuetai.py --- youtube-dl-2016.02.22/youtube_dl/extractor/yinyuetai.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/yinyuetai.py 2019-06-07 18:49:07.000000000 +0000 @@ -18,7 +18,7 @@ 'title': '少女时代_PARTY_Music Video Teaser', 'creator': '少女时代', 'duration': 25, - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, }, { 'url': 'http://v.yinyuetai.com/video/h5/2322376', diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/ynet.py youtube-dl-2019.05.20/youtube_dl/extractor/ynet.py --- youtube-dl-2016.02.22/youtube_dl/extractor/ynet.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/ynet.py 2019-06-07 18:49:07.000000000 +0000 @@ -9,7 +9,7 @@ class YnetIE(InfoExtractor): - _VALID_URL = r'http://(?:.+?\.)?ynet\.co\.il/(?:.+?/)?0,7340,(?P<id>L(?:-[0-9]+)+),00\.html' + _VALID_URL = r'https?://(?:.+?\.)?ynet\.co\.il/(?:.+?/)?0,7340,(?P<id>L(?:-[0-9]+)+),00\.html' _TESTS = [ { 'url': 'http://hot.ynet.co.il/home/0,7340,L-11659-99244,00.html', @@ -17,7 +17,7 @@ 'id': 'L-11659-99244', 'ext': 'flv', 'title': 'איש לא יודע מאיפה באנו', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', } }, { 'url': 'http://hot.ynet.co.il/home/0,7340,L-8859-84418,00.html', @@ -25,7 +25,7 @@ 'id': 'L-8859-84418', 'ext': 'flv', 'title': "צפו: הנשיקה הלוהטת של תורגי' ויוליה פלוטקין", - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', } } ] @@ -41,10 +41,12 @@ m = re.search(r'ynet - HOT -- (["\']+)(?P<title>.+?)\1', title) if m: title = m.group('title') + formats = self._extract_f4m_formats(f4m_url, video_id) + self._sort_formats(formats) return { 'id': video_id, 'title': title, - 'formats': self._extract_f4m_formats(f4m_url, video_id), + 'formats': formats, 'thumbnail': self._og_search_thumbnail(webpage), } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/youjizz.py youtube-dl-2019.05.20/youtube_dl/extractor/youjizz.py --- youtube-dl-2016.02.22/youtube_dl/extractor/youjizz.py 2016-02-14 22:26:17.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/youjizz.py 2019-06-07 18:49:07.000000000 +0000 @@ -4,58 +4,92 @@ from .common import InfoExtractor from ..utils import ( - ExtractorError, + determine_ext, + int_or_none, + parse_duration, + url_or_none, ) class YouJizzIE(InfoExtractor): - _VALID_URL = r'https?://(?:\w+\.)?youjizz\.com/videos/[^/#?]+-(?P<id>[0-9]+)\.html(?:$|[?#])' - _TEST = { + _VALID_URL = r'https?://(?:\w+\.)?youjizz\.com/videos/(?:[^/#?]*-(?P<id>\d+)\.html|embed/(?P<embed_id>\d+))' + _TESTS = [{ 'url': 'http://www.youjizz.com/videos/zeichentrick-1-2189178.html', - 'md5': '07e15fa469ba384c7693fd246905547c', + 'md5': 'b1e1dfaa8bb9537d8b84eeda9cf4acf4', 'info_dict': { 'id': '2189178', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Zeichentrick 1', 'age_limit': 18, + 'duration': 2874, } - } + }, { + 'url': 'http://www.youjizz.com/videos/-2189178.html', + 'only_matching': True, + }, { + 'url': 'https://www.youjizz.com/videos/embed/31991001', + 'only_matching': True, + }] def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') or mobj.group('embed_id') + webpage = self._download_webpage(url, video_id) - age_limit = self._rta_search(webpage) - video_title = self._html_search_regex( - r'<title>\s*(.*)\s*</title>', webpage, 'title') - - embed_page_url = self._search_regex( - r'(https?://www.youjizz.com/videos/embed/[0-9]+)', - webpage, 'embed page') - webpage = self._download_webpage( - embed_page_url, video_id, note='downloading embed page') - - # Get the video URL - m_playlist = re.search(r'so.addVariable\("playlist", ?"(?P<playlist>.+?)"\);', webpage) - if m_playlist is not None: - playlist_url = m_playlist.group('playlist') - playlist_page = self._download_webpage(playlist_url, video_id, - 'Downloading playlist page') - m_levels = list(re.finditer(r' + title = self._html_search_regex( + r'<title>(.+?)</title>', webpage, 'title') + + formats = [] + + encodings = self._parse_json( + self._search_regex( + r'encodings\s*=\s*(\[.+?\]);\n', webpage, 'encodings', + default='[]'), + video_id, fatal=False) + for encoding in encodings: + if not isinstance(encoding, dict): + continue + format_url = url_or_none(encoding.get('filename')) + if not format_url: + continue + if determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + else: + format_id = encoding.get('name') or encoding.get('quality') + height =
int_or_none(self._search_regex( + r'^(\d+)[pP]', format_id, 'height', default=None)) + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'height': height, + }) + + if formats: + info_dict = { + 'formats': formats, + } else: - video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<video_url>[^"]+)"\)\);', - webpage, 'video URL') + # YouJizz's HTML5 player has invalid HTML + webpage = webpage.replace('"controls', '" controls') + info_dict = self._parse_html5_media_entries( + url, webpage, video_id)[0] + + duration = parse_duration(self._search_regex( + r'Runtime:([^<]+)', webpage, 'duration', + default=None)) + uploader = self._search_regex( + r'Uploaded By:.*?<a[^>]*>([^<]+)', webpage, 'uploader', + default=None) - return { + info_dict.update({ 'id': video_id, - 'url': video_url, - 'title': video_title, - 'ext': 'flv', - 'format': 'flv', - 'player_url': embed_page_url, - 'age_limit': age_limit, - } + 'title': title, + 'age_limit': self._rta_search(webpage), + 'duration': duration, + 'uploader': uploader, + }) + + return info_dict diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/youku.py youtube-dl-2019.05.20/youtube_dl/extractor/youku.py --- youtube-dl-2016.02.22/youtube_dl/extractor/youku.py 2016-02-14 22:26:17.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/youku.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,19 +1,18 @@ # coding: utf-8 from __future__ import unicode_literals -import base64 import random +import re import string import time from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse, - compat_ord, -) from ..utils import ( ExtractorError, - sanitized_Request, + get_element_by_class, + js_to_json, + str_or_none, + strip_jsonp, ) @@ -22,7 +21,9 @@ IE_DESC = '优酷' _VALID_URL = r'''(?x) (?: - http://(?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)| + https?://( + (?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)| + video\.tudou\.com/v/)| youku:) (?P<id>[A-Za-z0-9]+)(?:\.html|/v\.swf|) ''' _TESTS = [{ # MD5 is unstable 'url': 'http://v.youku.com/v_show/id_XMTc1ODE5Njcy.html', 'info_dict': { - 'id': 'XMTc1ODE5Njcy_part1', + 'id': 'XMTc1ODE5Njcy', 'title': '★Smile﹗♡ Git Fresh -Booty Music舞蹈.', - 'ext': 'flv' + 'ext': 'mp4', + 'duration': 74.73, + 'thumbnail': r're:^https?://.*', + 'uploader': '。躲猫猫、', + 'uploader_id': '36017967', + 'uploader_url': 'http://i.youku.com/u/UMTQ0MDcxODY4', + 'tags': list, } }, { 'url': 'http://player.youku.com/player.php/sid/XNDgyMDQ2NTQw/v.swf', 'only_matching': True, }, { @@ -42,143 +49,82 @@ 'url': 'http://v.youku.com/v_show/id_XODgxNjg1Mzk2_ev_1.html', 'info_dict': { 'id': 'XODgxNjg1Mzk2', + 'ext': 'mp4', 'title': '武媚娘传奇 85', + 'duration': 1999.61, + 'thumbnail': r're:^https?://.*', + 'uploader': '疯狂豆花', + 'uploader_id': '62583473', + 'uploader_url': 'http://i.youku.com/u/UMjUwMzMzODky', + 'tags': list, }, - 'playlist_count': 11, - 'skip': 'Available in China only', }, { 'url': 'http://v.youku.com/v_show/id_XMTI1OTczNDM5Mg==.html', 'info_dict': { 'id': 'XMTI1OTczNDM5Mg', + 'ext': 'mp4', 'title': '花千骨 04', + 'duration': 2363, + 'thumbnail': r're:^https?://.*', + 'uploader': '放剧场-花千骨', + 'uploader_id': '772849359', + 'uploader_url': 'http://i.youku.com/u/UMzA5MTM5NzQzNg==', + 'tags': list, }, - 'playlist_count': 13, }, { 'url': 'http://v.youku.com/v_show/id_XNjA1NzA2Njgw.html', 'note': 'Video protected with password', 'info_dict': { 'id': 'XNjA1NzA2Njgw', + 'ext': 'mp4', 'title': '邢義田复旦讲座之想象中的胡人—从“左衽孔子”说起', + 'duration': 7264.5, + 'thumbnail': r're:^https?://.*', + 'uploader': 'FoxJin1006', +
'uploader_id': '322014285', + 'uploader_url': 'http://i.youku.com/u/UMTI4ODA1NzE0MA==', + 'tags': list, }, - 'playlist_count': 19, 'params': { 'videopassword': '100600', }, + }, { + # /play/get.json contains streams with "channel_type":"tail" + 'url': 'http://v.youku.com/v_show/id_XOTUxMzg4NDMy.html', + 'info_dict': { + 'id': 'XOTUxMzg4NDMy', + 'ext': 'mp4', + 'title': '我的世界☆明月庄主☆车震猎杀☆杀人艺术Minecraft', + 'duration': 702.08, + 'thumbnail': r're:^https?://.*', + 'uploader': '明月庄主moon', + 'uploader_id': '38465621', + 'uploader_url': 'http://i.youku.com/u/UMTUzODYyNDg0', + 'tags': list, + }, + }, { + 'url': 'http://video.tudou.com/v/XMjIyNzAzMTQ4NA==.html?f=46177805', + 'info_dict': { + 'id': 'XMjIyNzAzMTQ4NA', + 'ext': 'mp4', + 'title': '卡马乔国足开大脚长传冲吊集锦', + 'duration': 289, + 'thumbnail': r're:^https?://.*', + 'uploader': '阿卜杜拉之星', + 'uploader_id': '2382249', + 'uploader_url': 'http://i.youku.com/u/UOTUyODk5Ng==', + 'tags': list, + }, + }, { + 'url': 'http://video.tudou.com/v/XMjE4ODI3OTg2MA==.html', + 'only_matching': True, }] - def construct_video_urls(self, data): - # get sid, token - def yk_t(s1, s2): - ls = list(range(256)) - t = 0 - for i in range(256): - t = (t + ls[i] + compat_ord(s1[i % len(s1)])) % 256 - ls[i], ls[t] = ls[t], ls[i] - s = bytearray() - x, y = 0, 0 - for i in range(len(s2)): - y = (y + 1) % 256 - x = (x + ls[y]) % 256 - ls[x], ls[y] = ls[y], ls[x] - s.append(compat_ord(s2[i]) ^ ls[(ls[x] + ls[y]) % 256]) - return bytes(s) - - sid, token = yk_t( - b'becaf9be', base64.b64decode(data['security']['encrypt_string'].encode('ascii')) - ).decode('ascii').split('_') - - # get oip - oip = data['security']['ip'] - - fileid_dict = {} - for stream in data['stream']: - format = stream.get('stream_type') - fileid = stream['stream_fileid'] - fileid_dict[format] = fileid - - def get_fileid(format, n): - number = hex(int(str(n), 10))[2:].upper() - if len(number) == 1: - number = '0' + number - streamfileids = fileid_dict[format] - fileid = streamfileids[0:8] + number + streamfileids[10:] - return fileid - - # get ep - def generate_ep(format, n): - fileid = get_fileid(format, n) - ep_t = yk_t( - b'bf7e5f01', - ('%s_%s_%s' % (sid, fileid, token)).encode('ascii') - ) - ep = base64.b64encode(ep_t).decode('ascii') - return ep - - # generate video_urls - video_urls_dict = {} - for stream in data['stream']: - format = stream.get('stream_type') - video_urls = [] - for dt in stream['segs']: - n = str(stream['segs'].index(dt)) - param = { - 'K': dt['key'], - 'hd': self.get_hd(format), - 'myp': 0, - 'ypp': 0, - 'ctype': 12, - 'ev': 1, - 'token': token, - 'oip': oip, - 'ep': generate_ep(format, n) - } - video_url = \ - 'http://k.youku.com/player/getFlvPath/' + \ - 'sid/' + sid + \ - '_00' + \ - '/st/' + self.parse_ext_l(format) + \ - '/fileid/' + get_fileid(format, n) + '?' 
+ \ - compat_urllib_parse.urlencode(param) - video_urls.append(video_url) - video_urls_dict[format] = video_urls - - return video_urls_dict - @staticmethod def get_ysuid(): return '%d%s' % (int(time.time()), ''.join([ random.choice(string.ascii_letters) for i in range(3)])) - def get_hd(self, fm): - hd_id_dict = { - '3gp': '0', - '3gphd': '1', - 'flv': '0', - 'flvhd': '0', - 'mp4': '1', - 'mp4hd': '1', - 'mp4hd2': '1', - 'mp4hd3': '1', - 'hd2': '2', - 'hd3': '3', - } - return hd_id_dict[fm] - - def parse_ext_l(self, fm): - ext_dict = { - '3gp': 'flv', - '3gphd': 'mp4', - 'flv': 'flv', - 'flvhd': 'flv', - 'mp4': 'mp4', - 'mp4hd': 'mp4', - 'mp4hd2': 'flv', - 'mp4hd3': 'flv', - 'hd2': 'flv', - 'hd3': 'flv', - } - return ext_dict[fm] - def get_format_name(self, fm): _dict = { '3gp': 'h6', @@ -192,36 +138,40 @@ 'hd2': 'h2', 'hd3': 'h1', } - return _dict[fm] + return _dict.get(fm) def _real_extract(self, url): video_id = self._match_id(url) self._set_cookie('youku.com', '__ysuid', self.get_ysuid()) + self._set_cookie('youku.com', 'xreferrer', 'http://www.youku.com') - def retrieve_data(req_url, note): - headers = { - 'Referer': req_url, - } - self._set_cookie('youku.com', 'xreferrer', 'http://www.youku.com') - req = sanitized_Request(req_url, headers=headers) - - cn_verification_proxy = self._downloader.params.get('cn_verification_proxy') - if cn_verification_proxy: - req.add_header('Ytdl-request-proxy', cn_verification_proxy) + _, urlh = self._download_webpage_handle( + 'https://log.mmstat.com/eg.js', video_id, 'Retrieving cna info') + # The etag header is '"foobar"'; let's remove the double quotes + cna = urlh.headers['etag'][1:-1] - raw_data = self._download_json(req, video_id, note=note) - - return raw_data['data'] + # request basic data + basic_data_params = { + 'vid': video_id, + 'ccode': '0590', + 'client_ip': '192.168.1.1', + 'utid': cna, + 'client_ts': time.time() / 1000, + } video_password = self._downloader.params.get('videopassword') - - # request basic data - basic_data_url = 'http://play.youku.com/play/get.json?vid=%s&ct=12' % video_id if video_password: - basic_data_url += '&pwd=%s' % video_password + basic_data_params['password'] = video_password - data = retrieve_data(basic_data_url, 'Downloading JSON metadata') + headers = { + 'Referer': url, + } + headers.update(self.geo_verification_headers()) + data = self._download_json( + 'https://ups.youku.com/ups/get.json', video_id, + 'Downloading JSON metadata', + query=basic_data_params, headers=headers)['data'] error = data.get('error') if error: @@ -239,33 +189,121 @@ raise ExtractorError(msg) # get video title - title = data['video']['title'] + video_data = data['video'] + title = video_data['title'] - # generate video_urls_dict - video_urls_dict = self.construct_video_urls(data) - - # construct info - entries = [{ - 'id': '%s_part%d' % (video_id, i + 1), - 'title': title, - 'formats': [], - # some formats are not available for all parts, we have to detect - # which one has all - } for i in range(max(len(v.get('segs')) for v in data['stream']))] - for stream in data['stream']: - fm = stream.get('stream_type') - video_urls = video_urls_dict[fm] - for video_url, seg, entry in zip(video_urls, stream['segs'], entries): - entry['formats'].append({ - 'url': video_url, - 'format_id': self.get_format_name(fm), - 'ext': self.parse_ext_l(fm), - 'filesize': int(seg['size']), - }) + formats = [{ + 'url': stream['m3u8_url'], + 'format_id': self.get_format_name(stream.get('stream_type')), + 'ext': 'mp4', + 'protocol': 'm3u8_native', + 
'filesize': int(stream.get('size')), + 'width': stream.get('width'), + 'height': stream.get('height'), + } for stream in data['stream'] if stream.get('channel_type') != 'tail'] + self._sort_formats(formats) return { - '_type': 'multi_video', 'id': video_id, 'title': title, - 'entries': entries, + 'formats': formats, + 'duration': video_data.get('seconds'), + 'thumbnail': video_data.get('logo'), + 'uploader': video_data.get('username'), + 'uploader_id': str_or_none(video_data.get('userid')), + 'uploader_url': data.get('uploader', {}).get('homepage'), + 'tags': video_data.get('tags'), } + + +class YoukuShowIE(InfoExtractor): + _VALID_URL = r'https?://list\.youku\.com/show/id_(?P<id>[0-9a-z]+)\.html' + IE_NAME = 'youku:show' + + _TESTS = [{ + 'url': 'http://list.youku.com/show/id_zc7c670be07ff11e48b3f.html', + 'info_dict': { + 'id': 'zc7c670be07ff11e48b3f', + 'title': '花千骨 DVD版', + 'description': 'md5:a1ae6f5618571bbeb5c9821f9c81b558', + }, + 'playlist_count': 50, + }, { + # Episode number not starting from 1 + 'url': 'http://list.youku.com/show/id_zefbfbd70efbfbd780bef.html', + 'info_dict': { + 'id': 'zefbfbd70efbfbd780bef', + 'title': '超级飞侠3', + 'description': 'md5:275715156abebe5ccc2a1992e9d56b98', + }, + 'playlist_count': 24, + }, { + # Ongoing playlist. The initial page is the last one + 'url': 'http://list.youku.com/show/id_za7c275ecd7b411e1a19e.html', + 'only_matching': True, + }, { + # No data-id value. + 'url': 'http://list.youku.com/show/id_zefbfbd61237fefbfbdef.html', + 'only_matching': True, + }, { + # Wrong number of reload_id. + 'url': 'http://list.youku.com/show/id_z20eb4acaf5c211e3b2ad.html', + 'only_matching': True, + }] + + def _extract_entries(self, playlist_data_url, show_id, note, query): + query['callback'] = 'cb' + playlist_data = self._download_json( + playlist_data_url, show_id, query=query, note=note, + transform_source=lambda s: js_to_json(strip_jsonp(s))).get('html') + if playlist_data is None: + return [None, None] + drama_list = (get_element_by_class('p-drama-grid', playlist_data) + or get_element_by_class('p-drama-half-row', playlist_data)) + if drama_list is None: + raise ExtractorError('No episodes found') + video_urls = re.findall(r'<a[^>]+href="([^"]+)"', drama_list) + return playlist_data, [ + self.url_result(self._proto_relative_url(video_url, 'http:'), YoukuIE.ie_key()) + for video_url in video_urls] + + def _real_extract(self, url): + show_id = self._match_id(url) + webpage = self._download_webpage(url, show_id) + + entries = [] + page_config = self._parse_json(self._search_regex( + r'var\s+PageConfig\s*=\s*({.+});', webpage, 'page config'), + show_id, transform_source=js_to_json) + first_page, initial_entries = self._extract_entries( + 'http://list.youku.com/show/module', show_id, + note='Downloading initial playlist data page', + query={ + 'id': page_config['showid'], + 'tab': 'showInfo', + }) + first_page_reload_id = self._html_search_regex( + r'<div[^>]+id="(reload_\d+)', first_page, 'first page reload id') + # The first reload_id has the same items as first_page + reload_ids = re.findall('<li[^>]+data-id="([^"]+)">', first_page) + entries.extend(initial_entries) + for idx, reload_id in enumerate(reload_ids): + if reload_id == first_page_reload_id: + continue + _, new_entries = self._extract_entries( + 'http://list.youku.com/show/episode', show_id, + note='Downloading playlist data page %d' % (idx + 1), + query={ + 'id': page_config['showid'], + 'stage': reload_id, + }) + if new_entries is not None: + entries.extend(new_entries)
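Show pages above paginate via data-id reload markers: the initial module request both lists the first episodes and carries the reload ids, and every id except the one matching the first page triggers one more episode request. A schematic version, with a hypothetical fetch_page callable standing in for the two JSON endpoints:

    import re

    def iter_show_pages(first_page_html, first_reload_id, fetch_page):
        # reload ids are embedded as <li ... data-id="..."> entries in the first page
        reload_ids = re.findall(r'<li[^>]+data-id="([^"]+)">', first_page_html)
        for reload_id in reload_ids:
            if reload_id == first_reload_id:
                continue  # that stage repeats the initial page's episodes
            yield fetch_page(reload_id)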
+ desc = self._html_search_meta('description', webpage, fatal=False) + playlist_title = desc.split(',')[0] if desc else None + detail_li = get_element_by_class('p-intro', webpage) + playlist_description = get_element_by_class( + 'intro-more', detail_li) if detail_li else None + + return self.playlist_result( + entries, show_id, playlist_title, playlist_description) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/younow.py youtube-dl-2019.05.20/youtube_dl/extractor/younow.py --- youtube-dl-2016.02.22/youtube_dl/extractor/younow.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/younow.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,202 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + try_get, +) + +CDN_API_BASE = 'https://cdn.younow.com/php/api' +MOMENT_URL_FORMAT = '%s/moment/fetch/id=%%s' % CDN_API_BASE + + +class YouNowLiveIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?younow\.com/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://www.younow.com/AmandaPadeezy', + 'info_dict': { + 'id': 'AmandaPadeezy', + 'ext': 'mp4', + 'is_live': True, + 'title': 'March 26, 2017', + 'thumbnail': r're:^https?://.*\.jpg$', + 'tags': ['girls'], + 'categories': ['girls'], + 'uploader': 'AmandaPadeezy', + 'uploader_id': '6716501', + 'uploader_url': 'https://www.younow.com/AmandaPadeezy', + 'creator': 'AmandaPadeezy', + }, + 'skip': True, + } + + @classmethod + def suitable(cls, url): + return (False + if YouNowChannelIE.suitable(url) or YouNowMomentIE.suitable(url) + else super(YouNowLiveIE, cls).suitable(url)) + + def _real_extract(self, url): + username = self._match_id(url) + + data = self._download_json( + 'https://api.younow.com/php/api/broadcast/info/curId=0/user=%s' + % username, username) + + if data.get('errorCode') != 0: + raise ExtractorError(data['errorMsg'], expected=True) + + uploader = try_get( + data, lambda x: x['user']['profileUrlString'], + compat_str) or username + + return { + 'id': uploader, + 'is_live': True, + 'title': self._live_title(uploader), + 'thumbnail': data.get('awsUrl'), + 'tags': data.get('tags'), + 'categories': data.get('tags'), + 'uploader': uploader, + 'uploader_id': data.get('userId'), + 'uploader_url': 'https://www.younow.com/%s' % username, + 'creator': uploader, + 'view_count': int_or_none(data.get('viewers')), + 'like_count': int_or_none(data.get('likes')), + 'formats': [{ + 'url': '%s/broadcast/videoPath/hls=1/broadcastId=%s/channelId=%s' + % (CDN_API_BASE, data['broadcastId'], data['userId']), + 'ext': 'mp4', + 'protocol': 'm3u8', + }], + } + + +def _extract_moment(item, fatal=True): + moment_id = item.get('momentId') + if not moment_id: + if not fatal: + return + raise ExtractorError('Unable to extract moment id') + + moment_id = compat_str(moment_id) + + title = item.get('text') + if not title: + title = 'YouNow %s' % ( + item.get('momentType') or item.get('titleType') or 'moment') + + uploader = try_get(item, lambda x: x['owner']['name'], compat_str) + uploader_id = try_get(item, lambda x: x['owner']['userId']) + uploader_url = 'https://www.younow.com/%s' % uploader if uploader else None + + entry = { + 'extractor_key': 'YouNowMoment', + 'id': moment_id, + 'title': title, + 'view_count': int_or_none(item.get('views')), + 'like_count': int_or_none(item.get('likes')), + 'timestamp': int_or_none(item.get('created')), + 'creator': uploader, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'uploader_url': uploader_url, + 'formats': [{ + 'url': 'https://hls.younow.com/momentsplaylists/live/%s/%s.m3u8' + % (moment_id, moment_id), + 'ext': 'mp4', + 'protocol': 'm3u8_native', + }], + } + + return entry
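For illustration, feeding _extract_moment above a minimal item (field names as in the hunk; the sample values come from the YouNowMoment test case further down) yields a single HLS format whose playlist URL embeds the moment id twice:

    item = {'momentId': 20712117, 'text': 'YouNow capture',
            'owner': {'name': 'GABO...', 'userId': 35917228}}
    entry = _extract_moment(item)
    print(entry['formats'][0]['url'])
    # -> https://hls.younow.com/momentsplaylists/live/20712117/20712117.m3u8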
+ +class YouNowChannelIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?younow\.com/(?P<id>[^/]+)/channel' + _TEST = { + 'url': 'https://www.younow.com/its_Kateee_/channel', + 'info_dict': { + 'id': '14629760', + 'title': 'its_Kateee_ moments' + }, + 'playlist_mincount': 8, + } + + def _entries(self, username, channel_id): + created_before = 0 + for page_num in itertools.count(1): + if created_before is None: + break + info = self._download_json( + '%s/moment/profile/channelId=%s/createdBefore=%d/records=20' + % (CDN_API_BASE, channel_id, created_before), username, + note='Downloading moments page %d' % page_num) + items = info.get('items') + if not items or not isinstance(items, list): + break + for item in items: + if not isinstance(item, dict): + continue + item_type = item.get('type') + if item_type == 'moment': + entry = _extract_moment(item, fatal=False) + if entry: + yield entry + elif item_type == 'collection': + moments = item.get('momentsIds') + if isinstance(moments, list): + for moment_id in moments: + m = self._download_json( + MOMENT_URL_FORMAT % moment_id, username, + note='Downloading %s moment JSON' % moment_id, + fatal=False) + if m and isinstance(m, dict) and m.get('item'): + entry = _extract_moment(m['item']) + if entry: + yield entry + created_before = int_or_none(item.get('created')) + + def _real_extract(self, url): + username = self._match_id(url) + channel_id = compat_str(self._download_json( + 'https://api.younow.com/php/api/broadcast/info/curId=0/user=%s' + % username, username, note='Downloading user information')['userId']) + return self.playlist_result( + self._entries(username, channel_id), channel_id, + '%s moments' % username) + + +class YouNowMomentIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?younow\.com/[^/]+/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://www.younow.com/GABO.../20712117/36319236/3b316doc/m', + 'md5': 'a30c70eadb9fb39a1aa3c8c0d22a0807', + 'info_dict': { + 'id': '20712117', + 'ext': 'mp4', + 'title': 'YouNow capture', + 'view_count': int, + 'like_count': int, + 'timestamp': 1490432040, + 'upload_date': '20170325', + 'uploader': 'GABO...', + 'uploader_id': 35917228, + }, + } + + @classmethod + def suitable(cls, url): + return (False + if YouNowChannelIE.suitable(url) + else super(YouNowMomentIE, cls).suitable(url)) + + def _real_extract(self, url): + video_id = self._match_id(url) + item = self._download_json(MOMENT_URL_FORMAT % video_id, video_id) + return _extract_moment(item['item']) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/youporn.py youtube-dl-2019.05.20/youtube_dl/extractor/youporn.py --- youtube-dl-2016.02.22/youtube_dl/extractor/youporn.py 2016-02-01 10:41:30.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/youporn.py 2019-06-07 18:49:07.000000000 +0000 @@ -9,6 +9,7 @@ str_to_int, unescapeHTML, unified_strdate, + url_or_none, ) from ..aes import aes_decrypt_text @@ -17,16 +18,16 @@ _VALID_URL = r'https?://(?:www\.)?youporn\.com/watch/(?P<id>\d+)/(?P<display_id>[^/?#&]+)' _TESTS = [{ 'url': 'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/', - 'md5': '71ec5fcfddacf80f495efa8b6a8d9a89', + 'md5': '3744d24c50438cf5b6f6d59feb5055c2', 'info_dict': { 'id': '505835', 'display_id': 'sex-ed-is-it-safe-to-masturbate-daily', 'ext': 'mp4', 'title': 'Sex Ed: Is It
Safe To Masturbate Daily?', 'description': 'Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'Ask Dan And Jennifer', - 'upload_date': '20101221', + 'upload_date': '20101217', 'average_rating': int, 'view_count': int, 'comment_count': int, @@ -35,7 +36,7 @@ 'age_limit': 18, }, }, { - # Anonymous User uploader + # Unknown uploader 'url': 'http://www.youporn.com/watch/561726/big-tits-awesome-brunette-on-amazing-webcam-show/?from=related3&al=2&from_id=561726&pos=4', 'info_dict': { 'id': '561726', @@ -43,9 +44,9 @@ 'ext': 'mp4', 'title': 'Big Tits Awesome Brunette On amazing webcam show', 'description': 'http://sweetlivegirls.com Big Tits Awesome Brunette On amazing webcam show.mp4', - 'thumbnail': 're:^https?://.*\.jpg$', - 'uploader': 'Anonymous User', - 'upload_date': '20111125', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Unknown', + 'upload_date': '20110418', 'average_rating': int, 'view_count': int, 'comment_count': int, @@ -67,29 +68,45 @@ request.add_header('Cookie', 'age_verified=1') webpage = self._download_webpage(request, display_id) - title = self._search_regex( - [r'(?:video_titles|videoTitle)\s*[:=]\s*(["\'])(?P.+?)\1', - r'<h1[^>]+class=["\']heading\d?["\'][^>]*>([^<])<'], - webpage, 'title', group='title') + title = self._html_search_regex( + r'(?s)<div[^>]+class=["\']watchVideoTitle[^>]+>(.+?)</div>', + webpage, 'title', default=None) or self._og_search_title( + webpage, default=None) or self._html_search_meta( + 'title', webpage, fatal=True) links = [] + # Main source + definitions = self._parse_json( + self._search_regex( + r'mediaDefinition\s*=\s*(\[.+?\]);', webpage, + 'media definitions', default='[]'), + video_id, fatal=False) + if definitions: + for definition in definitions: + if not isinstance(definition, dict): + continue + video_url = url_or_none(definition.get('videoUrl')) + if video_url: + links.append(video_url) + + # Fallback #1, this also contains extra low quality 180p format + for _, link in re.findall(r'<a[^>]+href=(["\'])(http.+?)\1[^>]+title=["\']Download [Vv]ideo', webpage): + links.append(link) + + # Fallback #2 (unavailable as at 22.06.2017) sources = self._search_regex( - r'sources\s*:\s*({.+?})', webpage, 'sources', default=None) + r'(?s)sources\s*:\s*({.+?})', webpage, 'sources', default=None) if sources: for _, link in re.findall(r'[^:]+\s*:\s*(["\'])(http.+?)\1', sources): links.append(link) - # Fallback #1 + # Fallback #3 (unavailable as at 22.06.2017) for _, link in re.findall( - r'(?:videoUrl|videoSrc|videoIpadUrl|html5PlayerSrc)\s*[:=]\s*(["\'])(http.+?)\1', webpage): - links.append(link) - - # Fallback #2, this also contains extra low quality 180p format - for _, link in re.findall(r'<a[^>]+href=(["\'])(http.+?)\1[^>]+title=["\']Download [Vv]ideo', webpage): + r'(?:videoSrc|videoIpadUrl|html5PlayerSrc)\s*[:=]\s*(["\'])(http.+?)\1', webpage): links.append(link) - # Fallback #3, encrypted links + # Fallback #4, encrypted links (unavailable as at 22.06.2017) for _, encrypted_link in re.findall( r'encryptedQuality\d{3,4}URL\s*=\s*(["\'])([\da-zA-Z+/=]+)\1', webpage): links.append(aes_decrypt_text(encrypted_link, title, 32).decode('utf-8')) @@ -101,8 +118,9 @@ } # Video URL's path looks like this: # /201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4 + # 
/201012/17/505835/vl_240p_240k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4 # We will benefit from it by extracting some metadata - mobj = re.search(r'/(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+/', video_url) + mobj = re.search(r'(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+/', video_url) if mobj: height = int(mobj.group('height')) bitrate = int(mobj.group('bitrate')) @@ -114,42 +132,47 @@ formats.append(f) self._sort_formats(formats) - description = self._og_search_description(webpage, default=None) + description = self._html_search_regex( + r'(?s)<div[^>]+\bid=["\']description["\'][^>]*>(.+?)</div>', + webpage, 'description', + default=None) or self._og_search_description( + webpage, default=None) thumbnail = self._search_regex( r'(?:imageurl\s*=|poster\s*:)\s*(["\'])(?P<thumbnail>.+?)\1', webpage, 'thumbnail', fatal=False, group='thumbnail') uploader = self._html_search_regex( - r'(?s)<div[^>]+class=["\']videoInfoBy(?:\s+[^"\']+)?["\'][^>]*>\s*By:\s*</div>(.+?)</(?:a|div)>', + r'(?s)<div[^>]+class=["\']submitByLink["\'][^>]*>(.+?)</div>', webpage, 'uploader', fatal=False) upload_date = unified_strdate(self._html_search_regex( - r'(?s)<div[^>]+class=["\']videoInfoTime["\'][^>]*>(.+?)</div>', + [r'Date\s+[Aa]dded:\s*<span>([^<]+)', + r'(?s)<div[^>]+class=["\']videoInfo(?:Date|Time)["\'][^>]*>(.+?)</div>'], webpage, 'upload date', fatal=False)) age_limit = self._rta_search(webpage) average_rating = int_or_none(self._search_regex( - r'<div[^>]+class=["\']videoInfoRating["\'][^>]*>\s*<div[^>]+class=["\']videoRatingPercentage["\'][^>]*>(\d+)%</div>', + r'<div[^>]+class=["\']videoRatingPercentage["\'][^>]*>(\d+)%</div>', webpage, 'average rating', fatal=False)) view_count = str_to_int(self._search_regex( - r'(?s)<div[^>]+class=["\']videoInfoViews["\'][^>]*>.*?([\d,.]+)\s*</div>', - webpage, 'view count', fatal=False)) + r'(?s)<div[^>]+class=(["\']).*?\bvideoInfoViews\b.*?\1[^>]*>.*?(?P<count>[\d,.]+)<', + webpage, 'view count', fatal=False, group='count')) comment_count = str_to_int(self._search_regex( r'>All [Cc]omments? 
\(([\d,.]+)\)', webpage, 'comment count', fatal=False)) - def extract_tag_box(title): - tag_box = self._search_regex( - (r'<div[^>]+class=["\']tagBoxTitle["\'][^>]*>\s*%s\b.*?</div>\s*' - '<div[^>]+class=["\']tagBoxContent["\']>(.+?)</div>') % re.escape(title), - webpage, '%s tag box' % title, default=None) + def extract_tag_box(regex, title): + tag_box = self._search_regex(regex, webpage, title, default=None) if not tag_box: return [] return re.findall(r'<a[^>]+href=[^>]+>([^<]+)', tag_box) - categories = extract_tag_box('Category') - tags = extract_tag_box('Tags') + categories = extract_tag_box( + r'(?s)Categories:.*?</[^>]+>(.+?)</div>', 'categories') + tags = extract_tag_box( + r'(?s)Tags:.*?</div>\s*<div[^>]+class=["\']tagBoxContent["\'][^>]*>(.+?)</div>', + 'tags') return { 'id': video_id, diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/yourporn.py youtube-dl-2019.05.20/youtube_dl/extractor/yourporn.py --- youtube-dl-2016.02.22/youtube_dl/extractor/yourporn.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/yourporn.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,57 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + parse_duration, + urljoin, +) + + +class YourPornIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?:yourporn\.sexy|sxyprn\.com)/post/(?P<id>[^/?#&.]+)' + _TESTS = [{ + 'url': 'https://yourporn.sexy/post/57ffcb2e1179b.html', + 'md5': '6f8682b6464033d87acaa7a8ff0c092e', + 'info_dict': { + 'id': '57ffcb2e1179b', + 'ext': 'mp4', + 'title': 'md5:c9f43630bd968267672651ba905a7d35', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 165, + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://sxyprn.com/post/57ffcb2e1179b.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video_url = urljoin(url, self._parse_json( + self._search_regex( + r'data-vnfo=(["\'])(?P<data>{.+?})\1', webpage, 'data info', + group='data'), + video_id)[video_id]).replace('/cdn/', '/cdn4/') + + title = (self._search_regex( + r'<[^>]+\bclass=["\']PostEditTA[^>]+>([^<]+)', webpage, 'title', + default=None) or self._og_search_description(webpage)).strip() + thumbnail = self._og_search_thumbnail(webpage) + duration = parse_duration(self._search_regex( + r'duration\s*:\s*<[^>]+>([\d:]+)', webpage, 'duration', + default=None)) + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'age_limit': 18, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/yourupload.py youtube-dl-2019.05.20/youtube_dl/extractor/yourupload.py --- youtube-dl-2016.02.22/youtube_dl/extractor/yourupload.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/yourupload.py 2019-06-07 18:49:07.000000000 +0000 @@ -2,44 +2,37 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import urljoin class YourUploadIE(InfoExtractor): - _VALID_URL = r'''(?x)https?://(?:www\.)? 
- (?:yourupload\.com/watch| - embed\.yourupload\.com| - embed\.yucache\.net - )/(?P<id>[A-Za-z0-9]+) - ''' - _TESTS = [ - { - 'url': 'http://yourupload.com/watch/14i14h', - 'md5': '5e2c63385454c557f97c4c4131a393cd', - 'info_dict': { - 'id': '14i14h', - 'ext': 'mp4', - 'title': 'BigBuckBunny_320x180.mp4', - 'thumbnail': 're:^https?://.*\.jpe?g', - } - }, - { - 'url': 'http://embed.yourupload.com/14i14h', - 'only_matching': True, - }, - { - 'url': 'http://embed.yucache.net/14i14h?client_file_id=803349', - 'only_matching': True, - }, - ] + _VALID_URL = r'https?://(?:www\.)?(?:yourupload\.com/(?:watch|embed)|embed\.yourupload\.com)/(?P<id>[A-Za-z0-9]+)' + _TESTS = [{ + 'url': 'http://yourupload.com/watch/14i14h', + 'md5': '5e2c63385454c557f97c4c4131a393cd', + 'info_dict': { + 'id': '14i14h', + 'ext': 'mp4', + 'title': 'BigBuckBunny_320x180.mp4', + 'thumbnail': r're:^https?://.*\.jpe?g', + } + }, { + 'url': 'http://www.yourupload.com/embed/14i14h', + 'only_matching': True, + }, { + 'url': 'http://embed.yourupload.com/14i14h', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - embed_url = 'http://embed.yucache.net/{0:}'.format(video_id) + embed_url = 'http://www.yourupload.com/embed/%s' % video_id + webpage = self._download_webpage(embed_url, video_id) title = self._og_search_title(webpage) - video_url = self._og_search_video_url(webpage) + video_url = urljoin(embed_url, self._og_search_video_url(webpage)) thumbnail = self._og_search_thumbnail(webpage, default=None) return { diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/youtube.py youtube-dl-2019.05.20/youtube_dl/extractor/youtube.py --- youtube-dl-2016.02.22/youtube_dl/extractor/youtube.py 2016-02-20 13:02:05.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/youtube.py 2019-06-07 18:49:07.000000000 +0000 @@ -6,6 +6,7 @@ import itertools import json import os.path +import random import re import time import traceback @@ -15,17 +16,19 @@ from ..swfinterp import SWFInterpreter from ..compat import ( compat_chr, + compat_HTTPError, + compat_kwargs, compat_parse_qs, - compat_urllib_parse, compat_urllib_parse_unquote, compat_urllib_parse_unquote_plus, + compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, compat_urlparse, compat_str, ) from ..utils import ( clean_html, - encode_dict, + dict_get, error_to_compat_str, ExtractorError, float_or_none, @@ -34,17 +37,21 @@ int_or_none, mimetype2ext, orderedSet, + parse_codecs, parse_duration, + qualities, remove_quotes, remove_start, - sanitized_Request, smuggle_url, + str_or_none, str_to_int, + try_get, unescapeHTML, unified_strdate, unsmuggle_url, uppercase_escape, - ISO3166Utils, + url_or_none, + urlencode_postdata, ) @@ -52,10 +59,17 @@ """Provide base functions for Youtube extractors""" _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge' + + _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup' + _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge' + _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}' + _NETRC_MACHINE = 'youtube' # If True it will raise an error if no login info is provided _LOGIN_REQUIRED = False + _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)[0-9A-Za-z-_]{10,}' + def _set_language(self): self._set_cookie( '.youtube.com', 'PREF', 'f1=50000000&hl=en', @@ -75,10 +89,10 @@ If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised. 
""" - (username, password) = self._get_login_info() + username, password = self._get_login_info() # No authentication to be performed if username is None: - if self._LOGIN_REQUIRED: + if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None: raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True) return True @@ -89,90 +103,172 @@ if login_page is False: return - galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"', - login_page, 'Login GALX parameter') + login_form = self._hidden_inputs(login_page) - # Log in - login_form_strs = { - 'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1', - 'Email': username, - 'GALX': galx, - 'Passwd': password, - - 'PersistentCookie': 'yes', - '_utf8': '霱', - 'bgresponse': 'js_disabled', - 'checkConnection': '', - 'checkedDomains': 'youtube', - 'dnConn': '', - 'pstMsg': '0', - 'rmShown': '1', - 'secTok': '', - 'signIn': 'Sign in', - 'timeStmp': '', - 'service': 'youtube', - 'uilel': '3', - 'hl': 'en_US', - } + def req(url, f_req, note, errnote): + data = login_form.copy() + data.update({ + 'pstMsg': 1, + 'checkConnection': 'youtube', + 'checkedDomains': 'youtube', + 'hl': 'en', + 'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]', + 'f.req': json.dumps(f_req), + 'flowName': 'GlifWebSignIn', + 'flowEntry': 'ServiceLogin', + }) + return self._download_json( + url, None, note=note, errnote=errnote, + transform_source=lambda s: re.sub(r'^[^[]*', '', s), + fatal=False, + data=urlencode_postdata(data), headers={ + 'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8', + 'Google-Accounts-XSRF': 1, + }) - login_data = compat_urllib_parse.urlencode(encode_dict(login_form_strs)).encode('ascii') + def warn(message): + self._downloader.report_warning(message) - req = sanitized_Request(self._LOGIN_URL, login_data) - login_results = self._download_webpage( - req, None, - note='Logging in', errnote='unable to log in', fatal=False) - if login_results is False: - return False + lookup_req = [ + username, + None, [], None, 'US', None, None, 2, False, True, + [ + None, None, + [2, 1, None, 1, + 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', + None, [], 4], + 1, [None, None, []], None, None, None, True + ], + username, + ] + + lookup_results = req( + self._LOOKUP_URL, lookup_req, + 'Looking up account info', 'Unable to look up account info') - if re.search(r'id="errormsg_0_Passwd"', login_results) is not None: - raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True) + if lookup_results is False: + return False - # Two-Factor - # TODO add SMS and phone call support - these require making a request and then prompting the user + user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str) + if not user_hash: + warn('Unable to extract user hash') + return False - if re.search(r'(?i)<form[^>]* id="challenge"', login_results) is not None: - tfa_code = self._get_tfa_info('2-step verification code') + challenge_req = [ + user_hash, + None, 1, None, [1, None, None, None, [password, None, True]], + [ + None, None, [2, 1, None, 1, 
'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4], + 1, [None, None, []], None, None, None, True + ]] + + challenge_results = req( + self._CHALLENGE_URL, challenge_req, + 'Logging in', 'Unable to log in') - if not tfa_code: - self._downloader.report_warning( - 'Two-factor authentication required. Provide it either interactively or with --twofactor <code>' - '(Note that only TOTP (Google Authenticator App) codes work at this time.)') - return False + if challenge_results is False: + return - tfa_code = remove_start(tfa_code, 'G-') + login_res = try_get(challenge_results, lambda x: x[0][5], list) + if login_res: + login_msg = try_get(login_res, lambda x: x[5], compat_str) + warn( + 'Unable to login: %s' % 'Invalid password' + if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg) + return False - tfa_form_strs = self._form_hidden_inputs('challenge', login_results) + res = try_get(challenge_results, lambda x: x[0][-1], list) + if not res: + warn('Unable to extract result entry') + return False - tfa_form_strs.update({ - 'Pin': tfa_code, - 'TrustDevice': 'on', - }) + login_challenge = try_get(res, lambda x: x[0][0], list) + if login_challenge: + challenge_str = try_get(login_challenge, lambda x: x[2], compat_str) + if challenge_str == 'TWO_STEP_VERIFICATION': + # SEND_SUCCESS - TFA code has been successfully sent to phone + # QUOTA_EXCEEDED - reached the limit of TFA codes + status = try_get(login_challenge, lambda x: x[5], compat_str) + if status == 'QUOTA_EXCEEDED': + warn('Exceeded the limit of TFA codes, try later') + return False + + tl = try_get(challenge_results, lambda x: x[1][2], compat_str) + if not tl: + warn('Unable to extract TL') + return False + + tfa_code = self._get_tfa_info('2-step verification code') + + if not tfa_code: + warn( + 'Two-factor authentication required. Provide it either interactively or with --twofactor <code>' + '(Note that only TOTP (Google Authenticator App) codes work at this time.)') + return False + + tfa_code = remove_start(tfa_code, 'G-') + + tfa_req = [ + user_hash, None, 2, None, + [ + 9, None, None, None, None, None, None, None, + [None, tfa_code, True, 2] + ]] + + tfa_results = req( + self._TFA_URL.format(tl), tfa_req, + 'Submitting TFA code', 'Unable to submit TFA code') + + if tfa_results is False: + return False + + tfa_res = try_get(tfa_results, lambda x: x[0][5], list) + if tfa_res: + tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str) + warn( + 'Unable to finish TFA: %s' % 'Invalid TFA code' + if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg) + return False - tfa_data = compat_urllib_parse.urlencode(encode_dict(tfa_form_strs)).encode('ascii') + check_cookie_url = try_get( + tfa_results, lambda x: x[0][-1][2], compat_str) + else: + CHALLENGES = { + 'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.", + 'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.', + 'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.", + } + challenge = CHALLENGES.get( + challenge_str, + '%s returned error %s.' % (self.IE_NAME, challenge_str)) + warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' 
% challenge) + return False + else: + check_cookie_url = try_get(res, lambda x: x[2], compat_str) - tfa_req = sanitized_Request(self._TWOFACTOR_URL, tfa_data) - tfa_results = self._download_webpage( - tfa_req, None, - note='Submitting TFA code', errnote='unable to submit tfa', fatal=False) + if not check_cookie_url: + warn('Unable to extract CheckCookie URL') + return False - if tfa_results is False: - return False + check_cookie_results = self._download_webpage( + check_cookie_url, None, 'Checking cookie', fatal=False) - if re.search(r'(?i)<form[^>]* id="challenge"', tfa_results) is not None: - self._downloader.report_warning('Two-factor code expired or invalid. Please try again, or use a one-use backup code instead.') - return False - if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None: - self._downloader.report_warning('unable to log in - did the page structure change?') - return False - if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None: - self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.') - return False + if check_cookie_results is False: + return False - if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None: - self._downloader.report_warning('unable to log in: bad username or password') + if 'https://myaccount.google.com/' not in check_cookie_results: + warn('Unable to log in') return False + return True + def _download_webpage_handle(self, *args, **kwargs): + query = kwargs.get('query', {}).copy() + query['disable_polymer'] = 'true' + kwargs['query'] = query + return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle( + *args, **compat_kwargs(kwargs)) + def _real_initialize(self): if self._downloader is None: return @@ -193,10 +289,25 @@ if not mobj: break - more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), playlist_id, - 'Downloading page #%s' % page_num, - transform_source=uppercase_escape) + count = 0 + retries = 3 + while count <= retries: + try: + # Downloading page may result in intermittent 5xx HTTP error + # that is usually worked around with a retry + more = self._download_json( + 'https://youtube.com/%s' % mobj.group('more'), playlist_id, + 'Downloading page #%s%s' + % (page_num, ' (retry #%d)' % count if count else ''), + transform_source=uppercase_escape) + break + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503): + count += 1 + if count <= retries: + continue + raise + content_html = more['content_html'] if not content_html.strip(): # Some webpages show a "Load more" button but they don't @@ -233,7 +344,9 @@ class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): def _process_page(self, content): - for playlist_id in orderedSet(re.findall(r'href="/?playlist\?list=([0-9A-Za-z-_]{10,})"', content)): + for playlist_id in orderedSet(re.findall( + r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"', + content)): yield self.url_result( 'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist') @@ -252,8 +365,14 @@ (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/| (?:www\.)?deturl\.com/www\.youtube\.com/| (?:www\.)?pwnyoutube\.com/| + (?:www\.)?hooktube\.com/| (?:www\.)?yourepeat\.com/| tube\.majestyc\.net/| + (?:(?:www|dev)\.)?invidio\.us/| + (?:www\.)?invidiou\.sh/| + (?:www\.)?invidious\.snopyta\.org/| + 
(?:www\.)?invidious\.kabi\.tk/| + (?:www\.)?vid\.wxzm\.sx/| youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains (?:.*?\#/)? # handle anchor (#/) redirect urls (?: # the various things that can precede the ID: @@ -267,15 +386,21 @@ )) |(?: youtu\.be| # just youtu.be/xxxx - vid\.plus # or vid.plus/xxxx + vid\.plus| # or vid.plus/xxxx + zwearz\.com/watch| # or zwearz.com/watch/xxxx )/ |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId= ) )? # all until now is optional -> you can pass the naked ID ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID - (?!.*?&list=) # combined list/video URLs are handled by the playlist IE + (?!.*?\blist= + (?: + %(playlist_id)s| # combined list/video URLs are handled by the playlist IE + WL # WL are handled by the watch later IE + ) + ) (?(1).+)? # if we found the ID, everything can follow - $""" + $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' _formats = { '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, @@ -308,6 +433,7 @@ '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, # Apple HTTP Live Streaming + '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, @@ -317,75 +443,87 @@ '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10}, # DASH mp4 video - '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, - '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, - '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, - '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, - '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, - '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559) - '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, - '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, - '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60, 'preference': -40}, - '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60, 'preference': -40}, - '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, + '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '137': {'ext': 'mp4', 
'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559) + '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, + '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, + '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'}, # Dash mp4 audio - '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'}, - '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'}, - '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'}, + '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'}, + '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'}, + '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'}, + '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, + '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, + '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'}, + '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'}, # Dash webm - '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, - '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, - '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, - '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, - '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, - '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, - '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9', 'preference': -40}, - '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, - '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, - '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, - '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, - '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, - '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, - '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 
'vcodec': 'vp9', 'preference': -40}, - '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, + '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'}, + '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'}, # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug) - '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, - '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40}, - '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40}, - '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40}, - '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, - '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40}, + '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, # Dash webm audio - '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50}, - '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50}, + '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128}, + '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256}, # Dash webm audio with opus inside - '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 
'preference': -50}, - '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50}, - '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50}, + '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50}, + '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70}, + '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160}, # RTMP (unnamed) '_rtmp': {'protocol': 'rtmp'}, } - _SUBTITLE_FORMATS = ('ttml', 'vtt') + _SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt') + + _GEO_BYPASS = False IE_NAME = 'youtube' _TESTS = [ { - 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9', + 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9', 'info_dict': { 'id': 'BaW_jenozKc', 'ext': 'mp4', 'title': 'youtube-dl test video "\'/\\ä↭𝕐', 'uploader': 'Philipp Hagemeister', 'uploader_id': 'phihag', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', + 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q', + 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q', 'upload_date': '20121002', 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', 'categories': ['Science & Technology'], 'tags': ['youtube-dl'], + 'duration': 10, + 'view_count': int, 'like_count': int, 'dislike_count': int, 'start_time': 1, @@ -393,7 +531,7 @@ } }, { - 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY', + 'url': 'https://www.youtube.com/watch?v=UxxajLWwzqY', 'note': 'Test generic use_cipher_signature video (#897)', 'info_dict': { 'id': 'UxxajLWwzqY', @@ -401,13 +539,17 @@ 'upload_date': '20120506', 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]', 'alt_title': 'I Love It (feat. Charli XCX)', - 'description': 'md5:782e8651347686cba06e58f71ab51773', + 'description': 'md5:f3ceb5ef83a08d95b9d146f973157cc8', 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli', 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop', 'iconic ep', 'iconic', 'love', 'it'], + 'duration': 180, 'uploader': 'Icona Pop', 'uploader_id': 'IconaPop', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop', 'creator': 'Icona Pop', + 'track': 'I Love It (feat. 
Charli XCX)', + 'artist': 'Icona Pop', } }, { @@ -417,12 +559,16 @@ 'id': '07FYdnEawAQ', 'ext': 'mp4', 'upload_date': '20130703', - 'title': 'Justin Timberlake - Tunnel Vision (Explicit)', + 'title': 'Justin Timberlake - Tunnel Vision (Official Music Video) (Explicit)', 'alt_title': 'Tunnel Vision', - 'description': 'md5:64249768eec3bc4276236606ea996373', + 'description': 'md5:07dab3356cde4199048e4c7cd93471e1', + 'duration': 419, 'uploader': 'justintimberlakeVEVO', 'uploader_id': 'justintimberlakeVEVO', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO', 'creator': 'Justin Timberlake', + 'track': 'Tunnel Vision', + 'artist': 'Justin Timberlake', 'age_limit': 18, } }, @@ -437,11 +583,12 @@ 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7', 'uploader': 'SET India', 'uploader_id': 'setindia', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia', 'age_limit': 18, } }, { - 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY', + 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY', 'note': 'Use the first video ID in the URL', 'info_dict': { 'id': 'BaW_jenozKc', @@ -449,10 +596,13 @@ 'title': 'youtube-dl test video "\'/\\ä↭𝕐', 'uploader': 'Philipp Hagemeister', 'uploader_id': 'phihag', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', 'upload_date': '20121002', 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', 'categories': ['Science & Technology'], 'tags': ['youtube-dl'], + 'duration': 10, + 'view_count': int, 'like_count': int, 'dislike_count': int, }, @@ -461,13 +611,14 @@ }, }, { - 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I', + 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I', 'note': '256k DASH audio (format 141) via DASH manifest', 'info_dict': { 'id': 'a9LDPn-MO4I', 'ext': 'm4a', 'upload_date': '20121002', 'uploader_id': '8KVIDEO', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO', 'description': '', 'uploader': '8KVIDEO', 'title': 'UHDTV TEST 8K VIDEO.mp4' @@ -476,6 +627,7 @@ 'youtube_include_dash_manifest': True, 'format': '141', }, + 'skip': 'format 141 not served anymore', }, # DASH manifest with encrypted signature { @@ -483,15 +635,16 @@ 'info_dict': { 'id': 'IB3lcPjvWLA', 'ext': 'm4a', - 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson', - 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d', + 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. 
Spree Wilson', + 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf', + 'duration': 244, 'uploader': 'AfrojackVEVO', 'uploader_id': 'AfrojackVEVO', 'upload_date': '20131011', }, 'params': { 'youtube_include_dash_manifest': True, - 'format': '141', + 'format': '141/bestaudio[ext=m4a]', }, }, # JS player signature function name containing $ @@ -501,8 +654,8 @@ 'id': 'nfWlot6h_JM', 'ext': 'm4a', 'title': 'Taylor Swift - Shake It Off', - 'alt_title': 'Shake It Off', - 'description': 'md5:95f66187cd7c8b2c13eb78e1223b63c3', + 'description': 'md5:bec2185232c05479482cb5a9b82719bf', + 'duration': 242, 'uploader': 'TaylorSwiftVEVO', 'uploader_id': 'TaylorSwiftVEVO', 'upload_date': '20140818', @@ -510,7 +663,7 @@ }, 'params': { 'youtube_include_dash_manifest': True, - 'format': '141', + 'format': '141/bestaudio[ext=m4a]', }, }, # Controversy video @@ -519,49 +672,58 @@ 'info_dict': { 'id': 'T4XJQO3qol8', 'ext': 'mp4', + 'duration': 219, 'upload_date': '20100909', - 'uploader': 'The Amazing Atheist', + 'uploader': 'Amazing Atheist', 'uploader_id': 'TheAmazingAtheist', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist', 'title': 'Burning Everyone\'s Koran', 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html', } }, # Normal age-gate video (No vevo, embed allowed) { - 'url': 'http://youtube.com/watch?v=HtVdAasjOgU', + 'url': 'https://youtube.com/watch?v=HtVdAasjOgU', 'info_dict': { 'id': 'HtVdAasjOgU', 'ext': 'mp4', 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer', - 'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}', + 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}', + 'duration': 142, 'uploader': 'The Witcher', 'uploader_id': 'WitcherGame', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame', 'upload_date': '20140605', 'age_limit': 18, }, }, # Age-gate video with encrypted signature { - 'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU', + 'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU', 'info_dict': { 'id': '6kLq3WMV1nU', 'ext': 'mp4', 'title': 'Dedication To My Ex (Miss That) (Lyric Video)', 'description': 'md5:33765bb339e1b47e7e72b5490139bb41', + 'duration': 246, 'uploader': 'LloydVEVO', 'uploader_id': 'LloydVEVO', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO', 'upload_date': '20110629', 'age_limit': 18, }, }, - # video_info is None (https://github.com/rg3/youtube-dl/issues/4421) + # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421) + # YouTube Red ad is not captured for creator { 'url': '__2ABJjxzNo', 'info_dict': { 'id': '__2ABJjxzNo', 'ext': 'mp4', + 'duration': 266, 'upload_date': '20100430', 'uploader_id': 'deadmau5', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5', 'creator': 'deadmau5', 'description': 'md5:12c56784b8032162bb936a5f76d55360', 'uploader': 'deadmau5', @@ -572,16 +734,18 @@ 'DASH manifest missing', ] }, - # Olympics (https://github.com/rg3/youtube-dl/issues/4431) + # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431) { 'url': 'lqQg6PlCWgI', 'info_dict': { 'id': 'lqQg6PlCWgI', 'ext': 'mp4', + 'duration': 6085, 'upload_date': '20150827', 'uploader_id': 'olympic', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic', 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 
Olympic Games', - 'uploader': 'Olympics', + 'uploader': 'Olympic', 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games', }, 'params': { @@ -595,10 +759,12 @@ 'id': '_b-2C3KPAM0', 'ext': 'mp4', 'stretched_ratio': 16 / 9., + 'duration': 85, 'upload_date': '20110310', 'uploader_id': 'AllenMeow', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow', 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯', - 'uploader': '孫艾倫', + 'uploader': '孫ᄋᄅ', 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人', }, }, @@ -619,19 +785,22 @@ }, 'skip': 'This live event has ended.', }, - # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097) + # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097) { 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y', 'info_dict': { 'id': 'FIl7x6_3R5Y', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'md5:7b81415841e02ecd4313668cde88737a', 'description': 'md5:116377fd2963b81ec4ce64b542173306', + 'duration': 220, 'upload_date': '20150625', 'uploader_id': 'dorappi2000', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000', 'uploader': 'dorappi2000', - 'formats': 'mincount:33', + 'formats': 'mincount:31', }, + 'skip': 'not actual anymore', }, # DASH manifest with segment_list { @@ -649,7 +818,8 @@ 'params': { 'youtube_include_dash_manifest': True, 'format': '135', # bestvideo - } + }, + 'skip': 'This live event has ended.', }, { # Multifeed videos (multiple cameras), URL is for Main Camera @@ -665,9 +835,12 @@ 'ext': 'mp4', 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)', 'description': 'md5:dc7872fb300e143831327f1bae3af010', + 'duration': 7335, 'upload_date': '20150721', 'uploader': 'Beer Games Beer', 'uploader_id': 'beergamesbeer', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', + 'license': 'Standard YouTube License', }, }, { 'info_dict': { @@ -675,9 +848,12 @@ 'ext': 'mp4', 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)', 'description': 'md5:dc7872fb300e143831327f1bae3af010', + 'duration': 7337, 'upload_date': '20150721', 'uploader': 'Beer Games Beer', 'uploader_id': 'beergamesbeer', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', + 'license': 'Standard YouTube License', }, }, { 'info_dict': { @@ -685,9 +861,12 @@ 'ext': 'mp4', 'title': 'teamPGP: Rocket League Noob Stream (grizzle)', 'description': 'md5:dc7872fb300e143831327f1bae3af010', + 'duration': 7337, 'upload_date': '20150721', 'uploader': 'Beer Games Beer', 'uploader_id': 'beergamesbeer', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', + 'license': 'Standard YouTube License', }, }, { 'info_dict': { @@ -695,51 +874,65 @@ 'ext': 'mp4', 'title': 'teamPGP: Rocket League Noob Stream (zim)', 'description': 'md5:dc7872fb300e143831327f1bae3af010', + 'duration': 7334, 'upload_date': '20150721', 'uploader': 'Beer Games Beer', 'uploader_id': 'beergamesbeer', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', + 'license': 'Standard YouTube License', }, }], 'params': { 'skip_download': True, }, + 'skip': 'This video is not available.', }, { - # Multifeed video with comma in title (see https://github.com/rg3/youtube-dl/issues/8536) + # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536) 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo', 'info_dict': { 'id': 'gVfLd0zydlo', 'title': 'DevConf.cz 2016 Day 2 Workshops 1 
14:00 - 15:30', }, 'playlist_count': 2, + 'skip': 'Not multifeed anymore', + }, + { + 'url': 'https://vid.plus/FlRa-iH7PGw', + 'only_matching': True, }, { - 'url': 'http://vid.plus/FlRa-iH7PGw', + 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html', 'only_matching': True, }, { - # Title with JS-like syntax "};" (see https://github.com/rg3/youtube-dl/issues/7468) + # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468) # Also tests cut-off URL expansion in video description (see - # https://github.com/rg3/youtube-dl/issues/1892, - # https://github.com/rg3/youtube-dl/issues/8164) + # https://github.com/ytdl-org/youtube-dl/issues/1892, + # https://github.com/ytdl-org/youtube-dl/issues/8164) 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg', 'info_dict': { 'id': 'lsguqyKfVQg', 'ext': 'mp4', 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21', - 'alt_title': 'Dark Walk', + 'alt_title': 'Dark Walk - Position Music', 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a', + 'duration': 133, 'upload_date': '20151119', 'uploader_id': 'IronSoulElf', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf', 'uploader': 'IronSoulElf', - 'creator': 'Todd Haberman, Daniel Law Heath & Aaron Kaplan', + 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan', + 'track': 'Dark Walk - Position Music', + 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan', + 'album': 'Position Music - Production Music Vol. 143 - Dark Walk', }, 'params': { 'skip_download': True, }, }, { - # Tags with '};' (see https://github.com/rg3/youtube-dl/issues/7468) + # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468) 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8', 'only_matching': True, }, @@ -758,11 +951,248 @@ 'params': { 'skip_download': True, }, + 'skip': 'This video does not exist.', + }, + { + # Video licensed under Creative Commons + 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA', + 'info_dict': { + 'id': 'M4gD1WSo5mA', + 'ext': 'mp4', + 'title': 'md5:e41008789470fc2533a3252216f1c1d1', + 'description': 'md5:a677553cf0840649b731a3024aeff4cc', + 'duration': 721, + 'upload_date': '20150127', + 'uploader_id': 'BerkmanCenter', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter', + 'uploader': 'The Berkman Klein Center for Internet & Society', + 'license': 'Creative Commons Attribution license (reuse allowed)', + }, + 'params': { + 'skip_download': True, + }, + }, + { + # Channel-like uploader_url + 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg', + 'info_dict': { + 'id': 'eQcmzGIKrzg', + 'ext': 'mp4', + 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders', + 'description': 'md5:dda0d780d5a6e120758d1711d062a867', + 'duration': 4060, + 'upload_date': '20151119', + 'uploader': 'Bernie Sanders', + 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg', + 'license': 'Creative Commons Attribution license (reuse allowed)', + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;v=V36LpHqtcDY', 'only_matching': True, - } + }, + { + # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059) + 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo', + 'only_matching': True, + }, + { + # Rental video preview + 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg', + 'info_dict': { + 
'id': 'uGpuVWrhIzE', + 'ext': 'mp4', + 'title': 'Piku - Trailer', + 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb', + 'upload_date': '20150811', + 'uploader': 'FlixMatrix', + 'uploader_id': 'FlixMatrixKaravan', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan', + 'license': 'Standard YouTube License', + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'This video is not available.', + }, + { + # YouTube Red video with episode data + 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4', + 'info_dict': { + 'id': 'iqKdEhx-dD4', + 'ext': 'mp4', + 'title': 'Isolation - Mind Field (Ep 1)', + 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f', + 'duration': 2085, + 'upload_date': '20170118', + 'uploader': 'Vsauce', + 'uploader_id': 'Vsauce', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce', + 'series': 'Mind Field', + 'season_number': 1, + 'episode_number': 1, + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': [ + 'Skipping DASH manifest', + ], + }, + { + # The following content has been identified by the YouTube community + # as inappropriate or offensive to some audiences. + 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI', + 'info_dict': { + 'id': '6SJNVb0GnPI', + 'ext': 'mp4', + 'title': 'Race Differences in Intelligence', + 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1', + 'duration': 965, + 'upload_date': '20140124', + 'uploader': 'New Century Foundation', + 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg', + }, + 'params': { + 'skip_download': True, + }, + }, + { + # itag 212 + 'url': '1t24XAntNCY', + 'only_matching': True, + }, + { + # geo restricted to JP + 'url': 'sJL6WA-aGkQ', + 'only_matching': True, + }, + { + 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM', + 'only_matching': True, + }, + { + 'url': 'https://invidio.us/watch?v=BaW_jenozKc', + 'only_matching': True, + }, + { + # DRM protected + 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc', + 'only_matching': True, + }, + { + # Video with unsupported adaptive stream type formats + 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U', + 'info_dict': { + 'id': 'Z4Vy8R84T1U', + 'ext': 'mp4', + 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'duration': 433, + 'upload_date': '20130923', + 'uploader': 'Amelia Putri Harwita', + 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q', + 'formats': 'maxcount:10', + }, + 'params': { + 'skip_download': True, + 'youtube_include_dash_manifest': False, + }, + }, + { + # Youtube Music Auto-generated description + 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs', + 'info_dict': { + 'id': 'MgNrAu2pzNs', + 'ext': 'mp4', + 'title': 'Voyeur Girl', + 'description': 'md5:7ae382a65843d6df2685993e90a8628f', + 'upload_date': '20190312', + 'uploader': 'Various Artists - Topic', + 'uploader_id': 'UCVWKBi1ELZn0QX2CBLSkiyw', + 'artist': 'Stephen', + 'track': 'Voyeur Girl', + 'album': 'it\'s too much love to know my dear', + 'release_date': '20190313', + 'release_year': 2019, + }, + 'params': { + 'skip_download': True, + }, + }, + { + # Youtube Music Auto-generated description + # Retrieve 'artist' field from 'Artist:' in video description + # when it is present on youtube music video + 'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY', + 
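
The Youtube Music tests in this block pin down metadata recovered from auto-generated "Provided to YouTube by ..." descriptions: track, artist, album and release date. A hedged sketch of that style of parsing; the exact patterns youtube-dl applies differ:

import re

def parse_music_description(description):
    # Auto-generated descriptions carry 'Track · Artist' and
    # 'Released on: YYYY-MM-DD' lines (regexes here are illustrative).
    fields = {}
    m = re.search(r'(?m)^(?P<track>[^·\n]+)·(?P<artist>.+)$', description)
    if m:
        fields['track'] = m.group('track').strip()
        fields['artist'] = m.group('artist').strip()
    m = re.search(r'(?m)^Released on:\s*(\d{4})-(\d{2})-(\d{2})$', description)
    if m:
        fields['release_date'] = '%s%s%s' % m.groups()
    return fields

print(parse_music_description('Voyeur Girl · Stephen\nReleased on: 2019-03-13'))
# -> {'track': 'Voyeur Girl', 'artist': 'Stephen', 'release_date': '20190313'}
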
'info_dict': { + 'id': 'k0jLE7tTwjY', + 'ext': 'mp4', + 'title': 'Latch Feat. Sam Smith', + 'description': 'md5:3cb1e8101a7c85fcba9b4fb41b951335', + 'upload_date': '20150110', + 'uploader': 'Various Artists - Topic', + 'uploader_id': 'UCNkEcmYdjrH4RqtNgh7BZ9w', + 'artist': 'Disclosure', + 'track': 'Latch Feat. Sam Smith', + 'album': 'Latch Featuring Sam Smith', + 'release_date': '20121008', + 'release_year': 2012, + }, + 'params': { + 'skip_download': True, + }, + }, + { + # Youtube Music Auto-generated description + # handle multiple artists on youtube music video + 'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA', + 'info_dict': { + 'id': '74qn0eJSjpA', + 'ext': 'mp4', + 'title': 'Eastside', + 'description': 'md5:290516bb73dcbfab0dcc4efe6c3de5f2', + 'upload_date': '20180710', + 'uploader': 'Benny Blanco - Topic', + 'uploader_id': 'UCzqz_ksRu_WkIzmivMdIS7A', + 'artist': 'benny blanco, Halsey, Khalid', + 'track': 'Eastside', + 'album': 'Eastside', + 'release_date': '20180713', + 'release_year': 2018, + }, + 'params': { + 'skip_download': True, + }, + }, + { + # Youtube Music Auto-generated description + # handle youtube music video with release_year and no release_date + 'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M', + 'info_dict': { + 'id': '-hcAI0g-f5M', + 'ext': 'mp4', + 'title': 'Put It On Me', + 'description': 'md5:93c55acc682ae7b0c668f2e34e1c069e', + 'upload_date': '20180426', + 'uploader': 'Matt Maeson - Topic', + 'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ', + 'artist': 'Matt Maeson', + 'track': 'Put It On Me', + 'album': 'The Hearse', + 'release_date': None, + 'release_year': 2018, + }, + 'params': { + 'skip_download': True, + }, + }, ] def __init__(self, *args, **kwargs): @@ -791,7 +1221,7 @@ def _extract_signature_function(self, video_id, player_url, example_sig): id_m = re.match( - r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|/base)?\.(?P<ext>[a-z]+)$', + r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2,3}_[A-Z]{2})?/base)?\.(?P<ext>[a-z]+)$', player_url) if not id_m: raise ExtractorError('Cannot identify player %r' % player_url) @@ -876,8 +1306,12 @@ def _parse_sig_js(self, jscode): funcname = self._search_regex( - r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode, - 'Initial JS player signature function name') + (r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', + r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(', + r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*c\s*&&\s*d\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?(?P<sig>[a-zA-Z0-9$]+)\(', + r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(', + r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('), + jscode, 'Initial JS player signature function name', group='sig') jsi = JSInterpreter(jscode) initial_function = jsi.extract_function(funcname) @@ -898,6 +1332,9 @@ if player_url.startswith('//'): player_url = 'https:' + player_url + elif not re.match(r'https?://', player_url): + player_url = compat_urlparse.urljoin( + 'https://www.youtube.com', player_url) try: player_id = (player_url, self._signature_cache_id(s)) if player_id not in self._player_cache: @@ -930,7 +1367,7 @@ continue sub_formats = [] for ext in self._SUBTITLE_FORMATS: - params = compat_urllib_parse.urlencode({ + params = compat_urllib_parse_urlencode({ 'lang': lang, 'v': video_id, 'fmt': ext, @@ -953,8 +1390,8 @@ # regex won't capture the whole JSON. 
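
_parse_sig_js above locates the signature scrambler by trying several regexes in order, since the player code renames the function frequently; the named group 'sig' unifies the alternatives. _search_regex with a tuple of patterns behaves roughly like this sketch:

import re

def search_regex_first(patterns, text, group):
    # Return the named group of the first pattern that matches,
    # approximating InfoExtractor._search_regex over a tuple.
    for pattern in patterns:
        m = re.search(pattern, text)
        if m:
            return m.group(group)
    return None

jscode = 'c&&d.set("sig",encodeURIComponent(Xy$(a)));'
print(search_regex_first(
    (r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
     r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?(?P<sig>[a-zA-Z0-9$]+)\('),
    jscode, 'sig'))  # -> Xy$
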
Yet working around by trying more # concrete regex first keeping in mind proper quoted string handling # to be implemented in future that will replace this workaround (see - # https://github.com/rg3/youtube-dl/issues/7468, - # https://github.com/rg3/youtube-dl/pull/7599) + # https://github.com/ytdl-org/youtube-dl/issues/7468, + # https://github.com/ytdl-org/youtube-dl/pull/7599) r';ytplayer\.config\s*=\s*({.+?});ytplayer', r';ytplayer\.config\s*=\s*({.+?});', ) @@ -975,50 +1412,160 @@ return {} try: args = player_config['args'] - caption_url = args['ttsurl'] - if not caption_url: - self._downloader.report_warning(err_msg) - return {} - timestamp = args['timestamp'] - # We get the available subtitles - list_params = compat_urllib_parse.urlencode({ - 'type': 'list', - 'tlangs': 1, - 'asrs': 1, - }) - list_url = caption_url + '&' + list_params - caption_list = self._download_xml(list_url, video_id) - original_lang_node = caption_list.find('track') - if original_lang_node is None: - self._downloader.report_warning('Video doesn\'t have automatic captions') - return {} - original_lang = original_lang_node.attrib['lang_code'] - caption_kind = original_lang_node.attrib.get('kind', '') - - sub_lang_list = {} - for lang_node in caption_list.findall('target'): - sub_lang = lang_node.attrib['lang_code'] - sub_formats = [] - for ext in self._SUBTITLE_FORMATS: - params = compat_urllib_parse.urlencode({ - 'lang': original_lang, - 'tlang': sub_lang, - 'fmt': ext, - 'ts': timestamp, - 'kind': caption_kind, - }) - sub_formats.append({ - 'url': caption_url + '&' + params, - 'ext': ext, - }) - sub_lang_list[sub_lang] = sub_formats - return sub_lang_list + caption_url = args.get('ttsurl') + if caption_url: + timestamp = args['timestamp'] + # We get the available subtitles + list_params = compat_urllib_parse_urlencode({ + 'type': 'list', + 'tlangs': 1, + 'asrs': 1, + }) + list_url = caption_url + '&' + list_params + caption_list = self._download_xml(list_url, video_id) + original_lang_node = caption_list.find('track') + if original_lang_node is None: + self._downloader.report_warning('Video doesn\'t have automatic captions') + return {} + original_lang = original_lang_node.attrib['lang_code'] + caption_kind = original_lang_node.attrib.get('kind', '') + + sub_lang_list = {} + for lang_node in caption_list.findall('target'): + sub_lang = lang_node.attrib['lang_code'] + sub_formats = [] + for ext in self._SUBTITLE_FORMATS: + params = compat_urllib_parse_urlencode({ + 'lang': original_lang, + 'tlang': sub_lang, + 'fmt': ext, + 'ts': timestamp, + 'kind': caption_kind, + }) + sub_formats.append({ + 'url': caption_url + '&' + params, + 'ext': ext, + }) + sub_lang_list[sub_lang] = sub_formats + return sub_lang_list + + def make_captions(sub_url, sub_langs): + parsed_sub_url = compat_urllib_parse_urlparse(sub_url) + caption_qs = compat_parse_qs(parsed_sub_url.query) + captions = {} + for sub_lang in sub_langs: + sub_formats = [] + for ext in self._SUBTITLE_FORMATS: + caption_qs.update({ + 'tlang': [sub_lang], + 'fmt': [ext], + }) + sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace( + query=compat_urllib_parse_urlencode(caption_qs, True))) + sub_formats.append({ + 'url': sub_url, + 'ext': ext, + }) + captions[sub_lang] = sub_formats + return captions + + # New captions format as of 22.06.2017 + player_response = args.get('player_response') + if player_response and isinstance(player_response, compat_str): + player_response = self._parse_json( + player_response, video_id, fatal=False) + if player_response: + 
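
make_captions above derives one subtitle URL per language/format pair by rewriting the query string of the caption base URL. The same rewriting with the Python 3 stdlib, whereas youtube-dl routes it through its compat_* wrappers:

from urllib.parse import parse_qs, urlencode, urlparse, urlunparse

def caption_urls(base_url, langs, fmts=('ttml', 'vtt')):
    parsed = urlparse(base_url)
    qs = parse_qs(parsed.query)
    out = {}
    for lang in langs:
        out[lang] = []
        for fmt in fmts:
            # Override only tlang/fmt; every other query parameter of the
            # base URL is preserved.
            qs.update({'tlang': [lang], 'fmt': [fmt]})
            out[lang].append(urlunparse(parsed._replace(
                query=urlencode(qs, doseq=True))))
    return out

print(caption_urls('https://www.youtube.com/api/timedtext?v=abc&lang=en', ['de']))
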
renderer = player_response['captions']['playerCaptionsTracklistRenderer'] + base_url = renderer['captionTracks'][0]['baseUrl'] + sub_lang_list = [] + for lang in renderer['translationLanguages']: + lang_code = lang.get('languageCode') + if lang_code: + sub_lang_list.append(lang_code) + return make_captions(base_url, sub_lang_list) + + # Some videos don't provide ttsurl but rather caption_tracks and + # caption_translation_languages (e.g. 20LmZk1hakA) + # Does not used anymore as of 22.06.2017 + caption_tracks = args['caption_tracks'] + caption_translation_languages = args['caption_translation_languages'] + caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0] + sub_lang_list = [] + for lang in caption_translation_languages.split(','): + lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang)) + sub_lang = lang_qs.get('lc', [None])[0] + if sub_lang: + sub_lang_list.append(sub_lang) + return make_captions(caption_url, sub_lang_list) # An extractor error can be raise by the download process if there are # no automatic captions but there are subtitles - except (KeyError, ExtractorError): + except (KeyError, IndexError, ExtractorError): self._downloader.report_warning(err_msg) return {} + def _mark_watched(self, video_id, video_info, player_response): + playback_url = url_or_none(try_get( + player_response, + lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get( + video_info, lambda x: x['videostats_playback_base_url'][0])) + if not playback_url: + return + parsed_playback_url = compat_urlparse.urlparse(playback_url) + qs = compat_urlparse.parse_qs(parsed_playback_url.query) + + # cpn generation algorithm is reverse engineered from base.js. + # In fact it works even with dummy cpn. + CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_' + cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16))) + + qs.update({ + 'ver': ['2'], + 'cpn': [cpn], + }) + playback_url = compat_urlparse.urlunparse( + parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True))) + + self._download_webpage( + playback_url, video_id, 'Marking watched', + 'Unable to mark watched', fatal=False) + + @staticmethod + def _extract_urls(webpage): + # Embedded YouTube player + entries = [ + unescapeHTML(mobj.group('url')) + for mobj in re.finditer(r'''(?x) + (?: + <iframe[^>]+?src=| + data-video-url=| + <embed[^>]+?src=| + embedSWF\(?:\s*| + <object[^>]+data=| + new\s+SWFObject\( + ) + (["\']) + (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ + (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?) 
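
_mark_watched above reports playback by requesting videostatsPlaybackUrl with ver=2 and a 16-character cpn. As the comment notes, the generation is reverse engineered from base.js, but any value of the right shape is accepted. Standalone sketch of the cpn step:

import random

CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'

def generate_cpn():
    # 16 symbols drawn from a 64-character alphabet, as in the code above
    # (the & 63 mask folds randint's range onto valid alphabet indices).
    return ''.join(CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(16))

print(generate_cpn())
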
+ \1''', webpage)] + + # lazyYT YouTube embed + entries.extend(list(map( + unescapeHTML, + re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)))) + + # Wordpress "YouTube Video Importer" plugin + matches = re.findall(r'''(?x)<div[^>]+ + class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ + data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage) + entries.extend(m[-1] for m in matches) + + return entries + + @staticmethod + def _extract_url(webpage): + urls = YoutubeIE._extract_urls(webpage) + return urls[0] if urls else None + @classmethod def extract_id(cls, url): mobj = re.match(cls._VALID_URL, url, re.VERBOSE) @@ -1027,25 +1574,45 @@ video_id = mobj.group(2) return video_id - def _extract_from_m3u8(self, manifest_url, video_id): - url_map = {} - - def _get_urls(_manifest): - lines = _manifest.split('\n') - urls = filter(lambda l: l and not l.startswith('#'), - lines) - return urls - manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest') - formats_urls = _get_urls(manifest) - for format_url in formats_urls: - itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag') - url_map[itag] = format_url - return url_map - def _extract_annotations(self, video_id): url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.') + @staticmethod + def _extract_chapters(description, duration): + if not description: + return None + chapter_lines = re.findall( + r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)', + description) + if not chapter_lines: + return None + chapters = [] + for next_num, (chapter_line, time_point) in enumerate( + chapter_lines, start=1): + start_time = parse_duration(time_point) + if start_time is None: + continue + if start_time > duration: + break + end_time = (duration if next_num == len(chapter_lines) + else parse_duration(chapter_lines[next_num][1])) + if end_time is None: + continue + if end_time > duration: + end_time = duration + if start_time > end_time: + break + chapter_title = re.sub( + r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-') + chapter_title = re.sub(r'\s+', ' ', chapter_title) + chapters.append({ + 'start_time': start_time, + 'end_time': end_time, + 'title': chapter_title, + }) + return chapters + def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) @@ -1089,16 +1656,33 @@ if dash_mpd and dash_mpd[0] not in dash_mpds: dash_mpds.append(dash_mpd[0]) + def add_dash_mpd_pr(pl_response): + dash_mpd = url_or_none(try_get( + pl_response, lambda x: x['streamingData']['dashManifestUrl'], + compat_str)) + if dash_mpd and dash_mpd not in dash_mpds: + dash_mpds.append(dash_mpd) + + is_live = None + view_count = None + + def extract_view_count(v_info): + return int_or_none(try_get(v_info, lambda x: x['view_count'][0])) + + def extract_token(v_info): + return dict_get(v_info, ('account_playback_token', 'accountPlaybackToken', 'token')) + + player_response = {} + # Get video info embed_webpage = None - is_live = None if re.search(r'player-age-gate-content">', video_webpage) is not None: age_gate = True # We simulate the access to the video from www.youtube.com/v/{video_id} # this can be viewed without login into Youtube url = proto + '://www.youtube.com/embed/%s' % video_id embed_webpage = self._download_webpage(url, video_id, 
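
_extract_chapters above turns seekTo timestamp links in a video description into chapter dicts, clamping every chapter to the video duration. A self-contained sketch of the same logic on plain "MM:SS Title" lines, a simplification of the HTML-anchor form handled above:

import re

def parse_duration(ts):
    secs = 0
    for part in ts.split(':'):
        secs = secs * 60 + int(part)
    return secs

def extract_chapters(description, duration):
    points = [(parse_duration(m.group(1)), m.group(2).strip(' \t-'))
              for m in re.finditer(
                  r'(?m)^(\d{1,2}:\d{2}(?::\d{2})?)\s+(.+)$', description)]
    chapters = []
    for i, (start, title) in enumerate(points):
        if start > duration:
            break
        # A chapter ends where the next one starts; the last runs to the end.
        end = points[i + 1][0] if i + 1 < len(points) else duration
        chapters.append({'start_time': start,
                         'end_time': min(end, duration),
                         'title': title})
    return chapters

print(extract_chapters('0:00 Intro\n1:30 Main part', 300))
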
'Downloading embed webpage') - data = compat_urllib_parse.urlencode({ + data = compat_urllib_parse_urlencode({ 'video_id': video_id, 'eurl': 'https://youtube.googleapis.com/v/' + video_id, 'sts': self._search_regex( @@ -1114,85 +1698,132 @@ else: age_gate = False video_info = None + sts = None # Try looking directly into the video webpage ytplayer_config = self._get_ytplayer_config(video_id, video_webpage) if ytplayer_config: args = ytplayer_config['args'] - if args.get('url_encoded_fmt_stream_map'): + if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'): # Convert to the same format returned by compat_parse_qs video_info = dict((k, [v]) for k, v in args.items()) add_dash_mpd(video_info) + # Rental video is not rented but preview is available (e.g. + # https://www.youtube.com/watch?v=yYr8q0y5Jfg, + # https://github.com/ytdl-org/youtube-dl/issues/10532) + if not video_info and args.get('ypc_vid'): + return self.url_result( + args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid']) if args.get('livestream') == '1' or args.get('live_playback') == 1: is_live = True + sts = ytplayer_config.get('sts') + if not player_response: + pl_response = str_or_none(args.get('player_response')) + if pl_response: + pl_response = self._parse_json(pl_response, video_id, fatal=False) + if isinstance(pl_response, dict): + player_response = pl_response if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): + add_dash_mpd_pr(player_response) # We also try looking in get_video_info since it may contain different dashmpd # URL that points to a DASH manifest with possibly different itag set (some itags # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH # manifest pointed by get_video_info's dashmpd). # The general idea is to take a union of itags of both DASH manifests (for example - # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093) + # video with such 'manifest behavior' see https://github.com/ytdl-org/youtube-dl/issues/6093) self.report_video_info_webpage_download(video_id) - for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']: - video_info_url = ( - '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' - % (proto, video_id, el_type)) + for el in ('embedded', 'detailpage', 'vevo', ''): + query = { + 'video_id': video_id, + 'ps': 'default', + 'eurl': '', + 'gl': 'US', + 'hl': 'en', + } + if el: + query['el'] = el + if sts: + query['sts'] = sts video_info_webpage = self._download_webpage( - video_info_url, + '%s://www.youtube.com/get_video_info' % proto, video_id, note=False, - errnote='unable to download video info webpage') + errnote='unable to download video info webpage', + fatal=False, query=query) + if not video_info_webpage: + continue get_video_info = compat_parse_qs(video_info_webpage) - if get_video_info.get('use_cipher_signature') != ['True']: - add_dash_mpd(get_video_info) + if not player_response: + pl_response = get_video_info.get('player_response', [None])[0] + if isinstance(pl_response, dict): + player_response = pl_response + add_dash_mpd_pr(player_response) + add_dash_mpd(get_video_info) + if view_count is None: + view_count = extract_view_count(get_video_info) if not video_info: video_info = get_video_info - if 'token' in get_video_info: + get_token = extract_token(get_video_info) + if get_token: # Different get_video_info requests may report different results, e.g. 
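
The loop above probes get_video_info with several el values because the responses disagree: one variant may report geo-restriction while el=detailpage returns valid data, so the first response that carries a playback token wins. A hedged sketch of that strategy; fetch() is a stand-in for _download_webpage:

from urllib.parse import parse_qs

def probe_video_info(fetch, video_id):
    info = None
    for el in ('embedded', 'detailpage', 'vevo', ''):
        query = {'video_id': video_id, 'ps': 'default',
                 'eurl': '', 'gl': 'US', 'hl': 'en'}
        if el:
            query['el'] = el
        page = fetch('https://www.youtube.com/get_video_info', query)
        if not page:
            continue
        candidate = parse_qs(page)
        if info is None:
            info = candidate  # remember the first answer as a fallback
        if candidate.get('token') or candidate.get('account_playback_token'):
            return candidate  # prefer the first response holding a token
    return info
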
# some may report video unavailability, but some may serve it without - # any complaint (see https://github.com/rg3/youtube-dl/issues/7362, + # any complaint (see https://github.com/ytdl-org/youtube-dl/issues/7362, # the original webpage as well as el=info and el=embedded get_video_info # requests report video unavailability due to geo restriction while # el=detailpage succeeds and returns valid data). This is probably # due to YouTube measures against IP ranges of hosting providers. # Working around by preferring the first succeeded video_info containing # the token if no such video_info yet was found. - if 'token' not in video_info: + token = extract_token(video_info) + if not token: video_info = get_video_info break - if 'token' not in video_info: - if 'reason' in video_info: - if 'The uploader has not made this video available in your country.' in video_info['reason']: - regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None) - if regions_allowed: - raise ExtractorError('YouTube said: This video is available in %s only' % ( - ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))), - expected=True) - raise ExtractorError( - 'YouTube said: %s' % video_info['reason'][0], - expected=True, video_id=video_id) - else: - raise ExtractorError( - '"token" parameter not in video info for unknown reason', - video_id=video_id) + + def extract_unavailable_message(): + return self._html_search_regex( + r'(?s)<h1[^>]+id="unavailable-message"[^>]*>(.+?)</h1>', + video_webpage, 'unavailable message', default=None) + + if not video_info: + unavailable_message = extract_unavailable_message() + if not unavailable_message: + unavailable_message = 'Unable to extract video data' + raise ExtractorError( + 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id) + + video_details = try_get( + player_response, lambda x: x['videoDetails'], dict) or {} # title if 'title' in video_info: video_title = video_info['title'][0] + elif 'title' in player_response: + video_title = video_details['title'] else: self._downloader.report_warning('Unable to extract video title') video_title = '_' # description - video_description = get_element_by_id("eow-description", video_webpage) + description_original = video_description = get_element_by_id("eow-description", video_webpage) if video_description: - video_description = re.sub(r'''(?x) + + def replace_url(m): + redir_url = compat_urlparse.urljoin(url, m.group(1)) + parsed_redir_url = compat_urllib_parse_urlparse(redir_url) + if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect': + qs = compat_parse_qs(parsed_redir_url.query) + q = qs.get('q') + if q and q[0]: + return q[0] + return redir_url + + description_original = video_description = re.sub(r'''(?x) <a\s+ - (?:[a-zA-Z-]+="[^"]+"\s+)*? + (?:[a-zA-Z-]+="[^"]*"\s+)*? (?:title|href)="([^"]+)"\s+ - (?:[a-zA-Z-]+="[^"]+"\s+)*? - class="(?:yt-uix-redirect-link|yt-uix-sessionlink[^"]*)"[^>]*> + (?:[a-zA-Z-]+="[^"]*"\s+)*? 
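
replace_url above restores shortened description links: YouTube wraps external links as /redirect URLs whose q= parameter holds the real target. The unwrapping in isolation, using the Python 3 stdlib where youtube-dl uses its compat_* equivalents:

import re
from urllib.parse import parse_qs, urlparse

def unwrap_redirect(url):
    parsed = urlparse(url)
    if (re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$',
                  parsed.netloc)
            and parsed.path == '/redirect'):
        q = parse_qs(parsed.query).get('q')
        if q and q[0]:
            return q[0]  # the wrapped target, already percent-decoded
    return url

print(unwrap_redirect('https://www.youtube.com/redirect?q=https%3A%2F%2Fexample.com%2F'))
# -> https://example.com/
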
+ class="[^"]*"[^>]*> [^<]+\.{3}\s* </a> - ''', r'\1', video_description) + ''', replace_url, video_description) video_description = clean_html(video_description) else: fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage) @@ -1201,140 +1832,49 @@ else: video_description = '' - if 'multifeed_metadata_list' in video_info and not smuggled_data.get('force_singlefeed', False): + if not smuggled_data.get('force_singlefeed', False): if not self._downloader.params.get('noplaylist'): - entries = [] - feed_ids = [] - multifeed_metadata_list = video_info['multifeed_metadata_list'][0] - for feed in multifeed_metadata_list.split(','): - # Unquote should take place before split on comma (,) since textual - # fields may contain comma as well (see - # https://github.com/rg3/youtube-dl/issues/8536) - feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed)) - entries.append({ - '_type': 'url_transparent', - 'ie_key': 'Youtube', - 'url': smuggle_url( - '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]), - {'force_singlefeed': True}), - 'title': '%s (%s)' % (video_title, feed_data['title'][0]), - }) - feed_ids.append(feed_data['id'][0]) - self.to_screen( - 'Downloading multifeed video (%s) - add --no-playlist to just download video %s' - % (', '.join(feed_ids), video_id)) - return self.playlist_result(entries, video_id, video_title, video_description) - self.to_screen('Downloading just video %s because of --no-playlist' % video_id) + multifeed_metadata_list = try_get( + player_response, + lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'], + compat_str) or try_get( + video_info, lambda x: x['multifeed_metadata_list'][0], compat_str) + if multifeed_metadata_list: + entries = [] + feed_ids = [] + for feed in multifeed_metadata_list.split(','): + # Unquote should take place before split on comma (,) since textual + # fields may contain comma as well (see + # https://github.com/ytdl-org/youtube-dl/issues/8536) + feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed)) + entries.append({ + '_type': 'url_transparent', + 'ie_key': 'Youtube', + 'url': smuggle_url( + '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]), + {'force_singlefeed': True}), + 'title': '%s (%s)' % (video_title, feed_data['title'][0]), + }) + feed_ids.append(feed_data['id'][0]) + self.to_screen( + 'Downloading multifeed video (%s) - add --no-playlist to just download video %s' + % (', '.join(feed_ids), video_id)) + return self.playlist_result(entries, video_id, video_title, video_description) + else: + self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - if 'view_count' in video_info: - view_count = int(video_info['view_count'][0]) - else: - view_count = None + if view_count is None: + view_count = extract_view_count(video_info) + if view_count is None and video_details: + view_count = int_or_none(video_details.get('viewCount')) # Check for "rental" videos if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: - raise ExtractorError('"rental" videos not supported') - - # Start extracting information - self.report_information_extraction(video_id) - - # uploader - if 'author' not in video_info: - raise ExtractorError('Unable to extract uploader name') - video_uploader = compat_urllib_parse_unquote_plus(video_info['author'][0]) - - # uploader_id - video_uploader_id = None - mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage) - if mobj 
is not None: - video_uploader_id = mobj.group(1) - else: - self._downloader.report_warning('unable to extract uploader nickname') - - # thumbnail image - # We try first to get a high quality image: - m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">', - video_webpage, re.DOTALL) - if m_thumb is not None: - video_thumbnail = m_thumb.group(1) - elif 'thumbnail_url' not in video_info: - self._downloader.report_warning('unable to extract video thumbnail') - video_thumbnail = None - else: # don't panic if we can't find it - video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0]) - - # upload date - upload_date = self._html_search_meta( - 'datePublished', video_webpage, 'upload date', default=None) - if not upload_date: - upload_date = self._search_regex( - [r'(?s)id="eow-date.*?>(.*?)</span>', - r'id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live|Started) on (.+?)</strong>'], - video_webpage, 'upload date', default=None) - if upload_date: - upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split()) - upload_date = unified_strdate(upload_date) + raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True) - m_music = re.search( - r'<h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*<ul[^>]*>\s*<li>(?P<title>.+?) by (?P<creator>.+?)(?:\(.+?\))?</li', - video_webpage) - if m_music: - video_alt_title = remove_quotes(unescapeHTML(m_music.group('title'))) - video_creator = clean_html(m_music.group('creator')) - else: - video_alt_title = video_creator = None - - m_cat_container = self._search_regex( - r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>', - video_webpage, 'categories', default=None) - if m_cat_container: - category = self._html_search_regex( - r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category', - default=None) - video_categories = None if category is None else [category] - else: - video_categories = None - - video_tags = [ - unescapeHTML(m.group('content')) - for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)] - - def _extract_count(count_name): - return str_to_int(self._search_regex( - r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' - % re.escape(count_name), - video_webpage, count_name, default=None)) - - like_count = _extract_count('like') - dislike_count = _extract_count('dislike') - - # subtitles - video_subtitles = self.extract_subtitles(video_id, video_webpage) - automatic_captions = self.extract_automatic_captions(video_id, video_webpage) - - if 'length_seconds' not in video_info: - self._downloader.report_warning('unable to extract video duration') - video_duration = None - else: - video_duration = int(compat_urllib_parse_unquote_plus(video_info['length_seconds'][0])) - - # annotations - video_annotations = None - if self._downloader.params.get('writeannotations', False): - video_annotations = self._extract_annotations(video_id) - - def _map_to_format_list(urlmap): - formats = [] - for itag, video_real_url in urlmap.items(): - dct = { - 'format_id': itag, - 'url': video_real_url, - 'player_url': player_url, - } - if itag in self._formats: - dct.update(self._formats[itag]) - formats.append(dct) - return formats + def _extract_filesize(media_url): + return int_or_none(self._search_regex( + r'\bclen[=/](\d+)', media_url, 'filesize', default=None)) if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): self.report_rtmp_download() @@ -1344,24 +1884,57 @@ 'url': 
video_info['conn'][0], 'player_url': player_url, }] - elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1: + elif not is_live and (len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1): encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0] if 'rtmpe%3Dyes' in encoded_url_map: - raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True) + raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True) + formats_spec = {} + fmt_list = video_info.get('fmt_list', [''])[0] + if fmt_list: + for fmt in fmt_list.split(','): + spec = fmt.split('/') + if len(spec) > 1: + width_height = spec[1].split('x') + if len(width_height) == 2: + formats_spec[spec[0]] = { + 'resolution': spec[1], + 'width': int_or_none(width_height[0]), + 'height': int_or_none(width_height[1]), + } + q = qualities(['small', 'medium', 'hd720']) + streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) + if streaming_formats: + for fmt in streaming_formats: + itag = str_or_none(fmt.get('itag')) + if not itag: + continue + quality = fmt.get('quality') + quality_label = fmt.get('qualityLabel') or quality + formats_spec[itag] = { + 'asr': int_or_none(fmt.get('audioSampleRate')), + 'filesize': int_or_none(fmt.get('contentLength')), + 'format_note': quality_label, + 'fps': int_or_none(fmt.get('fps')), + 'height': int_or_none(fmt.get('height')), + 'quality': q(quality), + # bitrate for itag 43 is always 2147483647 + 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None, + 'width': int_or_none(fmt.get('width')), + } formats = [] for url_data_str in encoded_url_map.split(','): url_data = compat_parse_qs(url_data_str) - if 'itag' not in url_data or 'url' not in url_data: + if 'itag' not in url_data or 'url' not in url_data or url_data.get('drm_families'): + continue + stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0])) + # Unsupported FORMAT_STREAM_TYPE_OTF + if stream_type == 3: continue format_id = url_data['itag'][0] url = url_data['url'][0] - if 'sig' in url_data: - url += '&signature=' + url_data['sig'][0] - elif 's' in url_data: - encrypted_sig = url_data['s'][0] + if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True): ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")' - jsplayer_url_json = self._search_regex( ASSETS_RE, embed_webpage if age_gate else video_webpage, @@ -1382,6 +1955,11 @@ video_webpage, 'age gate player URL') player_url = json.loads(player_url_json) + if 'sig' in url_data: + url += '&signature=' + url_data['sig'][0] + elif 's' in url_data: + encrypted_sig = url_data['s'][0] + if self._downloader.params.get('verbose'): if player_url is None: player_version = 'unknown' @@ -1394,7 +1972,8 @@ player_desc = 'flash player %s' % player_version else: player_version = self._search_regex( - [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', r'(?:www|player)-([^/]+)/base\.js'], + [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', + r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'], player_url, 'html5 player', fatal=False) player_desc = 'html5 player %s' % player_version @@ -1405,7 +1984,8 @@ 
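# Illustrative sketch of the signature handling below, with a hypothetical
# url_data entry (assumed values, for illustration only): the "sp" field
# names the query parameter that should carry the decrypted signature, and
# 'signature' is kept as a fallback for older streams.
#
#     >>> url_data = {'s': ['<encrypted>'], 'sp': ['sig']}
#     >>> sp = (url_data.get('sp') or ['signature'])[0]
#     >>> '&%s=%s' % (sp, 'DECRYPTED')
#     '&sig=DECRYPTED'
#
# i.e. newer streams get '&sig=...' appended, legacy ones '&signature=...'.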
signature = self._decrypt_signature( encrypted_sig, video_id, player_url, age_gate) - url += '&signature=' + signature + sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature' + url += '&%s=%s' % (sp, signature) if 'ratebypass' not in url: url += '&ratebypass=yes' @@ -1416,20 +1996,28 @@ } if format_id in self._formats: dct.update(self._formats[format_id]) + if format_id in formats_spec: + dct.update(formats_spec[format_id]) # Some itags are not included in DASH manifest thus corresponding formats will - # lack metadata (see https://github.com/rg3/youtube-dl/pull/5993). + # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993). # Trying to extract metadata from url_encoded_fmt_stream_map entry. mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0]) width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None) + filesize = int_or_none(url_data.get( + 'clen', [None])[0]) or _extract_filesize(url) + + quality = url_data.get('quality', [None])[0] + more_fields = { - 'filesize': int_or_none(url_data.get('clen', [None])[0]), + 'filesize': filesize, 'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000), 'width': width, 'height': height, 'fps': int_or_none(url_data.get('fps', [None])[0]), - 'format_note': url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0], + 'format_note': url_data.get('quality_label', [None])[0] or quality, + 'quality': q(quality), } for key, value in more_fields.items(): if value: @@ -1449,30 +2037,215 @@ codecs = mobj.group('val') break if codecs: - codecs = codecs.split(',') - if len(codecs) == 2: - acodec, vcodec = codecs[1], codecs[0] - else: - acodec, vcodec = (codecs[0], 'none') if kind == 'audio' else ('none', codecs[0]) - dct.update({ - 'acodec': acodec, - 'vcodec': vcodec, - }) + dct.update(parse_codecs(codecs)) + if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none': + dct['downloader_options'] = { + # Youtube throttles chunks >~10M + 'http_chunk_size': 10485760, + } formats.append(dct) - elif video_info.get('hlsvp'): - manifest_url = video_info['hlsvp'][0] - url_map = self._extract_from_m3u8(manifest_url, video_id) - formats = _map_to_format_list(url_map) - # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming - for a_format in formats: - a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True' else: - unavailable_message = self._html_search_regex( - r'(?s)<h1[^>]+id="unavailable-message"[^>]*>(.+?)</h1>', - video_webpage, 'unavailable message', default=None) - if unavailable_message: - raise ExtractorError(unavailable_message, expected=True) - raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') + manifest_url = ( + url_or_none(try_get( + player_response, + lambda x: x['streamingData']['hlsManifestUrl'], + compat_str)) + or url_or_none(try_get( + video_info, lambda x: x['hlsvp'][0], compat_str))) + if manifest_url: + formats = [] + m3u8_formats = self._extract_m3u8_formats( + manifest_url, video_id, 'mp4', fatal=False) + for a_format in m3u8_formats: + itag = self._search_regex( + r'/itag/(\d+)/', a_format['url'], 'itag', default=None) + if itag: + a_format['format_id'] = itag + if itag in self._formats: + dct = self._formats[itag].copy() + dct.update(a_format) + a_format = dct + a_format['player_url'] = player_url + # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming + 
a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True' + formats.append(a_format) + else: + error_message = clean_html(video_info.get('reason', [None])[0]) + if not error_message: + error_message = extract_unavailable_message() + if error_message: + raise ExtractorError(error_message, expected=True) + raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info') + + # uploader + video_uploader = try_get( + video_info, lambda x: x['author'][0], + compat_str) or str_or_none(video_details.get('author')) + if video_uploader: + video_uploader = compat_urllib_parse_unquote_plus(video_uploader) + else: + self._downloader.report_warning('unable to extract uploader name') + + # uploader_id + video_uploader_id = None + video_uploader_url = None + mobj = re.search( + r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">', + video_webpage) + if mobj is not None: + video_uploader_id = mobj.group('uploader_id') + video_uploader_url = mobj.group('uploader_url') + else: + self._downloader.report_warning('unable to extract uploader nickname') + + channel_id = ( + str_or_none(video_details.get('channelId')) + or self._html_search_meta( + 'channelId', video_webpage, 'channel id', default=None) + or self._search_regex( + r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1', + video_webpage, 'channel id', default=None, group='id')) + channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None + + # thumbnail image + # We try first to get a high quality image: + m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">', + video_webpage, re.DOTALL) + if m_thumb is not None: + video_thumbnail = m_thumb.group(1) + elif 'thumbnail_url' not in video_info: + self._downloader.report_warning('unable to extract video thumbnail') + video_thumbnail = None + else: # don't panic if we can't find it + video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0]) + + # upload date + upload_date = self._html_search_meta( + 'datePublished', video_webpage, 'upload date', default=None) + if not upload_date: + upload_date = self._search_regex( + [r'(?s)id="eow-date.*?>(.*?)</span>', + r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], + video_webpage, 'upload date', default=None) + upload_date = unified_strdate(upload_date) + + video_license = self._html_search_regex( + r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li', + video_webpage, 'license', default=None) + + m_music = re.search( + r'''(?x) + <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s* + <ul[^>]*>\s* + <li>(?P<title>.+?) + by (?P<creator>.+?) + (?: + \(.+?\)| + <a[^>]* + (?: + \bhref=["\']/red[^>]*>| # drop possible + >\s*Listen ad-free with YouTube Red # YouTube Red ad + ) + .*? 
+ )?</li + ''', + video_webpage) + if m_music: + video_alt_title = remove_quotes(unescapeHTML(m_music.group('title'))) + video_creator = clean_html(m_music.group('creator')) + else: + video_alt_title = video_creator = None + + def extract_meta(field): + return self._html_search_regex( + r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field, + video_webpage, field, default=None) + + track = extract_meta('Song') + artist = extract_meta('Artist') + album = extract_meta('Album') + + # Youtube Music Auto-generated description + release_date = release_year = None + if video_description: + mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description) + if mobj: + if not track: + track = mobj.group('track').strip() + if not artist: + artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')) + if not album: + album = mobj.group('album').strip() + release_year = mobj.group('release_year') + release_date = mobj.group('release_date') + if release_date: + release_date = release_date.replace('-', '') + if not release_year: + release_year = int(release_date[:4]) + if release_year: + release_year = int(release_year) + + m_episode = re.search( + r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>', + video_webpage) + if m_episode: + series = unescapeHTML(m_episode.group('series')) + season_number = int(m_episode.group('season')) + episode_number = int(m_episode.group('episode')) + else: + series = season_number = episode_number = None + + m_cat_container = self._search_regex( + r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>', + video_webpage, 'categories', default=None) + if m_cat_container: + category = self._html_search_regex( + r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category', + default=None) + video_categories = None if category is None else [category] + else: + video_categories = None + + video_tags = [ + unescapeHTML(m.group('content')) + for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)] + + def _extract_count(count_name): + return str_to_int(self._search_regex( + r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' + % re.escape(count_name), + video_webpage, count_name, default=None)) + + like_count = _extract_count('like') + dislike_count = _extract_count('dislike') + + if view_count is None: + view_count = str_to_int(self._search_regex( + r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage, + 'view count', default=None)) + + average_rating = ( + float_or_none(video_details.get('averageRating')) + or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0]))) + + # subtitles + video_subtitles = self.extract_subtitles(video_id, video_webpage) + automatic_captions = self.extract_automatic_captions(video_id, video_webpage) + + video_duration = try_get( + video_info, lambda x: int_or_none(x['length_seconds'][0])) + if not video_duration: + video_duration = int_or_none(video_details.get('lengthSeconds')) + if not video_duration: + video_duration = parse_duration(self._html_search_meta( + 'duration', video_webpage, 'video duration')) + + # annotations + video_annotations = None + if self._downloader.params.get('writeannotations', False): + 
video_annotations = self._extract_annotations(video_id) + + chapters = self._extract_chapters(description_original, video_duration) # Look for the DASH manifest if self._downloader.params.get('youtube_include_dash_manifest', True): @@ -1490,6 +2263,8 @@ for df in self._extract_mpd_formats( mpd_url, video_id, fatal=dash_mpd_fatal, formats_dict=self._formats): + if not df.get('filesize'): + df['filesize'] = _extract_filesize(df['url']) # Do not overwrite DASH format found in some previous DASH manifest if df['format_id'] not in dash_formats: dash_formats[df['format_id']] = df @@ -1506,7 +2281,7 @@ # Remove the formats we found through non-DASH, they # contain less info and it can be wrong, because we use # fixed values (for example the resolution). See - # https://github.com/rg3/youtube-dl/issues/5774 for an + # https://github.com/ytdl-org/youtube-dl/issues/5774 for an # example. formats = [f for f in formats if f['format_id'] not in dash_formats.keys()] formats.extend(dash_formats.values()) @@ -1526,16 +2301,48 @@ if f.get('vcodec') != 'none': f['stretched_ratio'] = ratio + if not formats: + token = extract_token(video_info) + if not token: + if 'reason' in video_info: + if 'The uploader has not made this video available in your country.' in video_info['reason']: + regions_allowed = self._html_search_meta( + 'regionsAllowed', video_webpage, default=None) + countries = regions_allowed.split(',') if regions_allowed else None + self.raise_geo_restricted( + msg=video_info['reason'][0], countries=countries) + reason = video_info['reason'][0] + if 'Invalid parameters' in reason: + unavailable_message = extract_unavailable_message() + if unavailable_message: + reason = unavailable_message + raise ExtractorError( + 'YouTube said: %s' % reason, + expected=True, video_id=video_id) + else: + raise ExtractorError( + '"token" parameter not in video info for unknown reason', + video_id=video_id) + + if not formats and (video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos'])): + raise ExtractorError('This video is DRM protected.', expected=True) + self._sort_formats(formats) + self.mark_watched(video_id, video_info, player_response) + return { 'id': video_id, 'uploader': video_uploader, 'uploader_id': video_uploader_id, + 'uploader_url': video_uploader_url, + 'channel_id': channel_id, + 'channel_url': channel_url, 'upload_date': upload_date, - 'creator': video_creator, + 'license': video_license, + 'creator': video_creator or artist, 'title': video_title, - 'alt_title': video_alt_title, + 'alt_title': video_alt_title or track, 'thumbnail': video_thumbnail, 'description': video_description, 'categories': video_categories, @@ -1545,15 +2352,24 @@ 'duration': video_duration, 'age_limit': 18 if age_gate else 0, 'annotations': video_annotations, + 'chapters': chapters, 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id, 'view_count': view_count, 'like_count': like_count, 'dislike_count': dislike_count, - 'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]), + 'average_rating': average_rating, 'formats': formats, 'is_live': is_live, 'start_time': start_time, 'end_time': end_time, + 'series': series, + 'season_number': season_number, + 'episode_number': episode_number, + 'track': track, + 'artist': artist, + 'album': album, + 'release_date': release_date, + 'release_year': release_year, } @@ -1562,21 +2378,28 @@ _VALID_URL = r"""(?x)(?: (?:https?://)? (?:\w+\.)? 
- youtube\.com/ (?: - (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries) - \? (?:.*?[&;])*? (?:p|a|list)= - | p/ + (?: + youtube\.com| + invidio\.us + ) + / + (?: + (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11})) + \? (?:.*?[&;])*? (?:p|a|list)= + | p/ + )| + youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist= ) ( - (?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,} + (?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)?[0-9A-Za-z-_]{10,} # Top tracks, they can also include dots |(?:MC)[\w\.]* ) .* | - ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,}) - )""" + (%(playlist_id)s) + )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?' IE_NAME = 'youtube:playlist' @@ -1594,6 +2417,7 @@ 'title': 'YDL_Empty_List', }, 'playlist_count': 0, + 'skip': 'This playlist is private', }, { 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.', 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', @@ -1625,17 +2449,25 @@ 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl', }, 'playlist_count': 2, + 'skip': 'This playlist is private', }, { 'note': 'embedded', - 'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', + 'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', 'playlist_count': 4, 'info_dict': { 'title': 'JODA15', 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', } }, { + 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', + 'playlist_mincount': 485, + 'info_dict': { + 'title': '2017 華語最新單曲 (2/24更新)', + 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', + } + }, { 'note': 'Embedded SWF player', - 'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0', + 'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0', 'playlist_count': 4, 'info_dict': { 'title': 'JODA7', @@ -1648,7 +2480,64 @@ 'title': 'Uploads from Interstellar Movie', 'id': 'UUXw-G3eDE9trcvY2sBMM_aA', }, - 'playlist_mincout': 21, + 'playlist_mincount': 21, + }, { + # Playlist URL that does not actually serve a playlist + 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4', + 'info_dict': { + 'id': 'FqZTN594JQw', + 'ext': 'webm', + 'title': "Smiley's People 01 detective, Adventure Series, Action", + 'uploader': 'STREEM', + 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng', + 'upload_date': '20150526', + 'license': 'Standard YouTube License', + 'description': 'md5:507cdcb5a49ac0da37a920ece610be80', + 'categories': ['People & Blogs'], + 'tags': list, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [YoutubeIE.ie_key()], + }, { + 'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5', + 'info_dict': { + 'id': 'yeWKywCrFtk', + 'ext': 'mp4', + 'title': 'Small Scale Baler and Braiding Rugs', + 'uploader': 'Backus-Page House Museum', + 'uploader_id': 'backuspagemuseum', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum', + 'upload_date': '20161008', + 'license': 'Standard YouTube License', + 'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a', + 
'categories': ['Nonprofits & Activism'], + 'tags': list, + 'like_count': int, + 'dislike_count': int, + }, + 'params': { + 'noplaylist': True, + 'skip_download': True, + }, + }, { + 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21', + 'only_matching': True, + }, { + 'url': 'TLGGrESM50VT6acwMjAyMjAxNw', + 'only_matching': True, + }, { + # music album playlist + 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM', + 'only_matching': True, + }, { + 'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU', + 'only_matching': True, }] def _real_initialize(self): @@ -1657,20 +2546,32 @@ def _extract_mix(self, playlist_id): # The mixes are generated from a single video # the id of the playlist is just 'RD' + video_id - url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id) - webpage = self._download_webpage( - url, playlist_id, 'Downloading Youtube mix') + ids = [] + last_id = playlist_id[-11:] + for n in itertools.count(1): + url = 'https://youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id) + webpage = self._download_webpage( + url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n)) + new_ids = orderedSet(re.findall( + r'''(?xs)data-video-username=".*?".*? + href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id), + webpage)) + # Fetch new pages until all the videos are repeated, it seems that + # there are always 51 unique videos. + new_ids = [_id for _id in new_ids if _id not in ids] + if not new_ids: + break + ids.extend(new_ids) + last_id = ids[-1] + + url_results = self._ids_to_results(ids) + search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage) title_span = ( - search_title('playlist-title') or - search_title('title long-title') or - search_title('title')) + search_title('playlist-title') + or search_title('title long-title') + or search_title('title')) title = clean_html(title_span) - ids = orderedSet(re.findall( - r'''(?xs)data-video-username=".*?".*? - href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id), - webpage)) - url_results = self._ids_to_results(ids) return self.playlist_result(url_results, playlist_id, title) @@ -1678,14 +2579,18 @@ url = self._TEMPLATE_URL % playlist_id page = self._download_webpage(url, playlist_id) - for match in re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page): + # the yt-alert-message now has tabindex attribute (see https://github.com/ytdl-org/youtube-dl/issues/11604) + for match in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page): match = match.strip() # Check if the playlist exists or is private - if re.match(r'[^<]*(The|This) playlist (does not exist|is private)[^<]*', match): - raise ExtractorError( - 'The playlist doesn\'t exist or is private, use --username or ' - '--netrc to access it.', - expected=True) + mobj = re.match(r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', match) + if mobj: + reason = mobj.group('reason') + message = 'This playlist %s' % reason + if 'private' in reason: + message += ', use --username or --netrc to access it' + message += '.' + raise ExtractorError(message, expected=True) elif re.match(r'[^<]*Invalid parameters[^<]*', match): raise ExtractorError( 'Invalid parameters. 
Maybe URL is incorrect.', @@ -1697,20 +2602,55 @@ playlist_title = self._html_search_regex( r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>', - page, 'title') + page, 'title', default=None) + + _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref=' + uploader = self._search_regex( + r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE, + page, 'uploader', default=None) + mobj = re.search( + r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % _UPLOADER_BASE, + page) + if mobj: + uploader_id = mobj.group('uploader_id') + uploader_url = compat_urlparse.urljoin(url, mobj.group('path')) + else: + uploader_id = uploader_url = None + + has_videos = True + + if not playlist_title: + try: + # Some playlist URLs don't actually serve a playlist (e.g. + # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4) + next(self._entries(page, playlist_id)) + except StopIteration: + has_videos = False + + playlist = self.playlist_result( + self._entries(page, playlist_id), playlist_id, playlist_title) + playlist.update({ + 'uploader': uploader, + 'uploader_id': uploader_id, + 'uploader_url': uploader_url, + }) - return self.playlist_result(self._entries(page, playlist_id), playlist_id, playlist_title) + return has_videos, playlist def _check_download_just_video(self, url, playlist_id): # Check if it's a video-specific URL query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - if 'v' in query_dict: - video_id = query_dict['v'][0] + video_id = query_dict.get('v', [None])[0] or self._search_regex( + r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url, + 'video id', default=None) + if video_id: if self._downloader.params.get('noplaylist'): self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - return self.url_result(video_id, 'Youtube', video_id=video_id) + return video_id, self.url_result(video_id, 'Youtube', video_id=video_id) else: self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) + return video_id, None + return None, None def _real_extract(self, url): # Extract playlist id @@ -1719,20 +2659,28 @@ raise ExtractorError('Invalid URL: %s' % url) playlist_id = mobj.group(1) or mobj.group(2) - video = self._check_download_just_video(url, playlist_id) + video_id, video = self._check_download_just_video(url, playlist_id) if video: return video - if playlist_id.startswith('RD') or playlist_id.startswith('UL'): + if playlist_id.startswith(('RD', 'UL', 'PU')): # Mixes require a custom extraction process return self._extract_mix(playlist_id) - return self._extract_playlist(playlist_id) + has_videos, playlist = self._extract_playlist(playlist_id) + if has_videos or not video_id: + return playlist + + # Some playlist URLs don't actually serve a playlist (see + # https://github.com/ytdl-org/youtube-dl/issues/10537). + # Fallback to plain video extraction if there is a video id + # along with playlist id. 
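# A minimal walk-through of this fallback, using the issue-10537-style URL
# from the test cases (intermediate values are assumptions for illustration):
#
#     url = 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4'
#     video_id, video = self._check_download_just_video(url, playlist_id)
#     # -> ('FqZTN594JQw', None) when --no-playlist is not given
#     has_videos, playlist = self._extract_playlist(playlist_id)
#     # -> (False, <empty playlist>) since the list page renders no entries
#
# Only when has_videos is false AND a video id was found does extraction
# fall back to the single video: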
+ return self.url_result(video_id, 'Youtube', video_id=video_id) class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): IE_DESC = 'YouTube.com channels' - _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)' + _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)' _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos' _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?' IE_NAME = 'youtube:channel' @@ -1753,16 +2701,23 @@ 'id': 'UUs0ifCMCm1icqRbqhUINa0w', 'title': 'Uploads from Deus Ex', }, + }, { + 'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA', + 'only_matching': True, }] @classmethod def suitable(cls, url): - return False if YoutubePlaylistsIE.suitable(url) else super(YoutubeChannelIE, cls).suitable(url) + return (False if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url) + else super(YoutubeChannelIE, cls).suitable(url)) + + def _build_template_url(self, url, channel_id): + return self._TEMPLATE_URL % channel_id def _real_extract(self, url): channel_id = self._match_id(url) - url = self._TEMPLATE_URL % channel_id + url = self._build_template_url(url, channel_id) # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778) # Workaround by extracting as a playlist if managed to obtain channel playlist URL @@ -1776,9 +2731,13 @@ channel_playlist_id = self._html_search_meta( 'channelId', channel_page, 'channel id', default=None) if not channel_playlist_id: - channel_playlist_id = self._search_regex( - r'data-(?:channel-external-|yt)id="([^"]+)"', - channel_page, 'channel id', default=None) + channel_url = self._html_search_meta( + ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'), + channel_page, 'channel url', default=None) + if channel_url: + channel_playlist_id = self._search_regex( + r'vnd\.youtube://user/([0-9A-Za-z_-]+)', + channel_url, 'channel id', default=None) if channel_playlist_id and channel_playlist_id.startswith('UC'): playlist_id = 'UU' + channel_playlist_id[2:] return self.url_result( @@ -1801,36 +2760,121 @@ for video_id, video_title in self.extract_videos_from_page(channel_page)] return self.playlist_result(entries, channel_id) + try: + next(self._entries(channel_page, channel_id)) + except StopIteration: + alert_message = self._html_search_regex( + r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>', + channel_page, 'alert', default=None, group='alert') + if alert_message: + raise ExtractorError('Youtube said: %s' % alert_message, expected=True) + return self.playlist_result(self._entries(channel_page, channel_id), channel_id) class YoutubeUserIE(YoutubeChannelIE): IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)' - _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)' - _TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos' + _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)' + _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos' IE_NAME = 'youtube:user' _TESTS = [{ 'url': 'https://www.youtube.com/user/TheLinuxFoundation', 'playlist_mincount': 320, 'info_dict': { - 'title': 'TheLinuxFoundation', + 'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ', + 
'title': 'Uploads from The Linux Foundation', + } + }, { + # Only available via https://www.youtube.com/c/12minuteathlete/videos + # but not https://www.youtube.com/user/12minuteathlete/videos + 'url': 'https://www.youtube.com/c/12minuteathlete/videos', + 'playlist_mincount': 249, + 'info_dict': { + 'id': 'UUVjM-zV6_opMDx7WYxnjZiQ', + 'title': 'Uploads from 12 Minute Athlete', + } + }, { 'url': 'ytuser:phihag', 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/c/gametrailers', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/gametrailers', + 'only_matching': True, + }, { + # This channel is not available, geo restricted to JP + 'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos', + 'only_matching': True, }] @classmethod def suitable(cls, url): # Don't return True if the url can be extracted with another youtube # extractor, the regex is too permissive and it would match. - other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls) - if any(ie.suitable(url) for ie in other_ies): + other_yt_ies = iter(klass for (name, klass) in globals().items() if name.startswith('Youtube') and name.endswith('IE') and klass is not cls) + if any(ie.suitable(url) for ie in other_yt_ies): return False else: return super(YoutubeUserIE, cls).suitable(url) + def _build_template_url(self, url, channel_id): + mobj = re.match(self._VALID_URL, url) + return self._TEMPLATE_URL % (mobj.group('user') or 'user', mobj.group('id')) + + +class YoutubeLiveIE(YoutubeBaseInfoExtractor): + IE_DESC = 'YouTube.com live streams' + _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live' + IE_NAME = 'youtube:live' + + _TESTS = [{ + 'url': 'https://www.youtube.com/user/TheYoungTurks/live', + 'info_dict': { + 'id': 'a48o2S1cPoo', + 'ext': 'mp4', + 'title': 'The Young Turks - Live Main Show', + 'uploader': 'The Young Turks', + 'uploader_id': 'TheYoungTurks', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks', + 'upload_date': '20150715', + 'license': 'Standard YouTube License', + 'description': 'md5:438179573adcdff3c97ebb1ee632b891', + 'categories': ['News & Politics'], + 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'], + 'like_count': int, + 'dislike_count': int, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/c/CommanderVideoHq/live', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/TheYoungTurks/live', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + channel_id = mobj.group('id') + base_url = mobj.group('base_url') + webpage = self._download_webpage(url, channel_id, fatal=False) + if webpage: + page_type = self._og_search_property( + 'type', webpage, 'page type', default='') + video_id = self._html_search_meta( + 'videoId', webpage, 'video id', default=None) + if page_type.startswith('video') and video_id and re.match( + r'^[0-9A-Za-z_-]{11}$', video_id): + return self.url_result(video_id, YoutubeIE.ie_key()) + return self.url_result(base_url) + class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): IE_DESC = 'YouTube.com user/channel playlists' @@ -1838,7 +2882,7 @@ IE_NAME = 'youtube:playlists' _TESTS = [{ - 'url': 
'https://www.youtube.com/user/ThirstForScience/playlists', 'playlist_mincount': 4, 'info_dict': { 'id': 'ThirstForScience', @@ -1846,7 +2890,7 @@ }, }, { # with "Load more" button - 'url': 'http://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd', + 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd', 'playlist_mincount': 70, 'info_dict': { 'id': 'igorkle1', @@ -1862,7 +2906,11 @@ }] -class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE): +class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor): + _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?' + + +class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor): IE_DESC = 'YouTube.com searches' # there doesn't appear to be a real limit, for example if you search for # 'python' you get more than 8.000.000 results @@ -1878,29 +2926,34 @@ videos = [] limit = n + url_query = { + 'search_query': query.encode('utf-8'), + } + url_query.update(self._EXTRA_QUERY_ARGS) + result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query) + for pagenum in itertools.count(1): - url_query = { - 'search_query': query.encode('utf-8'), - 'page': pagenum, - 'spf': 'navigate', - } - url_query.update(self._EXTRA_QUERY_ARGS) - result_url = 'https://www.youtube.com/results?' + compat_urllib_parse.urlencode(url_query) data = self._download_json( result_url, video_id='query "%s"' % query, note='Downloading page %s' % pagenum, - errnote='Unable to download API page') + errnote='Unable to download API page', + query={'spf': 'navigate'}) html_content = data[1]['body']['content'] if 'class="search-message' in html_content: raise ExtractorError( '[youtube] No video results', expected=True) - new_videos = self._ids_to_results(orderedSet(re.findall( - r'href="/watch\?v=(.{11})', html_content))) + new_videos = list(self._process_page(html_content)) videos += new_videos if not new_videos or len(videos) > limit: break + next_link = self._html_search_regex( + r'href="(/results\?[^"]*\bsp=[^"]+)"[^>]*>\s*<span[^>]+class="[^"]*\byt-uix-button-content\b[^"]*"[^>]*>Next', + html_content, 'next link', default=None) + if next_link is None: + break + result_url = compat_urlparse.urljoin('https://www.youtube.com/', next_link) if len(videos) > n: videos = videos[:n] @@ -1914,7 +2967,7 @@ _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'} -class YoutubeSearchURLIE(InfoExtractor): +class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor): IE_DESC = 'YouTube.com search URLs' IE_NAME = 'youtube:search_url' _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)' @@ -1932,37 +2985,13 @@ def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) query = compat_urllib_parse_unquote_plus(mobj.group('query')) - webpage = self._download_webpage(url, query) - result_code = self._search_regex( - r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML') - - part_codes = re.findall( - r'(?s)<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*>(.*?)</h3>', result_code) - entries = [] - for part_code in part_codes: - part_title = self._html_search_regex( - [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False) - part_url_snippet = self._html_search_regex( - r'(?s)href="([^"]+)"', part_code, 'item URL') - part_url = compat_urlparse.urljoin( - 'https://www.youtube.com/', part_url_snippet) - entries.append({ - '_type': 'url', - 'url': part_url, - 'title': 
part_title, - }) - - return { - '_type': 'playlist', - 'entries': entries, - 'title': query, - } + return self.playlist_result(self._process_page(webpage), playlist_title=query) class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor): IE_DESC = 'YouTube.com (multi-season) shows' - _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)' IE_NAME = 'youtube:show' _TESTS = [{ 'url': 'https://www.youtube.com/show/airdisasters', @@ -1993,10 +3022,7 @@ def _real_initialize(self): self._login() - def _real_extract(self, url): - page = self._download_webpage( - 'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE) - + def _entries(self, page): # The extraction process is the same as for playlists, but the regex # for the video ids doesn't contain an index ids = [] @@ -2007,12 +3033,15 @@ # 'recommended' feed has infinite 'load more' and each new portion spins # the same videos in (sometimes) slightly different order, so we'll check # for unicity and break when portion has no new videos - new_ids = filter(lambda video_id: video_id not in ids, orderedSet(matches)) + new_ids = list(filter(lambda video_id: video_id not in ids, orderedSet(matches))) if not new_ids: break ids.extend(new_ids) + for entry in self._ids_to_results(new_ids): + yield entry + mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) if not mobj: break @@ -2024,14 +3053,18 @@ content_html = more['content_html'] more_widget_html = more['load_more_widget_html'] + def _real_extract(self, url): + page = self._download_webpage( + 'https://www.youtube.com/feed/%s' % self._FEED_NAME, + self._PLAYLIST_TITLE) return self.playlist_result( - self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE) + self._entries(page), playlist_title=self._PLAYLIST_TITLE) class YoutubeWatchLaterIE(YoutubePlaylistIE): IE_NAME = 'youtube:watchlater' IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)' - _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater' _TESTS = [{ 'url': 'https://www.youtube.com/playlist?list=WL', @@ -2042,16 +3075,17 @@ }] def _real_extract(self, url): - video = self._check_download_just_video(url, 'WL') + _, video = self._check_download_just_video(url, 'WL') if video: return video - return self._extract_playlist('WL') + _, playlist = self._extract_playlist('WL') + return playlist class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): IE_NAME = 'youtube:favorites' IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)' - _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?' _LOGIN_REQUIRED = True def _real_extract(self, url): @@ -2062,21 +3096,21 @@ class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)' - _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?' 
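# A sketch of how this feed class (and its siblings below) plugs into
# YoutubeFeedsInfoExtractor._real_extract above; only attributes visible in
# this patch are assumed:
#
#     page = self._download_webpage(
#         'https://www.youtube.com/feed/%s' % self._FEED_NAME,  # 'recommended'
#         self._PLAYLIST_TITLE)
#     return self.playlist_result(
#         self._entries(page), playlist_title=self._PLAYLIST_TITLE)
#
# The relaxed host pattern r'https?://(?:www\.)?youtube\.com/...' also
# accepts URLs without the 'www.' prefix, e.g.
# 'https://youtube.com/feed/recommended'.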
_FEED_NAME = 'recommended' _PLAYLIST_TITLE = 'Youtube Recommended videos' class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)' - _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?' _FEED_NAME = 'subscriptions' _PLAYLIST_TITLE = 'Youtube Subscriptions' class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)' - _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory' _FEED_NAME = 'history' _PLAYLIST_TITLE = 'Youtube History' @@ -2101,10 +3135,10 @@ ''' _TESTS = [{ - 'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041', + 'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041', 'only_matching': True, }, { - 'url': 'http://www.youtube.com/watch?', + 'url': 'https://www.youtube.com/watch?', 'only_matching': True, }, { 'url': 'https://www.youtube.com/watch?x-yt-cl=84503534', @@ -2125,7 +3159,7 @@ 'Did you forget to quote the URL? Remember that & is a meta ' 'character in most shells, so you want to put the URL in quotes, ' 'like youtube-dl ' - '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" ' + '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" ' ' or simply youtube-dl BaW_jenozKc .', expected=True) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/zapiks.py youtube-dl-2019.05.20/youtube_dl/extractor/zapiks.py --- youtube-dl-2016.02.22/youtube_dl/extractor/zapiks.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/zapiks.py 2019-06-07 18:49:07.000000000 +0000 @@ -24,7 +24,7 @@ 'ext': 'mp4', 'title': 'EP2S3 - Bon Appétit - Eh bé viva les pyrénées con!', 'description': 'md5:7054d6f6f620c6519be1fe710d4da847', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 528, 'timestamp': 1359044972, 'upload_date': '20130124', diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/zaq1.py youtube-dl-2019.05.20/youtube_dl/extractor/zaq1.py --- youtube-dl-2016.02.22/youtube_dl/extractor/zaq1.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/zaq1.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,101 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + unified_timestamp, +) + + +class Zaq1IE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?zaq1\.pl/video/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'http://zaq1.pl/video/xev0e', + 'md5': '24a5eb3f052e604ae597c4d0d19b351e', + 'info_dict': { + 'id': 'xev0e', + 'title': 'DJ NA WESELE. 
TANIEC Z FIGURAMI.węgrów/sokołów podlaski/siedlce/mińsk mazowiecki/warszawa', + 'description': 'www.facebook.com/weseledjKontakt: 728 448 199 / 505 419 147', + 'ext': 'mp4', + 'duration': 511, + 'timestamp': 1490896361, + 'uploader': 'Anonim', + 'upload_date': '20170330', + 'view_count': int, + } + }, { + # malformed JSON-LD + 'url': 'http://zaq1.pl/video/x81vn', + 'info_dict': { + 'id': 'x81vn', + 'title': 'SEKRETNE ŻYCIE WALTERA MITTY', + 'ext': 'mp4', + 'duration': 6234, + 'timestamp': 1493494860, + 'uploader': 'Anonim', + 'upload_date': '20170429', + 'view_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Failed to parse JSON'], + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video_url = self._search_regex( + r'data-video-url=(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'video url', group='url') + + info = self._search_json_ld(webpage, video_id, fatal=False) + + def extract_data(field, name, fatal=False): + return self._search_regex( + r'data-%s=(["\'])(?P<field>(?:(?!\1).)+)\1' % field, + webpage, field, fatal=fatal, group='field') + + if not info.get('title'): + info['title'] = extract_data('file-name', 'title', fatal=True) + + if not info.get('duration'): + info['duration'] = int_or_none(extract_data('duration', 'duration')) + + if not info.get('thumbnail'): + info['thumbnail'] = extract_data('photo-url', 'thumbnail') + + if not info.get('timestamp'): + info['timestamp'] = unified_timestamp(self._html_search_meta( + 'uploadDate', webpage, 'timestamp')) + + if not info.get('interactionCount'): + info['view_count'] = int_or_none(self._html_search_meta( + 'interactionCount', webpage, 'view count')) + + uploader = self._html_search_regex( + r'Wideo dodał:\s*<a[^>]*>([^<]+)</a>', webpage, 'uploader', + fatal=False) + + width = int_or_none(self._html_search_meta( + 'width', webpage, fatal=False)) + height = int_or_none(self._html_search_meta( + 'height', webpage, fatal=False)) + + info.update({ + 'id': video_id, + 'formats': [{ + 'url': video_url, + 'width': width, + 'height': height, + 'http_headers': { + 'Referer': url, + }, + }], + 'uploader': uploader, + }) + + return info diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/zattoo.py youtube-dl-2019.05.20/youtube_dl/extractor/zattoo.py --- youtube-dl-2016.02.22/youtube_dl/extractor/zattoo.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/zattoo.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,433 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +from uuid import uuid4 + +from .common import InfoExtractor +from ..compat import ( + compat_HTTPError, + compat_str, +) +from ..utils import ( + ExtractorError, + int_or_none, + try_get, + url_or_none, + urlencode_postdata, +) + + +class ZattooPlatformBaseIE(InfoExtractor): + _power_guide_hash = None + + def _host_url(self): + return 'https://%s' % (self._API_HOST if hasattr(self, '_API_HOST') else self._HOST) + + def _login(self): + username, password = self._get_login_info() + if not username or not password: + self.raise_login_required( + 'A valid %s account is needed to access this media.' 
+ % self._NETRC_MACHINE) + + try: + data = self._download_json( + '%s/zapi/v2/account/login' % self._host_url(), None, 'Logging in', + data=urlencode_postdata({ + 'login': username, + 'password': password, + 'remember': 'true', + }), headers={ + 'Referer': '%s/login' % self._host_url(), + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + raise ExtractorError( + 'Unable to login: incorrect username and/or password', + expected=True) + raise + + self._power_guide_hash = data['session']['power_guide_hash'] + + def _real_initialize(self): + webpage = self._download_webpage( + self._host_url(), None, 'Downloading app token') + app_token = self._html_search_regex( + r'appToken\s*=\s*(["\'])(?P<token>(?:(?!\1).)+?)\1', + webpage, 'app token', group='token') + app_version = self._html_search_regex( + r'<!--\w+-(.+?)-', webpage, 'app version', default='2.8.2') + + # Will setup appropriate cookies + self._request_webpage( + '%s/zapi/v2/session/hello' % self._host_url(), None, + 'Opening session', data=urlencode_postdata({ + 'client_app_token': app_token, + 'uuid': compat_str(uuid4()), + 'lang': 'en', + 'app_version': app_version, + 'format': 'json', + })) + + self._login() + + def _extract_cid(self, video_id, channel_name): + channel_groups = self._download_json( + '%s/zapi/v2/cached/channels/%s' % (self._host_url(), + self._power_guide_hash), + video_id, 'Downloading channel list', + query={'details': False})['channel_groups'] + channel_list = [] + for chgrp in channel_groups: + channel_list.extend(chgrp['channels']) + try: + return next( + chan['cid'] for chan in channel_list + if chan.get('cid') and ( + chan.get('display_alias') == channel_name + or chan.get('cid') == channel_name)) + except StopIteration: + raise ExtractorError('Could not extract channel id') + + def _extract_cid_and_video_info(self, video_id): + data = self._download_json( + '%s/zapi/v2/cached/program/power_details/%s' % ( + self._host_url(), self._power_guide_hash), + video_id, + 'Downloading video information', + query={ + 'program_ids': video_id, + 'complete': True, + }) + + p = data['programs'][0] + cid = p['cid'] + + info_dict = { + 'id': video_id, + 'title': p.get('t') or p['et'], + 'description': p.get('d'), + 'thumbnail': p.get('i_url'), + 'creator': p.get('channel_name'), + 'episode': p.get('et'), + 'episode_number': int_or_none(p.get('e_no')), + 'season_number': int_or_none(p.get('s_no')), + 'release_year': int_or_none(p.get('year')), + 'categories': try_get(p, lambda x: x['c'], list), + 'tags': try_get(p, lambda x: x['g'], list) + } + + return cid, info_dict + + def _extract_formats(self, cid, video_id, record_id=None, is_live=False): + postdata_common = { + 'https_watch_urls': True, + } + + if is_live: + postdata_common.update({'timeshift': 10800}) + url = '%s/zapi/watch/live/%s' % (self._host_url(), cid) + elif record_id: + url = '%s/zapi/watch/recording/%s' % (self._host_url(), record_id) + else: + url = '%s/zapi/watch/recall/%s/%s' % (self._host_url(), cid, video_id) + + formats = [] + for stream_type in ('dash', 'hls', 'hls5', 'hds'): + postdata = postdata_common.copy() + postdata['stream_type'] = stream_type + + data = self._download_json( + url, video_id, 'Downloading %s formats' % stream_type.upper(), + data=urlencode_postdata(postdata), fatal=False) + if not data: + continue + + watch_urls = try_get( + data, lambda x: x['stream']['watch_urls'], list) + if not watch_urls: + continue + + 
for watch in watch_urls: + if not isinstance(watch, dict): + continue + watch_url = url_or_none(watch.get('url')) + if not watch_url: + continue + format_id_list = [stream_type] + maxrate = watch.get('maxrate') + if maxrate: + format_id_list.append(compat_str(maxrate)) + audio_channel = watch.get('audio_channel') + if audio_channel: + format_id_list.append(compat_str(audio_channel)) + preference = 1 if audio_channel == 'A' else None + format_id = '-'.join(format_id_list) + if stream_type in ('dash', 'dash_widevine', 'dash_playready'): + this_formats = self._extract_mpd_formats( + watch_url, video_id, mpd_id=format_id, fatal=False) + elif stream_type in ('hls', 'hls5', 'hls5_fairplay'): + this_formats = self._extract_m3u8_formats( + watch_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id=format_id, + fatal=False) + elif stream_type == 'hds': + this_formats = self._extract_f4m_formats( + watch_url, video_id, f4m_id=format_id, fatal=False) + elif stream_type == 'smooth_playready': + this_formats = self._extract_ism_formats( + watch_url, video_id, ism_id=format_id, fatal=False) + else: + assert False + for this_format in this_formats: + this_format['preference'] = preference + formats.extend(this_formats) + self._sort_formats(formats) + return formats + + def _extract_video(self, channel_name, video_id, record_id=None, is_live=False): + if is_live: + cid = self._extract_cid(video_id, channel_name) + info_dict = { + 'id': channel_name, + 'title': self._live_title(channel_name), + 'is_live': True, + } + else: + cid, info_dict = self._extract_cid_and_video_info(video_id) + formats = self._extract_formats( + cid, video_id, record_id=record_id, is_live=is_live) + info_dict['formats'] = formats + return info_dict + + +class QuicklineBaseIE(ZattooPlatformBaseIE): + _NETRC_MACHINE = 'quickline' + _HOST = 'mobiltv.quickline.com' + + +class QuicklineIE(QuicklineBaseIE): + _VALID_URL = r'https?://(?:www\.)?%s/watch/(?P<channel>[^/]+)/(?P<id>[0-9]+)' % re.escape(QuicklineBaseIE._HOST) + + _TEST = { + 'url': 'https://mobiltv.quickline.com/watch/prosieben/130671867-maze-runner-die-auserwaehlten-in-der-brandwueste', + 'only_matching': True, + } + + def _real_extract(self, url): + channel_name, video_id = re.match(self._VALID_URL, url).groups() + return self._extract_video(channel_name, video_id) + + +class QuicklineLiveIE(QuicklineBaseIE): + _VALID_URL = r'https?://(?:www\.)?%s/watch/(?P<id>[^/]+)' % re.escape(QuicklineBaseIE._HOST) + + _TEST = { + 'url': 'https://mobiltv.quickline.com/watch/srf1', + 'only_matching': True, + } + + @classmethod + def suitable(cls, url): + return False if QuicklineIE.suitable(url) else super(QuicklineLiveIE, cls).suitable(url) + + def _real_extract(self, url): + channel_name = video_id = self._match_id(url) + return self._extract_video(channel_name, video_id, is_live=True) + + +class ZattooBaseIE(ZattooPlatformBaseIE): + _NETRC_MACHINE = 'zattoo' + _HOST = 'zattoo.com' + + +def _make_valid_url(tmpl, host): + return tmpl % re.escape(host) + + +class ZattooIE(ZattooBaseIE): + _VALID_URL_TEMPLATE = r'https?://(?:www\.)?%s/watch/(?P<channel>[^/]+?)/(?P<id>[0-9]+)[^/]+(?:/(?P<recid>[0-9]+))?' + _VALID_URL = _make_valid_url(_VALID_URL_TEMPLATE, ZattooBaseIE._HOST) + + # Since regular videos are only available for 7 days and recorded videos + # are only available for a specific user, we cannot have detailed tests. 
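# A hedged illustration of the per-brand pattern that _make_valid_url()
# expands to (host value borrowed from NetPlusIE below, checked only for
# plausibility):
#
#     >>> import re
#     >>> pattern = ZattooIE._VALID_URL_TEMPLATE % re.escape('netplus.tv')
#     >>> bool(re.match(pattern, 'https://www.netplus.tv/watch/abc/123-abc'))
#     True
#
# Each reseller subclass below only swaps in its _HOST (plus _API_HOST where
# the API lives on a www. subdomain); login, session handling and format
# extraction are inherited unchanged from ZattooPlatformBaseIE.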
+ _TESTS = [{ + 'url': 'https://zattoo.com/watch/prosieben/130671867-maze-runner-die-auserwaehlten-in-der-brandwueste', + 'only_matching': True, + }, { + 'url': 'https://zattoo.com/watch/srf_zwei/132905652-eishockey-spengler-cup/102791477/1512211800000/1514433500000/92000', + 'only_matching': True, + }] + + def _real_extract(self, url): + channel_name, video_id, record_id = re.match(self._VALID_URL, url).groups() + return self._extract_video(channel_name, video_id, record_id) + + +class ZattooLiveIE(ZattooBaseIE): + _VALID_URL = r'https?://(?:www\.)?zattoo\.com/watch/(?P<id>[^/]+)' + + _TEST = { + 'url': 'https://zattoo.com/watch/srf1', + 'only_matching': True, + } + + @classmethod + def suitable(cls, url): + return False if ZattooIE.suitable(url) else super(ZattooLiveIE, cls).suitable(url) + + def _real_extract(self, url): + channel_name = video_id = self._match_id(url) + return self._extract_video(channel_name, video_id, is_live=True) + + +class NetPlusIE(ZattooIE): + _NETRC_MACHINE = 'netplus' + _HOST = 'netplus.tv' + _API_HOST = 'www.%s' % _HOST + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://www.netplus.tv/watch/abc/123-abc', + 'only_matching': True, + }] + + +class MNetTVIE(ZattooIE): + _NETRC_MACHINE = 'mnettv' + _HOST = 'tvplus.m-net.de' + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://tvplus.m-net.de/watch/abc/123-abc', + 'only_matching': True, + }] + + +class WalyTVIE(ZattooIE): + _NETRC_MACHINE = 'walytv' + _HOST = 'player.waly.tv' + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://player.waly.tv/watch/abc/123-abc', + 'only_matching': True, + }] + + +class BBVTVIE(ZattooIE): + _NETRC_MACHINE = 'bbvtv' + _HOST = 'bbv-tv.net' + _API_HOST = 'www.%s' % _HOST + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://www.bbv-tv.net/watch/abc/123-abc', + 'only_matching': True, + }] + + +class VTXTVIE(ZattooIE): + _NETRC_MACHINE = 'vtxtv' + _HOST = 'vtxtv.ch' + _API_HOST = 'www.%s' % _HOST + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://www.vtxtv.ch/watch/abc/123-abc', + 'only_matching': True, + }] + + +class MyVisionTVIE(ZattooIE): + _NETRC_MACHINE = 'myvisiontv' + _HOST = 'myvisiontv.ch' + _API_HOST = 'www.%s' % _HOST + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://www.myvisiontv.ch/watch/abc/123-abc', + 'only_matching': True, + }] + + +class GlattvisionTVIE(ZattooIE): + _NETRC_MACHINE = 'glattvisiontv' + _HOST = 'iptv.glattvision.ch' + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://iptv.glattvision.ch/watch/abc/123-abc', + 'only_matching': True, + }] + + +class SAKTVIE(ZattooIE): + _NETRC_MACHINE = 'saktv' + _HOST = 'saktv.ch' + _API_HOST = 'www.%s' % _HOST + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://www.saktv.ch/watch/abc/123-abc', + 'only_matching': True, + }] + + +class EWETVIE(ZattooIE): + _NETRC_MACHINE = 'ewetv' + _HOST = 'tvonline.ewe.de' + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://tvonline.ewe.de/watch/abc/123-abc', + 'only_matching': True, + }] + + +class QuantumTVIE(ZattooIE): + _NETRC_MACHINE = 'quantumtv' + _HOST = 'quantum-tv.com' + _API_HOST = 'www.%s' % _HOST + _VALID_URL = 
_make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://www.quantum-tv.com/watch/abc/123-abc', + 'only_matching': True, + }] + + +class OsnatelTVIE(ZattooIE): + _NETRC_MACHINE = 'osnateltv' + _HOST = 'tvonline.osnatel.de' + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://tvonline.osnatel.de/watch/abc/123-abc', + 'only_matching': True, + }] + + +class EinsUndEinsTVIE(ZattooIE): + _NETRC_MACHINE = '1und1tv' + _HOST = '1und1.tv' + _API_HOST = 'www.%s' % _HOST + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://www.1und1.tv/watch/abc/123-abc', + 'only_matching': True, + }] + + +class SaltTVIE(ZattooIE): + _NETRC_MACHINE = 'salttv' + _HOST = 'tv.salt.ch' + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://tv.salt.ch/watch/abc/123-abc', + 'only_matching': True, + }] diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/zdf.py youtube-dl-2019.05.20/youtube_dl/extractor/zdf.py --- youtube-dl-2016.02.22/youtube_dl/extractor/zdf.py 2016-01-09 00:15:59.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/zdf.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,250 +1,319 @@ # coding: utf-8 from __future__ import unicode_literals -import functools import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( - int_or_none, - unified_strdate, - OnDemandPagedList, - xpath_text, determine_ext, + int_or_none, + NO_DEFAULT, + orderedSet, + parse_codecs, qualities, - float_or_none, - ExtractorError, + try_get, + unified_timestamp, + update_url_query, + url_or_none, + urljoin, ) -class ZDFIE(InfoExtractor): - _VALID_URL = r'(?:zdf:|zdf:video:|https?://www\.zdf\.de/ZDFmediathek(?:#)?/(.*beitrag/(?:video/)?))(?P<id>[0-9]+)(?:/[^/?]+)?(?:\?.*)?' +class ZDFBaseIE(InfoExtractor): + def _call_api(self, url, player, referrer, video_id, item): + return self._download_json( + url, video_id, 'Downloading JSON %s' % item, + headers={ + 'Referer': referrer, + 'Api-Auth': 'Bearer %s' % player['apiToken'], + }) + + def _extract_player(self, webpage, video_id, fatal=True): + return self._parse_json( + self._search_regex( + r'(?s)data-zdfplayer-jsb=(["\'])(?P<json>{.+?})\1', webpage, + 'player JSON', default='{}' if not fatal else NO_DEFAULT, + group='json'), + video_id) + + +class ZDFIE(ZDFBaseIE): + _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P<id>[^/?]+)\.html' + _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh') _TESTS = [{ - 'url': 'http://www.zdf.de/ZDFmediathek/beitrag/video/2037704/ZDFspezial---Ende-des-Machtpokers--?bc=sts;stt', + 'url': 'https://www.zdf.de/dokumentation/terra-x/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100.html', 'info_dict': { - 'id': '2037704', - 'ext': 'webm', - 'title': 'ZDFspezial - Ende des Machtpokers', - 'description': 'Union und SPD haben sich auf einen Koalitionsvertrag geeinigt. Aber was bedeutet das für die Bürger? 
Sehen Sie hierzu das ZDFspezial "Ende des Machtpokers - Große Koalition für Deutschland".', - 'duration': 1022, - 'uploader': 'spezial', - 'uploader_id': '225948', - 'upload_date': '20131127', + 'id': 'die-magie-der-farben-von-koenigspurpur-und-jeansblau-100', + 'ext': 'mp4', + 'title': 'Die Magie der Farben (2/2)', + 'description': 'md5:a89da10c928c6235401066b60a6d5c1a', + 'duration': 2615, + 'timestamp': 1465021200, + 'upload_date': '20160604', }, - 'skip': 'Videos on ZDF.de are depublicised in short order', + }, { + 'url': 'https://www.zdf.de/service-und-hilfe/die-neue-zdf-mediathek/zdfmediathek-trailer-100.html', + 'only_matching': True, + }, { + 'url': 'https://www.zdf.de/filme/taunuskrimi/die-lebenden-und-die-toten-1---ein-taunuskrimi-100.html', + 'only_matching': True, + }, { + 'url': 'https://www.zdf.de/dokumentation/planet-e/planet-e-uebersichtsseite-weitere-dokumentationen-von-planet-e-100.html', + 'only_matching': True, }] - def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): - param_groups = {} - for param_group in smil.findall(self._xpath_ns('./head/paramGroup', namespace)): - group_id = param_group.attrib.get(self._xpath_ns('id', 'http://www.w3.org/XML/1998/namespace')) - params = {} - for param in param_group: - params[param.get('name')] = param.get('value') - param_groups[group_id] = params + @staticmethod + def _extract_subtitles(src): + subtitles = {} + for caption in try_get(src, lambda x: x['captions'], list) or []: + subtitle_url = url_or_none(caption.get('uri')) + if subtitle_url: + lang = caption.get('language', 'deu') + subtitles.setdefault(lang, []).append({ + 'url': subtitle_url, + }) + return subtitles + + def _extract_format(self, video_id, formats, format_urls, meta): + format_url = url_or_none(meta.get('url')) + if not format_url: + return + if format_url in format_urls: + return + format_urls.add(format_url) + mime_type = meta.get('mimeType') + ext = determine_ext(format_url) + if mime_type == 'application/x-mpegURL' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', m3u8_id='hls', + entry_protocol='m3u8_native', fatal=False)) + elif mime_type == 'application/f4m+xml' or ext == 'f4m': + formats.extend(self._extract_f4m_formats( + update_url_query(format_url, {'hdcore': '3.7.0'}), video_id, f4m_id='hds', fatal=False)) + else: + f = parse_codecs(meta.get('mimeCodec')) + format_id = ['http'] + for p in (meta.get('type'), meta.get('quality')): + if p and isinstance(p, compat_str): + format_id.append(p) + f.update({ + 'url': format_url, + 'format_id': '-'.join(format_id), + 'format_note': meta.get('quality'), + 'language': meta.get('language'), + 'quality': qualities(self._QUALITIES)(meta.get('quality')), + 'preference': -10, + }) + formats.append(f) + + def _extract_entry(self, url, player, content, video_id): + title = content.get('title') or content['teaserHeadline'] + + t = content['mainVideoContent']['http://zdf.de/rels/target'] + + ptmd_path = t.get('http://zdf.de/rels/streams/ptmd') + + if not ptmd_path: + ptmd_path = t[ + 'http://zdf.de/rels/streams/ptmd-template'].replace( + '{playerId}', 'portal') + + ptmd = self._call_api( + urljoin(url, ptmd_path), player, url, video_id, 'metadata') formats = [] - for video in smil.findall(self._xpath_ns('.//video', namespace)): - src = video.get('src') - if not src: + track_uris = set() + for p in ptmd['priorityList']: + formitaeten = p.get('formitaeten') + if not isinstance(formitaeten, list): continue - bitrate 
= float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) - group_id = video.get('paramGroup') - param_group = param_groups[group_id] - for proto in param_group['protocols'].split(','): - formats.append({ - 'url': '%s://%s' % (proto, param_group['host']), - 'app': param_group['app'], - 'play_path': src, - 'ext': 'flv', - 'format_id': '%s-%d' % (proto, bitrate), - 'tbr': bitrate, - }) + for f in formitaeten: + f_qualities = f.get('qualities') + if not isinstance(f_qualities, list): + continue + for quality in f_qualities: + tracks = try_get(quality, lambda x: x['audio']['tracks'], list) + if not tracks: + continue + for track in tracks: + self._extract_format( + video_id, formats, track_uris, { + 'url': track.get('uri'), + 'type': f.get('type'), + 'mimeType': f.get('mimeType'), + 'quality': quality.get('quality'), + 'language': track.get('language'), + }) self._sort_formats(formats) - return formats - def extract_from_xml_url(self, video_id, xml_url): - doc = self._download_xml( - xml_url, video_id, - note='Downloading video info', - errnote='Failed to download video info') - - status_code = doc.find('./status/statuscode') - if status_code is not None and status_code.text != 'ok': - code = status_code.text - if code == 'notVisibleAnymore': - message = 'Video %s is not available' % video_id - else: - message = '%s returned error: %s' % (self.IE_NAME, code) - raise ExtractorError(message, expected=True) - - title = doc.find('.//information/title').text - description = xpath_text(doc, './/information/detail', 'description') - duration = int_or_none(xpath_text(doc, './/details/lengthSec', 'duration')) - uploader = xpath_text(doc, './/details/originChannelTitle', 'uploader') - uploader_id = xpath_text(doc, './/details/originChannelId', 'uploader id') - upload_date = unified_strdate(xpath_text(doc, './/details/airtime', 'upload date')) - - def xml_to_thumbnails(fnode): - thumbnails = [] - for node in fnode: - thumbnail_url = node.text - if not thumbnail_url: + thumbnails = [] + layouts = try_get( + content, lambda x: x['teaserImageRef']['layouts'], dict) + if layouts: + for layout_key, layout_url in layouts.items(): + layout_url = url_or_none(layout_url) + if not layout_url: continue thumbnail = { - 'url': thumbnail_url, + 'url': layout_url, + 'format_id': layout_key, } - if 'key' in node.attrib: - m = re.match('^([0-9]+)x([0-9]+)$', node.attrib['key']) - if m: - thumbnail['width'] = int(m.group(1)) - thumbnail['height'] = int(m.group(2)) + mobj = re.search(r'(?P<width>\d+)x(?P<height>\d+)', layout_key) + if mobj: + thumbnail.update({ + 'width': int(mobj.group('width')), + 'height': int(mobj.group('height')), + }) thumbnails.append(thumbnail) - return thumbnails - thumbnails = xml_to_thumbnails(doc.findall('.//teaserimages/teaserimage')) + return { + 'id': video_id, + 'title': title, + 'description': content.get('leadParagraph') or content.get('teasertext'), + 'duration': int_or_none(t.get('duration')), + 'timestamp': unified_timestamp(content.get('editorialDate')), + 'thumbnails': thumbnails, + 'subtitles': self._extract_subtitles(ptmd), + 'formats': formats, + } - format_nodes = doc.findall('.//formitaeten/formitaet') - quality = qualities(['veryhigh', 'high', 'med', 'low']) + def _extract_regular(self, url, player, video_id): + content = self._call_api( + player['content'], player, url, video_id, 'content') + return self._extract_entry(player['content'], player, content, video_id) + + def _extract_mobile(self, video_id): + document = self._download_json( + 
'https://zdf-cdn.live.cellular.de/mediathekV2/document/%s' % video_id, + video_id)['document'] - def get_quality(elem): - return quality(xpath_text(elem, 'quality')) - format_nodes.sort(key=get_quality) - format_ids = [] - formats = [] - for fnode in format_nodes: - video_url = fnode.find('url').text - is_available = 'http://www.metafilegenerator' not in video_url - if not is_available: - continue - format_id = fnode.attrib['basetype'] - quality = xpath_text(fnode, './quality', 'quality') - format_m = re.match(r'''(?x) - (?P<vcodec>[^_]+)_(?P<acodec>[^_]+)_(?P<container>[^_]+)_ - (?P<proto>[^_]+)_(?P<index>[^_]+)_(?P<indexproto>[^_]+) - ''', format_id) - - ext = determine_ext(video_url, None) or format_m.group('container') - if ext not in ('smil', 'f4m', 'm3u8'): - format_id = format_id + '-' + quality - if format_id in format_ids: - continue - - if ext == 'meta': - continue - elif ext == 'smil': - formats.extend(self._extract_smil_formats( - video_url, video_id, fatal=False)) - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - video_url, video_id, f4m_id=format_id, fatal=False)) - else: - proto = format_m.group('proto').lower() - - abr = int_or_none(xpath_text(fnode, './audioBitrate', 'abr'), 1000) - vbr = int_or_none(xpath_text(fnode, './videoBitrate', 'vbr'), 1000) - - width = int_or_none(xpath_text(fnode, './width', 'width')) - height = int_or_none(xpath_text(fnode, './height', 'height')) - - filesize = int_or_none(xpath_text(fnode, './filesize', 'filesize')) - - format_note = '' - if not format_note: - format_note = None - - formats.append({ - 'format_id': format_id, - 'url': video_url, - 'ext': ext, - 'acodec': format_m.group('acodec'), - 'vcodec': format_m.group('vcodec'), - 'abr': abr, - 'vbr': vbr, - 'width': width, - 'height': height, - 'filesize': filesize, - 'format_note': format_note, - 'protocol': proto, - '_available': is_available, - }) - format_ids.append(format_id) + title = document['titel'] + formats = [] + format_urls = set() + for f in document['formitaeten']: + self._extract_format(video_id, formats, format_urls, f) self._sort_formats(formats) + thumbnails = [] + teaser_bild = document.get('teaserBild') + if isinstance(teaser_bild, dict): + for thumbnail_key, thumbnail in teaser_bild.items(): + thumbnail_url = try_get( + thumbnail, lambda x: x['url'], compat_str) + if thumbnail_url: + thumbnails.append({ + 'url': thumbnail_url, + 'id': thumbnail_key, + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + return { 'id': video_id, 'title': title, - 'description': description, - 'duration': duration, + 'description': document.get('beschreibung'), + 'duration': int_or_none(document.get('length')), + 'timestamp': unified_timestamp(try_get( + document, lambda x: x['meta']['editorialDate'], compat_str)), 'thumbnails': thumbnails, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'upload_date': upload_date, + 'subtitles': self._extract_subtitles(document), 'formats': formats, } def _real_extract(self, url): video_id = self._match_id(url) - xml_url = 'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id - return self.extract_from_xml_url(video_id, xml_url) + webpage = self._download_webpage(url, video_id, fatal=False) + if webpage: + player = self._extract_player(webpage, url, fatal=False) + if player: + return self._extract_regular(url, player, 
video_id) + + return self._extract_mobile(video_id) -class ZDFChannelIE(InfoExtractor): - _VALID_URL = r'(?:zdf:topic:|https?://www\.zdf\.de/ZDFmediathek(?:#)?/.*kanaluebersicht/(?:[^/]+/)?)(?P<id>[0-9]+)' + +class ZDFChannelIE(ZDFBaseIE): + _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)' _TESTS = [{ - 'url': 'http://www.zdf.de/ZDFmediathek#/kanaluebersicht/1586442/sendung/Titanic', + 'url': 'https://www.zdf.de/sport/das-aktuelle-sportstudio', 'info_dict': { - 'id': '1586442', + 'id': 'das-aktuelle-sportstudio', + 'title': 'das aktuelle sportstudio | ZDF', }, - 'playlist_count': 3, - }, { - 'url': 'http://www.zdf.de/ZDFmediathek/kanaluebersicht/aktuellste/332', - 'only_matching': True, + 'playlist_count': 21, }, { - 'url': 'http://www.zdf.de/ZDFmediathek/kanaluebersicht/meist-gesehen/332', - 'only_matching': True, + 'url': 'https://www.zdf.de/dokumentation/planet-e', + 'info_dict': { + 'id': 'planet-e', + 'title': 'planet e.', + }, + 'playlist_count': 4, }, { - 'url': 'http://www.zdf.de/ZDFmediathek/kanaluebersicht/_/1798716?bc=nrt;nrm?flash=off', + 'url': 'https://www.zdf.de/filme/taunuskrimi/', 'only_matching': True, }] - _PAGE_SIZE = 50 - def _fetch_page(self, channel_id, page): - offset = page * self._PAGE_SIZE - xml_url = ( - 'http://www.zdf.de/ZDFmediathek/xmlservice/web/aktuellste?ak=web&offset=%d&maxLength=%d&id=%s' - % (offset, self._PAGE_SIZE, channel_id)) - doc = self._download_xml( - xml_url, channel_id, - note='Downloading channel info', - errnote='Failed to download channel info') - - title = doc.find('.//information/title').text - description = doc.find('.//information/detail').text - for asset in doc.findall('.//teasers/teaser'): - a_type = asset.find('./type').text - a_id = asset.find('./details/assetId').text - if a_type not in ('video', 'topic'): - continue - yield { - '_type': 'url', - 'playlist_title': title, - 'playlist_description': description, - 'url': 'zdf:%s:%s' % (a_type, a_id), - } + @classmethod + def suitable(cls, url): + return False if ZDFIE.suitable(url) else super(ZDFChannelIE, cls).suitable(url) def _real_extract(self, url): channel_id = self._match_id(url) - entries = OnDemandPagedList( - functools.partial(self._fetch_page, channel_id), self._PAGE_SIZE) - return { - '_type': 'playlist', - 'id': channel_id, - 'entries': entries, - } + webpage = self._download_webpage(url, channel_id) + + entries = [ + self.url_result(item_url, ie=ZDFIE.ie_key()) + for item_url in orderedSet(re.findall( + r'data-plusbar-url=["\'](http.+?\.html)', webpage))] + + return self.playlist_result( + entries, channel_id, self._og_search_title(webpage, fatal=False)) + + r""" + player = self._extract_player(webpage, channel_id) + + channel_id = self._search_regex( + r'docId\s*:\s*(["\'])(?P<id>(?!\1).+?)\1', webpage, + 'channel id', group='id') + + channel = self._call_api( + 'https://api.zdf.de/content/documents/%s.json' % channel_id, + player, url, channel_id) + + items = [] + for module in channel['module']: + for teaser in try_get(module, lambda x: x['teaser'], list) or []: + t = try_get( + teaser, lambda x: x['http://zdf.de/rels/target'], dict) + if not t: + continue + items.extend(try_get( + t, + lambda x: x['resultsWithVideo']['http://zdf.de/rels/search/results'], + list) or []) + items.extend(try_get( + module, + lambda x: x['filterRef']['resultsWithVideo']['http://zdf.de/rels/search/results'], + list) or []) + + entries = [] + entry_urls = set() + for item in items: + t = try_get(item, lambda x: x['http://zdf.de/rels/target'], dict) + if not t: + continue 
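The _QUALITIES tuple in ZDFIE above feeds the qualities() helper from youtube_dl.utils, which ranks quality labels by their position in the tuple so that _extract_format can sort HTTP formats. A minimal standalone re-implementation for illustration (make_quality_ranker is a hypothetical name; use the real helper in actual extractor code):

    def make_quality_ranker(order):
        # Sketch of youtube_dl.utils.qualities: known labels rank by
        # tuple position, unknown labels sort below all known ones.
        def rank(label):
            try:
                return order.index(label)
            except ValueError:
                return -1
        return rank

    rank = make_quality_ranker(('auto', 'low', 'med', 'high', 'veryhigh'))
    assert rank('veryhigh') > rank('med') > rank('auto')
    assert rank('bogus') == -1
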
+ sharing_url = t.get('http://zdf.de/rels/sharing-url') + if not sharing_url or not isinstance(sharing_url, compat_str): + continue + if sharing_url in entry_urls: + continue + entry_urls.add(sharing_url) + entries.append(self.url_result( + sharing_url, ie=ZDFIE.ie_key(), video_id=t.get('id'))) + + return self.playlist_result(entries, channel_id, channel.get('title')) + """ diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/zingmp3.py youtube-dl-2019.05.20/youtube_dl/extractor/zingmp3.py --- youtube-dl-2016.02.22/youtube_dl/extractor/zingmp3.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/zingmp3.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,16 +1,20 @@ -# coding=utf-8 +# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import ( + ExtractorError, + int_or_none, + update_url_query, +) class ZingMp3BaseInfoExtractor(InfoExtractor): - def _extract_item(self, item, fatal=True): - error_message = item.find('./errormessage').text + def _extract_item(self, item, page_type, fatal=True): + error_message = item.get('msg') if error_message: if not fatal: return @@ -18,25 +22,48 @@ '%s returned error: %s' % (self.IE_NAME, error_message), expected=True) - title = item.find('./title').text.strip() - source = item.find('./source').text - extension = item.attrib['type'] - thumbnail = item.find('./backimage').text + formats = [] + for quality, source_url in zip(item.get('qualities') or item.get('quality', []), item.get('source_list') or item.get('source', [])): + if not source_url or source_url == 'require vip': + continue + if not re.match(r'https?://', source_url): + source_url = '//' + source_url + source_url = self._proto_relative_url(source_url, 'http:') + quality_num = int_or_none(quality) + f = { + 'format_id': quality, + 'url': source_url, + } + if page_type == 'video': + f.update({ + 'height': quality_num, + 'ext': 'mp4', + }) + else: + f.update({ + 'abr': quality_num, + 'ext': 'mp3', + }) + formats.append(f) + + cover = item.get('cover') return { - 'title': title, - 'url': source, - 'ext': extension, - 'thumbnail': thumbnail, + 'title': (item.get('name') or item.get('title')).strip(), + 'formats': formats, + 'thumbnail': 'http:/' + cover if cover else None, + 'artist': item.get('artist'), } - def _extract_player_xml(self, player_xml_url, id, playlist_title=None): - player_xml = self._download_xml(player_xml_url, id, 'Downloading Player XML') - items = player_xml.findall('./item') + def _extract_player_json(self, player_json_url, id, page_type, playlist_title=None): + player_json = self._download_json(player_json_url, id, 'Downloading Player JSON') + items = player_json['data'] + if 'item' in items: + items = items['item'] if len(items) == 1: # one single song - data = self._extract_item(items[0]) + data = self._extract_item(items[0], page_type) data['id'] = id return data @@ -45,7 +72,7 @@ entries = [] for i, item in enumerate(items, 1): - entry = self._extract_item(item, fatal=False) + entry = self._extract_item(item, page_type, fatal=False) if not entry: continue entry['id'] = '%s-%d' % (id, i) @@ -59,8 +86,8 @@ } -class ZingMp3SongIE(ZingMp3BaseInfoExtractor): - _VALID_URL = r'https?://mp3\.zing\.vn/bai-hat/(?P<slug>[^/]+)/(?P<song_id>\w+)\.html' +class ZingMp3IE(ZingMp3BaseInfoExtractor): + _VALID_URL = r'https?://mp3\.zing\.vn/(?:bai-hat|album|playlist|video-clip)/[^/]+/(?P<id>\w+)\.html' _TESTS = [{ 'url': 
'http://mp3.zing.vn/bai-hat/Xa-Mai-Xa-Bao-Thy/ZWZB9WAB.html', 'md5': 'ead7ae13693b3205cbc89536a077daed', @@ -68,53 +95,49 @@ 'id': 'ZWZB9WAB', 'title': 'Xa Mãi Xa', 'ext': 'mp3', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, - }] - IE_NAME = 'zingmp3:song' - IE_DESC = 'mp3.zing.vn songs' - - def _real_extract(self, url): - matched = re.match(self._VALID_URL, url) - slug = matched.group('slug') - song_id = matched.group('song_id') - - webpage = self._download_webpage( - 'http://mp3.zing.vn/bai-hat/%s/%s.html' % (slug, song_id), song_id) - - player_xml_url = self._search_regex( - r'&xmlURL=(?P<xml_url>[^&]+)&', webpage, 'player xml url') - - return self._extract_player_xml(player_xml_url, song_id) - - -class ZingMp3AlbumIE(ZingMp3BaseInfoExtractor): - _VALID_URL = r'https?://mp3\.zing\.vn/(?:album|playlist)/(?P<slug>[^/]+)/(?P<album_id>\w+)\.html' - _TESTS = [{ + }, { + 'url': 'http://mp3.zing.vn/video-clip/Let-It-Go-Frozen-OST-Sungha-Jung/ZW6BAEA0.html', + 'md5': '870295a9cd8045c0e15663565902618d', + 'info_dict': { + 'id': 'ZW6BAEA0', + 'title': 'Let It Go (Frozen OST)', + 'ext': 'mp4', + }, + }, { 'url': 'http://mp3.zing.vn/album/Lau-Dai-Tinh-Ai-Bang-Kieu-Minh-Tuyet/ZWZBWDAF.html', 'info_dict': { '_type': 'playlist', 'id': 'ZWZBWDAF', - 'title': 'Lâu Đài Tình Ái - Bằng Kiều ft. Minh Tuyết | Album 320 lossless', + 'title': 'Lâu Đài Tình Ái - Bằng Kiều,Minh Tuyết | Album 320 lossless', }, 'playlist_count': 10, + 'skip': 'removed at the request of the owner', }, { 'url': 'http://mp3.zing.vn/playlist/Duong-Hong-Loan-apollobee/IWCAACCB.html', 'only_matching': True, }] - IE_NAME = 'zingmp3:album' - IE_DESC = 'mp3.zing.vn albums' + IE_NAME = 'zingmp3' + IE_DESC = 'mp3.zing.vn' def _real_extract(self, url): - matched = re.match(self._VALID_URL, url) - slug = matched.group('slug') - album_id = matched.group('album_id') - - webpage = self._download_webpage( - 'http://mp3.zing.vn/album/%s/%s.html' % (slug, album_id), album_id) - player_xml_url = self._search_regex( - r'&xmlURL=(?P<xml_url>[^&]+)&', webpage, 'player xml url') - - return self._extract_player_xml( - player_xml_url, album_id, - playlist_title=self._og_search_title(webpage)) + page_id = self._match_id(url) + + webpage = self._download_webpage(url, page_id) + + player_json_url = self._search_regex([ + r'data-xml="([^"]+)', + r'&xmlURL=([^&]+)&' + ], webpage, 'player xml url') + + playlist_title = None + page_type = self._search_regex(r'/(?:html5)?xml/([^/-]+)', player_json_url, 'page type') + if page_type == 'video': + player_json_url = update_url_query(player_json_url, {'format': 'json'}) + else: + player_json_url = player_json_url.replace('/xml/', '/html5xml/') + if page_type == 'album': + playlist_title = self._og_search_title(webpage) + + return self._extract_player_json(player_json_url, page_id, page_type, playlist_title) diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/zippcast.py youtube-dl-2019.05.20/youtube_dl/extractor/zippcast.py --- youtube-dl-2016.02.22/youtube_dl/extractor/zippcast.py 2016-01-16 13:33:37.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/zippcast.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,94 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - determine_ext, - str_to_int, -) - - -class ZippCastIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?zippcast\.com/(?:video/|videoview\.php\?.*\bvplay=)(?P<id>[0-9a-zA-Z]+)' - _TESTS = [{ - # m3u8, hq direct link - 'url': 
'http://www.zippcast.com/video/c9cfd5c7e44dbc29c81', - 'md5': '5ea0263b5606866c4d6cda0fc5e8c6b6', - 'info_dict': { - 'id': 'c9cfd5c7e44dbc29c81', - 'ext': 'mp4', - 'title': '[Vinesauce] Vinny - Digital Space Traveler', - 'description': 'Muted on youtube, but now uploaded in it\'s original form.', - 'thumbnail': 're:^https?://.*\.jpg$', - 'uploader': 'vinesauce', - 'view_count': int, - 'categories': ['Entertainment'], - 'tags': list, - }, - }, { - # f4m, lq ipod direct link - 'url': 'http://www.zippcast.com/video/b79c0a233e9c6581775', - 'only_matching': True, - }, { - 'url': 'http://www.zippcast.com/videoview.php?vplay=c9cfd5c7e44dbc29c81&auto=no', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage( - 'http://www.zippcast.com/video/%s' % video_id, video_id) - - formats = [] - video_url = self._search_regex( - r'<source[^>]+src=(["\'])(?P<url>.+?)\1', webpage, - 'video url', default=None, group='url') - if video_url: - formats.append({ - 'url': video_url, - 'format_id': 'http', - 'preference': 0, # direct link is almost always of worse quality - }) - src_url = self._search_regex( - r'src\s*:\s*(?:escape\()?(["\'])(?P<url>http://.+?)\1', - webpage, 'src', default=None, group='url') - ext = determine_ext(src_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - src_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - src_url, video_id, f4m_id='hds', fatal=False)) - self._sort_formats(formats) - - title = self._og_search_title(webpage) - description = self._og_search_description(webpage) or self._html_search_meta( - 'description', webpage) - uploader = self._search_regex( - r'<a[^>]+href="https?://[^/]+/profile/[^>]+>([^<]+)</a>', - webpage, 'uploader', fatal=False) - thumbnail = self._og_search_thumbnail(webpage) - view_count = str_to_int(self._search_regex( - r'>([\d,.]+) views!', webpage, 'view count', fatal=False)) - - categories = re.findall( - r'<a[^>]+href="https?://[^/]+/categories/[^"]+">([^<]+),?<', - webpage) - tags = re.findall( - r'<a[^>]+href="https?://[^/]+/search/tags/[^"]+">([^<]+),?<', - webpage) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'view_count': view_count, - 'categories': categories, - 'tags': tags, - 'formats': formats, - } diff -Nru youtube-dl-2016.02.22/youtube_dl/extractor/zype.py youtube-dl-2019.05.20/youtube_dl/extractor/zype.py --- youtube-dl-2016.02.22/youtube_dl/extractor/zype.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/extractor/zype.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,57 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class ZypeIE(InfoExtractor): + _VALID_URL = r'https?://player\.zype\.com/embed/(?P<id>[\da-fA-F]+)\.js\?.*?api_key=[^&]+' + _TEST = { + 'url': 'https://player.zype.com/embed/5b400b834b32992a310622b9.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ&autoplay=false&controls=true&da=false', + 'md5': 'eaee31d474c76a955bdaba02a505c595', + 'info_dict': { + 'id': '5b400b834b32992a310622b9', + 'ext': 'mp4', + 'title': 'Smoky Barbecue Favorites', + 'thumbnail': r're:^https?://.*\.jpe?g', + }, + } + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + 
r'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//player\.zype\.com/embed/[\da-fA-F]+\.js\?.*?api_key=.+?)\1', + webpage)] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = self._search_regex( + r'video_title\s*[:=]\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, + 'title', group='value') + + m3u8_url = self._search_regex( + r'(["\'])(?P<url>(?:(?!\1).)+\.m3u8(?:(?!\1).)*)\1', webpage, + 'm3u8 url', group='url') + + formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + self._sort_formats(formats) + + thumbnail = self._search_regex( + r'poster\s*[:=]\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, 'thumbnail', + default=False, group='url') + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'formats': formats, + } diff -Nru youtube-dl-2016.02.22/youtube_dl/__init__.py youtube-dl-2019.05.20/youtube_dl/__init__.py --- youtube-dl-2016.02.22/youtube_dl/__init__.py 2016-01-31 11:57:01.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/__init__.py 2019-06-07 18:53:00.000000000 +0000 @@ -1,5 +1,5 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals @@ -16,9 +16,7 @@ parseOpts, ) from .compat import ( - compat_expanduser, compat_getpass, - compat_print, compat_shlex_split, workaround_optparse_bug9161, ) @@ -27,6 +25,7 @@ decodeOption, DEFAULT_OUTTMPL, DownloadError, + expand_path, match_filter_func, MaxDownloadsReached, preferredencoding, @@ -35,19 +34,21 @@ setproctitle, std_headers, write_string, + render_table, ) from .update import update_self from .downloader import ( FileDownloader, ) from .extractor import gen_extractors, list_extractors +from .extractor.adobepass import MSO_INFO from .YoutubeDL import YoutubeDL def _real_main(argv=None): # Compatibility fixes for Windows if sys.platform == 'win32': - # https://github.com/rg3/youtube-dl/issues/820 + # https://github.com/ytdl-org/youtube-dl/issues/820 codecs.register(lambda name: codecs.lookup('utf-8') if name == 'cp65001' else None) workaround_optparse_bug9161() @@ -67,16 +68,16 @@ # Custom HTTP headers if opts.headers is not None: for h in opts.headers: - if h.find(':', 1) < 0: + if ':' not in h: parser.error('wrong header formatting, it should be key:value, not "%s"' % h) - key, value = h.split(':', 2) + key, value = h.split(':', 1) if opts.verbose: write_string('[debug] Adding header from command line option %s:%s\n' % (key, value)) std_headers[key] = value # Dump user agent if opts.dump_user_agent: - compat_print(std_headers['User-Agent']) + write_string(std_headers['User-Agent'] + '\n', out=sys.stdout) sys.exit(0) # Batch file verification @@ -86,23 +87,24 @@ if opts.batchfile == '-': batchfd = sys.stdin else: - batchfd = io.open(opts.batchfile, 'r', encoding='utf-8', errors='ignore') + batchfd = io.open( + expand_path(opts.batchfile), + 'r', encoding='utf-8', errors='ignore') batch_urls = read_batch_urls(batchfd) if opts.verbose: write_string('[debug] Batch file urls: ' + repr(batch_urls) + '\n') except IOError: sys.exit('ERROR: batch file could not be read') - all_urls = batch_urls + args - all_urls = [url.strip() for url in all_urls] + all_urls = batch_urls + [url.strip() for url in args] # batch_urls are already striped in read_batch_urls _enc = preferredencoding() all_urls = [url.decode(_enc, 'ignore') if isinstance(url, bytes) else url for url in all_urls] if opts.list_extractors: for ie in 
list_extractors(opts.age_limit): - compat_print(ie.IE_NAME + (' (CURRENTLY BROKEN)' if not ie._WORKING else '')) + write_string(ie.IE_NAME + (' (CURRENTLY BROKEN)' if not ie._WORKING else '') + '\n', out=sys.stdout) matchedUrls = [url for url in all_urls if ie.suitable(url)] for mu in matchedUrls: - compat_print(' ' + mu) + write_string(' ' + mu + '\n', out=sys.stdout) sys.exit(0) if opts.list_extractor_descriptions: for ie in list_extractors(opts.age_limit): @@ -115,7 +117,11 @@ _SEARCHES = ('cute kittens', 'slithering pythons', 'falling cat', 'angry poodle', 'purple fish', 'running tortoise', 'sleeping bunny', 'burping cow') _COUNTS = ('', '5', '10', 'all') desc += ' (Example: "%s%s:%s" )' % (ie.SEARCH_KEY, random.choice(_COUNTS), random.choice(_SEARCHES)) - compat_print(desc) + write_string(desc + '\n', out=sys.stdout) + sys.exit(0) + if opts.ap_list_mso: + table = [[mso_id, mso_info['name']] for mso_id, mso_info in MSO_INFO.items()] + write_string('Supported TV Providers:\n' + render_table(['mso', 'mso name'], table) + '\n', out=sys.stdout) sys.exit(0) # Conflicting, missing and erroneous options @@ -123,12 +129,22 @@ parser.error('using .netrc conflicts with giving username/password') if opts.password is not None and opts.username is None: parser.error('account username missing\n') + if opts.ap_password is not None and opts.ap_username is None: + parser.error('TV Provider account username missing\n') if opts.outtmpl is not None and (opts.usetitle or opts.autonumber or opts.useid): parser.error('using output template conflicts with using title, video ID or auto number') + if opts.autonumber_size is not None: + if opts.autonumber_size <= 0: + parser.error('auto number size must be positive') + if opts.autonumber_start is not None: + if opts.autonumber_start < 0: + parser.error('auto number start must be positive or 0') if opts.usetitle and opts.useid: parser.error('using title conflicts with using video ID') if opts.username is not None and opts.password is None: opts.password = compat_getpass('Type account password and press [Return]: ') + if opts.ap_username is not None and opts.ap_password is None: + opts.ap_password = compat_getpass('Type TV provider account password and press [Return]: ') if opts.ratelimit is not None: numeric_limit = FileDownloader.parse_bytes(opts.ratelimit) if numeric_limit is None: @@ -144,25 +160,50 @@ if numeric_limit is None: parser.error('invalid max_filesize specified') opts.max_filesize = numeric_limit - if opts.retries is not None: - if opts.retries in ('inf', 'infinite'): - opts_retries = float('inf') + if opts.sleep_interval is not None: + if opts.sleep_interval < 0: + parser.error('sleep interval must be positive or 0') + if opts.max_sleep_interval is not None: + if opts.max_sleep_interval < 0: + parser.error('max sleep interval must be positive or 0') + if opts.sleep_interval is None: + parser.error('min sleep interval must be specified, use --min-sleep-interval') + if opts.max_sleep_interval < opts.sleep_interval: + parser.error('max sleep interval must be greater than or equal to min sleep interval') + else: + opts.max_sleep_interval = opts.sleep_interval + if opts.ap_mso and opts.ap_mso not in MSO_INFO: + parser.error('Unsupported TV Provider, use --ap-list-mso to get a list of supported TV Providers') + + def parse_retries(retries): + if retries in ('inf', 'infinite'): + parsed_retries = float('inf') else: try: - opts_retries = int(opts.retries) + parsed_retries = int(retries) except (TypeError, ValueError): parser.error('invalid retry count 
specified') + return parsed_retries + if opts.retries is not None: + opts.retries = parse_retries(opts.retries) + if opts.fragment_retries is not None: + opts.fragment_retries = parse_retries(opts.fragment_retries) if opts.buffersize is not None: numeric_buffersize = FileDownloader.parse_bytes(opts.buffersize) if numeric_buffersize is None: parser.error('invalid buffer size specified') opts.buffersize = numeric_buffersize + if opts.http_chunk_size is not None: + numeric_chunksize = FileDownloader.parse_bytes(opts.http_chunk_size) + if not numeric_chunksize: + parser.error('invalid http chunk size specified') + opts.http_chunk_size = numeric_chunksize if opts.playliststart <= 0: raise ValueError('Playlist start must be positive') if opts.playlistend not in (-1, None) and opts.playlistend < opts.playliststart: raise ValueError('Playlist end must be greater than playlist start') if opts.extractaudio: - if opts.audioformat not in ['best', 'aac', 'mp3', 'm4a', 'opus', 'vorbis', 'wav']: + if opts.audioformat not in ['best', 'aac', 'flac', 'mp3', 'm4a', 'opus', 'vorbis', 'wav']: parser.error('invalid audio format specified') if opts.audioquality: opts.audioquality = opts.audioquality.strip('k').strip('K') @@ -172,7 +213,7 @@ if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg', 'mkv', 'avi']: parser.error('invalid video recode format specified') if opts.convertsubtitles is not None: - if opts.convertsubtitles not in ['srt', 'vtt', 'ass']: + if opts.convertsubtitles not in ['srt', 'vtt', 'ass', 'lrc']: parser.error('invalid subtitle format specified') if opts.date is not None: @@ -189,14 +230,14 @@ if opts.allsubtitles and not opts.writeautomaticsub: opts.writesubtitles = True - outtmpl = ((opts.outtmpl is not None and opts.outtmpl) or - (opts.format == '-1' and opts.usetitle and '%(title)s-%(id)s-%(format)s.%(ext)s') or - (opts.format == '-1' and '%(id)s-%(format)s.%(ext)s') or - (opts.usetitle and opts.autonumber and '%(autonumber)s-%(title)s-%(id)s.%(ext)s') or - (opts.usetitle and '%(title)s-%(id)s.%(ext)s') or - (opts.useid and '%(id)s.%(ext)s') or - (opts.autonumber and '%(autonumber)s-%(id)s.%(ext)s') or - DEFAULT_OUTTMPL) + outtmpl = ((opts.outtmpl is not None and opts.outtmpl) + or (opts.format == '-1' and opts.usetitle and '%(title)s-%(id)s-%(format)s.%(ext)s') + or (opts.format == '-1' and '%(id)s-%(format)s.%(ext)s') + or (opts.usetitle and opts.autonumber and '%(autonumber)s-%(title)s-%(id)s.%(ext)s') + or (opts.usetitle and '%(title)s-%(id)s.%(ext)s') + or (opts.useid and '%(id)s.%(ext)s') + or (opts.autonumber and '%(autonumber)s-%(id)s.%(ext)s') + or DEFAULT_OUTTMPL) if not os.path.splitext(outtmpl)[1] and opts.extractaudio: parser.error('Cannot download a video and extract audio into the same' ' file! 
Use "{0}.%(ext)s" instead of "{0}" as the output' @@ -204,18 +245,15 @@ any_getting = opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson or opts.dump_single_json any_printing = opts.print_json - download_archive_fn = compat_expanduser(opts.download_archive) if opts.download_archive is not None else opts.download_archive + download_archive_fn = expand_path(opts.download_archive) if opts.download_archive is not None else opts.download_archive # PostProcessors postprocessors = [] - # Add the metadata pp first, the other pps will copy it if opts.metafromtitle: postprocessors.append({ 'key': 'MetadataFromTitle', 'titleformat': opts.metafromtitle }) - if opts.addmetadata: - postprocessors.append({'key': 'FFmpegMetadata'}) if opts.extractaudio: postprocessors.append({ 'key': 'FFmpegExtractAudio', @@ -228,6 +266,16 @@ 'key': 'FFmpegVideoConvertor', 'preferedformat': opts.recodevideo, }) + # FFmpegMetadataPP should be run after FFmpegVideoConvertorPP and + # FFmpegExtractAudioPP as containers before conversion may not support + # metadata (3gp, webm, etc.) + # And this post-processor should be placed before other metadata + # manipulating post-processors (FFmpegEmbedSubtitle) to prevent loss of + # extra metadata. By default ffmpeg preserves metadata applicable for both + # source and target containers. From this point the container won't change, + # so metadata can be added here. + if opts.addmetadata: + postprocessors.append({'key': 'FFmpegMetadata'}) if opts.convertsubtitles: postprocessors.append({ 'key': 'FFmpegSubtitlesConvertor', @@ -237,8 +285,6 @@ postprocessors.append({ 'key': 'FFmpegEmbedSubtitle', }) - if opts.xattrs: - postprocessors.append({'key': 'XAttrMetadata'}) if opts.embedthumbnail: already_have_thumbnail = opts.writethumbnail or opts.write_all_thumbnails postprocessors.append({ @@ -247,6 +293,10 @@ }) if not already_have_thumbnail: opts.writethumbnail = True + # XAttrMetadataPP should be run after post-processors that may change file + # contents + if opts.xattrs: + postprocessors.append({'key': 'XAttrMetadata'}) # Please keep ExecAfterDownload towards the bottom as it allows the user to modify the final file in any way. # So if the user is able to remove the file before your postprocessor runs it might cause a few problems. 
if opts.exec_cmd: @@ -254,12 +304,6 @@ 'key': 'ExecAfterDownload', 'exec_cmd': opts.exec_cmd, }) - if opts.xattr_set_filesize: - try: - import xattr - xattr # Confuse flake8 - except ImportError: - parser.error('setting filesize xattr requested but python-xattr is not available') external_downloader_args = None if opts.external_downloader_args: external_downloader_args = compat_shlex_split(opts.external_downloader_args) @@ -276,6 +320,9 @@ 'password': opts.password, 'twofactor': opts.twofactor, 'videopassword': opts.videopassword, + 'ap_mso': opts.ap_mso, + 'ap_username': opts.ap_username, + 'ap_password': opts.ap_password, 'quiet': (opts.quiet or any_getting or any_printing), 'no_warnings': opts.no_warnings, 'forceurl': opts.geturl, @@ -294,20 +341,26 @@ 'listformats': opts.listformats, 'outtmpl': outtmpl, 'autonumber_size': opts.autonumber_size, + 'autonumber_start': opts.autonumber_start, 'restrictfilenames': opts.restrictfilenames, 'ignoreerrors': opts.ignoreerrors, 'force_generic_extractor': opts.force_generic_extractor, 'ratelimit': opts.ratelimit, 'nooverwrites': opts.nooverwrites, - 'retries': opts_retries, + 'retries': opts.retries, + 'fragment_retries': opts.fragment_retries, + 'skip_unavailable_fragments': opts.skip_unavailable_fragments, + 'keep_fragments': opts.keep_fragments, 'buffersize': opts.buffersize, 'noresizebuffer': opts.noresizebuffer, + 'http_chunk_size': opts.http_chunk_size, 'continuedl': opts.continue_dl, 'noprogress': opts.noprogress, 'progress_with_newline': opts.progress_with_newline, 'playliststart': opts.playliststart, 'playlistend': opts.playlistend, 'playlistreverse': opts.playlist_reverse, + 'playlistrandom': opts.playlist_random, 'noplaylist': opts.noplaylist, 'logtostderr': opts.outtmpl == '-', 'consoletitle': opts.consoletitle, @@ -355,12 +408,14 @@ 'youtube_include_dash_manifest': opts.youtube_include_dash_manifest, 'encoding': opts.encoding, 'extract_flat': opts.extract_flat, + 'mark_watched': opts.mark_watched, 'merge_output_format': opts.merge_output_format, 'postprocessors': postprocessors, 'fixup': opts.fixup, 'source_address': opts.source_address, 'call_home': opts.call_home, 'sleep_interval': opts.sleep_interval, + 'max_sleep_interval': opts.max_sleep_interval, 'external_downloader': opts.external_downloader, 'list_thumbnails': opts.list_thumbnails, 'playlist_items': opts.playlist_items, @@ -373,6 +428,14 @@ 'external_downloader_args': external_downloader_args, 'postprocessor_args': postprocessor_args, 'cn_verification_proxy': opts.cn_verification_proxy, + 'geo_verification_proxy': opts.geo_verification_proxy, + 'config_location': opts.config_location, + 'geo_bypass': opts.geo_bypass, + 'geo_bypass_country': opts.geo_bypass_country, + 'geo_bypass_ip_block': opts.geo_bypass_ip_block, + # just for deprecation check + 'autonumber': opts.autonumber if opts.autonumber is True else None, + 'usetitle': opts.usetitle if opts.usetitle is True else None, } with YoutubeDL(ydl_opts) as ydl: @@ -396,7 +459,7 @@ try: if opts.load_info_filename is not None: - retcode = ydl.download_with_info_file(opts.load_info_filename) + retcode = ydl.download_with_info_file(expand_path(opts.load_info_filename)) else: retcode = ydl.download(all_urls) except MaxDownloadsReached: @@ -416,4 +479,5 @@ except KeyboardInterrupt: sys.exit('\nERROR: Interrupted by user') + __all__ = ['main', 'YoutubeDL', 'gen_extractors', 'list_extractors'] diff -Nru youtube-dl-2016.02.22/youtube_dl/jsinterp.py youtube-dl-2019.05.20/youtube_dl/jsinterp.py --- 
youtube-dl-2016.02.22/youtube_dl/jsinterp.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/jsinterp.py 2019-06-07 18:49:07.000000000 +0000 @@ -6,6 +6,7 @@ from .utils import ( ExtractorError, + remove_quotes, ) _OPERATORS = [ @@ -57,7 +58,6 @@ def interpret_expression(self, expr, local_vars, allow_recursion): expr = expr.strip() - if expr == '': # Empty expression return None @@ -121,11 +121,19 @@ pass m = re.match( - r'(?P<var>%s)\.(?P<member>[^(]+)(?:\(+(?P<args>[^()]*)\))?$' % _NAME_RE, + r'(?P<in>%s)\[(?P<idx>.+)\]$' % _NAME_RE, expr) + if m: + val = local_vars[m.group('in')] + idx = self.interpret_expression( + m.group('idx'), local_vars, allow_recursion - 1) + return val[idx] + + m = re.match( + r'(?P<var>%s)(?:\.(?P<member>[^(]+)|\[(?P<member2>[^]]+)\])\s*(?:\(+(?P<args>[^()]*)\))?$' % _NAME_RE, expr) if m: variable = m.group('var') - member = m.group('member') + member = remove_quotes(m.group('member') or m.group('member2')) arg_str = m.group('args') if variable in local_vars: @@ -173,14 +181,6 @@ return obj[member](argvals) - m = re.match( - r'(?P<in>%s)\[(?P<idx>.+)\]$' % _NAME_RE, expr) - if m: - val = local_vars[m.group('in')] - idx = self.interpret_expression( - m.group('idx'), local_vars, allow_recursion - 1) - return val[idx] - for op, opfunc in _OPERATORS: m = re.match(r'(?P<x>.+?)%s(?P<y>.+)' % re.escape(op), expr) if not m: @@ -198,12 +198,12 @@ return opfunc(x, y) m = re.match( - r'^(?P<func>%s)\((?P<args>[a-zA-Z0-9_$,]+)\)$' % _NAME_RE, expr) + r'^(?P<func>%s)\((?P<args>[a-zA-Z0-9_$,]*)\)$' % _NAME_RE, expr) if m: fname = m.group('func') argvals = tuple([ int(v) if v.isdigit() else local_vars[v] - for v in m.group('args').split(',')]) + for v in m.group('args').split(',')]) if len(m.group('args')) > 0 else tuple() if fname not in self._functions: self._functions[fname] = self.extract_function(fname) return self._functions[fname](argvals) @@ -211,28 +211,32 @@ raise ExtractorError('Unsupported JS expression %r' % expr) def extract_object(self, objname): + _FUNC_NAME_RE = r'''(?:[a-zA-Z$0-9]+|"[a-zA-Z$0-9]+"|'[a-zA-Z$0-9]+')''' obj = {} obj_m = re.search( - (r'(?:var\s+)?%s\s*=\s*\{' % re.escape(objname)) + - r'\s*(?P<fields>([a-zA-Z$0-9]+\s*:\s*function\(.*?\)\s*\{.*?\}(?:,\s*)?)*)' + - r'\}\s*;', + r'''(?x) + (?<!this\.)%s\s*=\s*{\s* + (?P<fields>(%s\s*:\s*function\s*\(.*?\)\s*{.*?}(?:,\s*)?)*) + }\s*; + ''' % (re.escape(objname), _FUNC_NAME_RE), self.code) fields = obj_m.group('fields') # Currently, it only supports function definitions fields_m = re.finditer( - r'(?P<key>[a-zA-Z$0-9]+)\s*:\s*function' - r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}', + r'''(?x) + (?P<key>%s)\s*:\s*function\s*\((?P<args>[a-z,]+)\){(?P<code>[^}]+)} + ''' % _FUNC_NAME_RE, fields) for f in fields_m: argnames = f.group('args').split(',') - obj[f.group('key')] = self.build_function(argnames, f.group('code')) + obj[remove_quotes(f.group('key'))] = self.build_function(argnames, f.group('code')) return obj def extract_function(self, funcname): func_m = re.search( r'''(?x) - (?:function\s+%s|[{;,]%s\s*=\s*function|var\s+%s\s*=\s*function)\s* + (?:function\s+%s|[{;,]\s*%s\s*=\s*function|var\s+%s\s*=\s*function)\s* \((?P<args>[^)]*)\)\s* \{(?P<code>[^}]+)\}''' % ( re.escape(funcname), re.escape(funcname), re.escape(funcname)), diff -Nru youtube-dl-2016.02.22/youtube_dl/options.py youtube-dl-2019.05.20/youtube_dl/options.py --- youtube-dl-2016.02.22/youtube_dl/options.py 2016-02-14 22:26:17.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/options.py 2019-06-07 
18:53:00.000000000 +0000 @@ -2,6 +2,7 @@ import os.path import optparse +import re import sys from .downloader.external import list_external_downloaders @@ -19,6 +20,24 @@ from .version import __version__ +def _hide_login_info(opts): + PRIVATE_OPTS = set(['-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username']) + eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$') + + def _scrub_eq(o): + m = eqre.match(o) + if m: + return m.group('key') + '=PRIVATE' + else: + return o + + opts = list(map(_scrub_eq, opts)) + for idx, opt in enumerate(opts): + if opt in PRIVATE_OPTS and idx + 1 < len(opts): + opts[idx + 1] = 'PRIVATE' + return opts + + def parseOpts(overrideArguments=None): def _readOptions(filename_bytes, default=[]): try: @@ -26,9 +45,11 @@ except IOError: return default # silently skip if file is not present try: - res = [] - for l in optionf: - res += compat_shlex_split(l, comments=True) + # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56 + contents = optionf.read() + if sys.version_info < (3,): + contents = contents.decode(preferredencoding()) + res = compat_shlex_split(contents, comments=True) finally: optionf.close() return res @@ -90,16 +111,6 @@ def _comma_separated_values_options_callback(option, opt_str, value, parser): setattr(parser.values, option.dest, value.split(',')) - def _hide_login_info(opts): - opts = list(opts) - for private_opt in ['-p', '--password', '-u', '--username', '--video-password']: - try: - i = opts.index(private_opt) - opts[i + 1] = 'PRIVATE' - except ValueError: - pass - return opts - # No need to wrap help messages if we're on a wide console columns = compat_get_terminal_size().columns max_width = columns if columns else 80 @@ -166,11 +177,23 @@ 'Do not read the user configuration in ~/.config/youtube-dl/config ' '(%APPDATA%/youtube-dl/config.txt on Windows)') general.add_option( + '--config-location', + dest='config_location', metavar='PATH', + help='Location of the configuration file; either the path to the config or its containing directory.') + general.add_option( '--flat-playlist', action='store_const', dest='extract_flat', const='in_playlist', default=False, help='Do not extract the videos of a playlist, only list them.') general.add_option( + '--mark-watched', + action='store_true', dest='mark_watched', default=False, + help='Mark videos watched (YouTube only)') + general.add_option( + '--no-mark-watched', + action='store_false', dest='mark_watched', default=False, + help='Do not mark videos watched (YouTube only)') + general.add_option( '--no-color', '--no-colors', action='store_true', dest='no_color', default=False, @@ -180,7 +203,10 @@ network.add_option( '--proxy', dest='proxy', default=None, metavar='URL', - help='Use the specified HTTP/HTTPS proxy. Pass in an empty string (--proxy "") for direct connection') + help='Use the specified HTTP/HTTPS/SOCKS proxy. To enable ' + 'SOCKS proxy, specify a proper scheme. For example ' + 'socks5://127.0.0.1:1080/. 
Pass in an empty string (--proxy "") ' + 'for direct connection') network.add_option( '--socket-timeout', dest='socket_timeout', type=float, default=None, metavar='SECONDS', @@ -188,24 +214,45 @@ network.add_option( '--source-address', metavar='IP', dest='source_address', default=None, - help='Client-side IP address to bind to (experimental)', + help='Client-side IP address to bind to', ) network.add_option( '-4', '--force-ipv4', action='store_const', const='0.0.0.0', dest='source_address', - help='Make all connections via IPv4 (experimental)', + help='Make all connections via IPv4', ) network.add_option( '-6', '--force-ipv6', action='store_const', const='::', dest='source_address', - help='Make all connections via IPv6 (experimental)', + help='Make all connections via IPv6', ) - network.add_option( + + geo = optparse.OptionGroup(parser, 'Geo Restriction') + geo.add_option( + '--geo-verification-proxy', + dest='geo_verification_proxy', default=None, metavar='URL', + help='Use this proxy to verify the IP address for some geo-restricted sites. ' + 'The default proxy specified by --proxy (or none, if the option is not present) is used for the actual downloading.') + geo.add_option( '--cn-verification-proxy', dest='cn_verification_proxy', default=None, metavar='URL', - help='Use this proxy to verify the IP address for some Chinese sites. ' - 'The default proxy specified by --proxy (or none, if the options is not present) is used for the actual downloading. (experimental)' - ) + help=optparse.SUPPRESS_HELP) + geo.add_option( + '--geo-bypass', + action='store_true', dest='geo_bypass', default=True, + help='Bypass geographic restriction via faking X-Forwarded-For HTTP header') + geo.add_option( + '--no-geo-bypass', + action='store_false', dest='geo_bypass', default=True, + help='Do not bypass geographic restriction via faking X-Forwarded-For HTTP header') + geo.add_option( + '--geo-bypass-country', metavar='CODE', + dest='geo_bypass_country', default=None, + help='Force bypass geographic restriction with explicitly provided two-letter ISO 3166-2 country code') + geo.add_option( + '--geo-bypass-ip-block', metavar='IP_BLOCK', + dest='geo_bypass_ip_block', default=None, + help='Force bypass geographic restriction with explicitly provided IP block in CIDR notation') selection = optparse.OptionGroup(parser, 'Video Selection') selection.add_option( @@ -264,15 +311,17 @@ '--match-filter', metavar='FILTER', dest='match_filter', default=None, help=( - 'Generic video filter (experimental). ' - 'Specify any key (see help for -o for a list of available keys) to' - ' match if the key is present, ' - '!key to check if the key is not present,' + 'Generic video filter. ' + 'Specify any key (see the "OUTPUT TEMPLATE" for a list of available keys) to ' + 'match if the key is present, ' + '!key to check if the key is not present, ' 'key > NUMBER (like "comment_count > 12", also works with ' - '>=, <, <=, !=, =) to compare against a number, and ' - '& to require multiple matches. ' - 'Values which are not known are excluded unless you' - ' put a question mark (?) after the operator.' + '>=, <, <=, !=, =) to compare against a number, ' + 'key = \'LITERAL\' (like "uploader = \'Mike Smith\'", also works with !=) ' + 'to match against a string literal ' + 'and & to require multiple matches. ' + 'Values which are not known are excluded unless you ' + 'put a question mark (?) after the operator. 
' 'For example, to only match videos that have been liked more than ' '100 times and disliked less than 50 times (or the dislike ' 'functionality is not available at the given service), but who ' @@ -312,7 +361,7 @@ authentication.add_option( '-2', '--twofactor', dest='twofactor', metavar='TWOFACTOR', - help='Two-factor auth code') + help='Two-factor authentication code') authentication.add_option( '-n', '--netrc', action='store_true', dest='usenetrc', default=False, @@ -322,6 +371,24 @@ dest='videopassword', metavar='PASSWORD', help='Video password (vimeo, smotri, youku)') + adobe_pass = optparse.OptionGroup(parser, 'Adobe Pass Options') + adobe_pass.add_option( + '--ap-mso', + dest='ap_mso', metavar='MSO', + help='Adobe Pass multiple-system operator (TV provider) identifier, use --ap-list-mso for a list of available MSOs') + adobe_pass.add_option( + '--ap-username', + dest='ap_username', metavar='USERNAME', + help='Multiple-system operator account login') + adobe_pass.add_option( + '--ap-password', + dest='ap_password', metavar='PASSWORD', + help='Multiple-system operator account password. If this option is left out, youtube-dl will ask interactively.') + adobe_pass.add_option( + '--ap-list-mso', + action='store_true', dest='ap_list_mso', default=False, + help='List all supported multiple-system operators') + video_format = optparse.OptionGroup(parser, 'Video Format Options') video_format.add_option( '-f', '--format', @@ -384,14 +451,30 @@ downloader = optparse.OptionGroup(parser, 'Download Options') downloader.add_option( - '-r', '--rate-limit', - dest='ratelimit', metavar='LIMIT', + '-r', '--limit-rate', '--rate-limit', + dest='ratelimit', metavar='RATE', help='Maximum download rate in bytes per second (e.g. 50K or 4.2M)') downloader.add_option( '-R', '--retries', dest='retries', metavar='RETRIES', default=10, help='Number of retries (default is %default), or "infinite".') downloader.add_option( + '--fragment-retries', + dest='fragment_retries', metavar='RETRIES', default=10, + help='Number of retries for a fragment (default is %default), or "infinite" (DASH, hlsnative and ISM)') + downloader.add_option( + '--skip-unavailable-fragments', + action='store_true', dest='skip_unavailable_fragments', default=True, + help='Skip unavailable fragments (DASH, hlsnative and ISM)') + downloader.add_option( + '--abort-on-unavailable-fragment', + action='store_false', dest='skip_unavailable_fragments', + help='Abort downloading when some fragment is not available') + downloader.add_option( + '--keep-fragments', + action='store_true', dest='keep_fragments', default=False, + help='Keep downloaded fragments on disk after downloading is finished; fragments are erased by default') + downloader.add_option( '--buffer-size', dest='buffersize', metavar='SIZE', default='1024', help='Size of download buffer (e.g. 1024 or 16K) (default is %default)') @@ -400,6 +483,11 @@ action='store_true', dest='noresizebuffer', default=False, help='Do not automatically adjust the buffer size. By default, the buffer size is automatically resized from an initial value of SIZE.') downloader.add_option( + '--http-chunk-size', + dest='http_chunk_size', metavar='SIZE', default=None, + help='Size of a chunk for chunk-based HTTP downloading (e.g. 10485760 or 10M) (default is disabled). 
' + 'May be useful for bypassing bandwidth throttling imposed by a webserver (experimental)') + downloader.add_option( '--test', action='store_true', dest='test', default=False, help=optparse.SUPPRESS_HELP) @@ -408,13 +496,21 @@ action='store_true', help='Download playlist videos in reverse order') downloader.add_option( + '--playlist-random', + action='store_true', + help='Download playlist videos in random order') + downloader.add_option( '--xattr-set-filesize', dest='xattr_set_filesize', action='store_true', - help='Set file xattribute ytdl.filesize with expected filesize (experimental)') + help='Set file xattribute ytdl.filesize with expected file size') downloader.add_option( '--hls-prefer-native', - dest='hls_prefer_native', action='store_true', - help='Use the native HLS downloader instead of ffmpeg (experimental)') + dest='hls_prefer_native', action='store_true', default=None, + help='Use the native HLS downloader instead of ffmpeg') + downloader.add_option( + '--hls-prefer-ffmpeg', + dest='hls_prefer_native', action='store_false', default=None, + help='Use ffmpeg instead of the native HLS downloader') downloader.add_option( '--hls-use-mpegts', dest='hls_use_mpegts', action='store_true', @@ -462,9 +558,20 @@ dest='bidi_workaround', action='store_true', help='Work around terminals that lack bidirectional text support. Requires bidiv or fribidi executable in PATH') workarounds.add_option( - '--sleep-interval', metavar='SECONDS', + '--sleep-interval', '--min-sleep-interval', metavar='SECONDS', dest='sleep_interval', type=float, - help='Number of seconds to sleep before each download.') + help=( + 'Number of seconds to sleep before each download when used alone ' + 'or a lower bound of a range for randomized sleep before each download ' + '(minimum possible number of seconds to sleep) when used along with ' + '--max-sleep-interval.')) + workarounds.add_option( + '--max-sleep-interval', metavar='SECONDS', + dest='max_sleep_interval', type=float, + help=( + 'Upper bound of a range for randomized sleep before each download ' + '(maximum possible number of seconds to sleep). Must only be used ' + 'along with --min-sleep-interval.')) verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options') verbosity.add_option( @@ -518,7 +625,7 @@ verbosity.add_option( '-j', '--dump-json', action='store_true', dest='dumpjson', default=False, - help='Simulate, quiet but print JSON information. See --output for a description of available keys.') + help='Simulate, quiet but print JSON information. See the "OUTPUT TEMPLATE" for a description of available keys.') verbosity.add_option( '-J', '--dump-single-json', action='store_true', dest='dump_single_json', default=False, @@ -573,33 +680,23 @@ filesystem.add_option( '-a', '--batch-file', dest='batchfile', metavar='FILE', - help='File containing URLs to download (\'-\' for stdin)') + help="File containing URLs to download ('-' for stdin), one URL per line. " + "Lines starting with '#', ';' or ']' are considered as comments and ignored.") filesystem.add_option( '--id', default=False, action='store_true', dest='useid', help='Use only video ID in file name') filesystem.add_option( '-o', '--output', dest='outtmpl', metavar='TEMPLATE', - help=('Output filename template. 
Use %(title)s to get the title, ' - '%(uploader)s for the uploader name, %(uploader_id)s for the uploader nickname if different, ' - '%(autonumber)s to get an automatically incremented number, ' - '%(ext)s for the filename extension, ' - '%(format)s for the format description (like "22 - 1280x720" or "HD"), ' - '%(format_id)s for the unique id of the format (like YouTube\'s itags: "137"), ' - '%(upload_date)s for the upload date (YYYYMMDD), ' - '%(extractor)s for the provider (youtube, metacafe, etc), ' - '%(id)s for the video id, ' - '%(playlist_title)s, %(playlist_id)s, or %(playlist)s (=title if present, ID otherwise) for the playlist the video is in, ' - '%(playlist_index)s for the position in the playlist. ' - '%(height)s and %(width)s for the width and height of the video format. ' - '%(resolution)s for a textual description of the resolution of the video format. ' - '%% for a literal percent. ' - 'Use - to output to stdout. Can also be used to download to a different directory, ' - 'for example with -o \'/my/downloads/%(uploader)s/%(title)s-%(id)s.%(ext)s\' .')) + help=('Output filename template, see the "OUTPUT TEMPLATE" for all the info')) filesystem.add_option( '--autonumber-size', - dest='autonumber_size', metavar='NUMBER', - help='Specify the number of digits in %(autonumber)s when it is present in output filename template or --auto-number option is given') + dest='autonumber_size', metavar='NUMBER', type=int, + help=optparse.SUPPRESS_HELP) + filesystem.add_option( + '--autonumber-start', + dest='autonumber_start', metavar='NUMBER', default=1, type=int, + help='Specify the start value for %(autonumber)s (default is %default)') filesystem.add_option( '--restrict-filenames', action='store_true', dest='restrictfilenames', default=False, @@ -607,15 +704,15 @@ filesystem.add_option( '-A', '--auto-number', action='store_true', dest='autonumber', default=False, - help='[deprecated; use -o "%(autonumber)s-%(title)s.%(ext)s" ] Number downloaded files starting from 00000') + help=optparse.SUPPRESS_HELP) filesystem.add_option( '-t', '--title', action='store_true', dest='usetitle', default=False, - help='[deprecated] Use title in file name (default)') + help=optparse.SUPPRESS_HELP) filesystem.add_option( '-l', '--literal', default=False, action='store_true', dest='usetitle', - help='[deprecated] Alias of --title') + help=optparse.SUPPRESS_HELP) filesystem.add_option( '-w', '--no-overwrites', action='store_true', dest='nooverwrites', default=False, @@ -649,7 +746,7 @@ action='store_true', dest='writeannotations', default=False, help='Write video annotations to a .annotations.xml file') filesystem.add_option( - '--load-info', + '--load-info-json', '--load-info', dest='load_info_filename', metavar='FILE', help='JSON file containing the video information (created with the "--write-info-json" option)') filesystem.add_option( @@ -688,7 +785,7 @@ help='Convert video files to audio-only files (requires ffmpeg or avconv and ffprobe or avprobe)') postproc.add_option( '--audio-format', metavar='FORMAT', dest='audioformat', default='best', - help='Specify audio format: "best", "aac", "vorbis", "mp3", "m4a", "opus", or "wav"; "%default" by default') + help='Specify audio format: "best", "aac", "flac", "mp3", "m4a", "opus", "vorbis", or "wav"; "%default" by default; No effect without -x') postproc.add_option( '--audio-quality', metavar='QUALITY', dest='audioquality', default='5', @@ -712,7 +809,7 @@ postproc.add_option( '--embed-subs', action='store_true', dest='embedsubtitles', default=False, - 
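
The help text removed above now lives in the README's "OUTPUT TEMPLATE" section; the expansion itself is ordinary Python %-interpolation over the info dict, roughly as follows (field values are illustrative):

    info = {'uploader': 'someone', 'title': 'Example', 'id': 'abc123', 'ext': 'mp4'}
    print('%(uploader)s/%(title)s-%(id)s.%(ext)s' % info)
    # -> someone/Example-abc123.mp4
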
help='Embed subtitles in the video (only for mkv and mp4 videos)') + help='Embed subtitles in the video (only for mp4, webm and mkv videos)') postproc.add_option( '--embed-thumbnail', action='store_true', dest='embedthumbnail', default=False, @@ -725,11 +822,12 @@ '--metadata-from-title', metavar='FORMAT', dest='metafromtitle', help='Parse additional metadata like song title / artist from the video title. ' - 'The format syntax is the same as --output, ' - 'the parsed parameters replace existing values. ' - 'Additional templates: %(album)s, %(artist)s. ' + 'The format syntax is the same as --output. Regular expression with ' + 'named capture groups may also be used. ' + 'The parsed parameters replace existing values. ' 'Example: --metadata-from-title "%(artist)s - %(title)s" matches a title like ' - '"Coldplay - Paradise"') + '"Coldplay - Paradise". ' + 'Example (regex): --metadata-from-title "(?P<artist>.+?) - (?P<title>.+)"') postproc.add_option( '--xattrs', action='store_true', dest='xattrs', default=False, @@ -743,11 +841,11 @@ postproc.add_option( '--prefer-avconv', action='store_false', dest='prefer_ffmpeg', - help='Prefer avconv over ffmpeg for running the postprocessors (default)') + help='Prefer avconv over ffmpeg for running the postprocessors') postproc.add_option( '--prefer-ffmpeg', action='store_true', dest='prefer_ffmpeg', - help='Prefer ffmpeg over avconv for running the postprocessors') + help='Prefer ffmpeg over avconv for running the postprocessors (default)') postproc.add_option( '--ffmpeg-location', '--avconv-location', metavar='PATH', dest='ffmpeg_location', @@ -759,10 +857,11 @@ postproc.add_option( '--convert-subs', '--convert-subtitles', metavar='FORMAT', dest='convertsubtitles', default=None, - help='Convert the subtitles to other format (currently supported: srt|ass|vtt)') + help='Convert the subtitles to other format (currently supported: srt|ass|vtt|lrc)') parser.add_option_group(general) parser.add_option_group(network) + parser.add_option_group(geo) parser.add_option_group(selection) parser.add_option_group(downloader) parser.add_option_group(filesystem) @@ -772,6 +871,7 @@ parser.add_option_group(video_format) parser.add_option_group(subtitles) parser.add_option_group(authentication) + parser.add_option_group(adobe_pass) parser.add_option_group(postproc) if overrideArguments is not None: @@ -785,22 +885,32 @@ return conf command_line_conf = compat_conf(sys.argv[1:]) + opts, args = parser.parse_args(command_line_conf) + + system_conf = user_conf = custom_conf = [] - if '--ignore-config' in command_line_conf: - system_conf = [] - user_conf = [] + if '--config-location' in command_line_conf: + location = compat_expanduser(opts.config_location) + if os.path.isdir(location): + location = os.path.join(location, 'youtube-dl.conf') + if not os.path.exists(location): + parser.error('config-location %s does not exist.' 
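
Both forms of --metadata-from-title boil down to matching the video title against a regular expression with named capture groups, e.g.:

    import re

    match = re.match(r'(?P<artist>.+?) - (?P<title>.+)', 'Coldplay - Paradise')
    print(match.groupdict())  # {'artist': 'Coldplay', 'title': 'Paradise'}
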
% location) + custom_conf = _readOptions(location) + elif '--ignore-config' in command_line_conf: + pass else: - system_conf = compat_conf(_readOptions('/etc/youtube-dl.conf')) - if '--ignore-config' in system_conf: - user_conf = [] - else: - user_conf = compat_conf(_readUserConf()) - argv = system_conf + user_conf + command_line_conf + system_conf = _readOptions('/etc/youtube-dl.conf') + if '--ignore-config' not in system_conf: + user_conf = _readUserConf() + argv = system_conf + user_conf + custom_conf + command_line_conf opts, args = parser.parse_args(argv) if opts.verbose: - write_string('[debug] System config: ' + repr(_hide_login_info(system_conf)) + '\n') - write_string('[debug] User config: ' + repr(_hide_login_info(user_conf)) + '\n') - write_string('[debug] Command-line args: ' + repr(_hide_login_info(command_line_conf)) + '\n') + for conf_label, conf in ( + ('System config', system_conf), + ('User config', user_conf), + ('Custom config', custom_conf), + ('Command-line args', command_line_conf)): + write_string('[debug] %s: %s\n' % (conf_label, repr(_hide_login_info(conf)))) return parser, opts, args diff -Nru youtube-dl-2016.02.22/youtube_dl/postprocessor/embedthumbnail.py youtube-dl-2019.05.20/youtube_dl/postprocessor/embedthumbnail.py --- youtube-dl-2016.02.22/youtube_dl/postprocessor/embedthumbnail.py 2016-02-22 10:57:19.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/postprocessor/embedthumbnail.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals @@ -31,7 +31,8 @@ temp_filename = prepend_extension(filename, 'temp') if not info.get('thumbnails'): - raise EmbedThumbnailPPError('Thumbnail was not found. Nothing to do.') + self._downloader.to_screen('[embedthumbnail] There aren\'t any thumbnails to embed') + return [], info thumbnail_filename = info['thumbnails'][-1]['filename'] @@ -40,7 +41,7 @@ 'Skipping embedding the thumbnail because the file is missing.') return [], info - if info['ext'] in ('mp3', 'mkv'): + if info['ext'] == 'mp3': options = [ '-c', 'copy', '-map', '0', '-map', '1', '-metadata:s:v', 'title="Album cover"', '-metadata:s:v', 'comment="Cover (Front)"'] diff -Nru youtube-dl-2016.02.22/youtube_dl/postprocessor/execafterdownload.py youtube-dl-2019.05.20/youtube_dl/postprocessor/execafterdownload.py --- youtube-dl-2016.02.22/youtube_dl/postprocessor/execafterdownload.py 2016-02-14 22:26:17.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/postprocessor/execafterdownload.py 2019-06-07 18:49:07.000000000 +0000 @@ -3,8 +3,11 @@ import subprocess from .common import PostProcessor -from ..compat import shlex_quote -from ..utils import PostProcessingError +from ..compat import compat_shlex_quote +from ..utils import ( + encodeArgument, + PostProcessingError, +) class ExecAfterDownloadPP(PostProcessor): @@ -17,10 +20,10 @@ if '{}' not in cmd: cmd += ' {}' - cmd = cmd.replace('{}', shlex_quote(information['filepath'])) + cmd = cmd.replace('{}', compat_shlex_quote(information['filepath'])) self._downloader.to_screen('[exec] Executing command: %s' % cmd) - retCode = subprocess.call(cmd, shell=True) + retCode = subprocess.call(encodeArgument(cmd), shell=True) if retCode != 0: raise PostProcessingError( 'Command returned error code %d' % retCode) diff -Nru youtube-dl-2016.02.22/youtube_dl/postprocessor/ffmpeg.py youtube-dl-2019.05.20/youtube_dl/postprocessor/ffmpeg.py --- youtube-dl-2016.02.22/youtube_dl/postprocessor/ffmpeg.py 2016-02-09 11:57:41.000000000 +0000 +++ 
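
Since optparse lets a later occurrence of an option override an earlier one, concatenating the configuration sources in this order gives /etc/youtube-dl.conf the lowest and the command line the highest precedence. A small demonstration of that optparse behaviour:

    import optparse

    parser = optparse.OptionParser()
    parser.add_option('-r', '--limit-rate', dest='ratelimit')
    opts, _ = parser.parse_args(['-r', '50K', '-r', '4.2M'])
    print(opts.ratelimit)  # 4.2M -- the last occurrence wins
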
youtube-dl-2019.05.20/youtube_dl/postprocessor/ffmpeg.py 2019-06-07 18:49:07.000000000 +0000 @@ -4,13 +4,11 @@ import os import subprocess import time +import re from .common import AudioConversionError, PostProcessor -from ..compat import ( - compat_subprocess_get_DEVNULL, -) from ..utils import ( encodeArgument, encodeFilename, @@ -22,9 +20,33 @@ subtitles_filename, dfxp2srt, ISO639Utils, + replace_extension, ) +EXT_TO_OUT_FORMATS = { + 'aac': 'adts', + 'flac': 'flac', + 'm4a': 'ipod', + 'mka': 'matroska', + 'mkv': 'matroska', + 'mpg': 'mpeg', + 'ogv': 'ogg', + 'ts': 'mpegts', + 'wma': 'asf', + 'wmv': 'asf', +} +ACODECS = { + 'mp3': 'libmp3lame', + 'aac': 'aac', + 'flac': 'flac', + 'm4a': 'aac', + 'opus': 'libopus', + 'vorbis': 'libvorbis', + 'wav': None, +} + + class FFmpegPostProcessorError(PostProcessingError): pass @@ -52,7 +74,21 @@ def _determine_executables(self): programs = ['avprobe', 'avconv', 'ffmpeg', 'ffprobe'] - prefer_ffmpeg = False + prefer_ffmpeg = True + + def get_ffmpeg_version(path): + ver = get_exe_version(path, args=['-version']) + if ver: + regexs = [ + r'(?:\d+:)?([0-9.]+)-[0-9]+ubuntu[0-9.]+$', # Ubuntu, see [1] + r'n([0-9.]+)$', # Arch Linux + # 1. http://www.ducea.com/2006/06/17/ubuntu-package-version-naming-explanation/ + ] + for regex in regexs: + mobj = re.match(regex, ver) + if mobj: + ver = mobj.group(1) + return ver self.basename = None self.probe_basename = None @@ -60,7 +96,7 @@ self._paths = None self._versions = None if self._downloader: - prefer_ffmpeg = self._downloader.params.get('prefer_ffmpeg', False) + prefer_ffmpeg = self._downloader.params.get('prefer_ffmpeg', True) location = self._downloader.params.get('ffmpeg_location') if location is not None: if not os.path.exists(location): @@ -85,26 +121,25 @@ self._paths = dict( (p, os.path.join(location, p)) for p in programs) self._versions = dict( - (p, get_exe_version(self._paths[p], args=['-version'])) - for p in programs) + (p, get_ffmpeg_version(self._paths[p])) for p in programs) if self._versions is None: self._versions = dict( - (p, get_exe_version(p, args=['-version'])) for p in programs) + (p, get_ffmpeg_version(p)) for p in programs) self._paths = dict((p, p) for p in programs) - if prefer_ffmpeg: - prefs = ('ffmpeg', 'avconv') - else: + if prefer_ffmpeg is False: prefs = ('avconv', 'ffmpeg') + else: + prefs = ('ffmpeg', 'avconv') for p in prefs: if self._versions[p]: self.basename = p break - if prefer_ffmpeg: - prefs = ('ffprobe', 'avprobe') - else: + if prefer_ffmpeg is False: prefs = ('avprobe', 'ffprobe') + else: + prefs = ('ffprobe', 'avprobe') for p in prefs: if self._versions[p]: self.probe_basename = p @@ -126,6 +161,48 @@ def probe_executable(self): return self._paths[self.probe_basename] + def get_audio_codec(self, path): + if not self.probe_available and not self.available: + raise PostProcessingError('ffprobe/avprobe and ffmpeg/avconv not found. 
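
The two patterns in get_ffmpeg_version strip distribution decorations from the reported version so that later comparisons see a bare number; for example (version strings are illustrative):

    import re

    for ver in ('7:3.4.6-0ubuntu0.18.04.1', 'n4.1.3'):
        for regex in (r'(?:\d+:)?([0-9.]+)-[0-9]+ubuntu[0-9.]+$', r'n([0-9.]+)$'):
            mobj = re.match(regex, ver)
            if mobj:
                ver = mobj.group(1)
                break
        print(ver)  # 3.4.6, then 4.1.3
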
Please install one.') + try: + if self.probe_available: + cmd = [ + encodeFilename(self.probe_executable, True), + encodeArgument('-show_streams')] + else: + cmd = [ + encodeFilename(self.executable, True), + encodeArgument('-i')] + cmd.append(encodeFilename(self._ffmpeg_filename_argument(path), True)) + if self._downloader.params.get('verbose', False): + self._downloader.to_screen( + '[debug] %s command line: %s' % (self.basename, shell_quote(cmd))) + handle = subprocess.Popen( + cmd, stderr=subprocess.PIPE, + stdout=subprocess.PIPE, stdin=subprocess.PIPE) + stdout_data, stderr_data = handle.communicate() + expected_ret = 0 if self.probe_available else 1 + if handle.wait() != expected_ret: + return None + except (IOError, OSError): + return None + output = (stdout_data if self.probe_available else stderr_data).decode('ascii', 'ignore') + if self.probe_available: + audio_codec = None + for line in output.split('\n'): + if line.startswith('codec_name='): + audio_codec = line.split('=')[1].strip() + elif line.strip() == 'codec_type=audio' and audio_codec is not None: + return audio_codec + else: + # Stream #FILE_INDEX:STREAM_INDEX[STREAM_ID](LANGUAGE): CODEC_TYPE: CODEC_NAME + mobj = re.search( + r'Stream\s*#\d+:\d+(?:\[0x[0-9a-f]+\])?(?:\([a-z]{3}\))?:\s*Audio:\s*([0-9a-z]+)', + output) + if mobj: + return mobj.group(1) + return None + def run_ffmpeg_multiple_files(self, input_paths, out_path, opts): self.check_version() @@ -140,10 +217,13 @@ encodeArgument('-i'), encodeFilename(self._ffmpeg_filename_argument(path), True) ]) - cmd = ([encodeFilename(self.executable, True), encodeArgument('-y')] + - files_cmd + - [encodeArgument(o) for o in opts] + - [encodeFilename(self._ffmpeg_filename_argument(out_path), True)]) + cmd = [encodeFilename(self.executable, True), encodeArgument('-y')] + # avconv does not have repeat option + if self.basename == 'ffmpeg': + cmd += [encodeArgument('-loglevel'), encodeArgument('repeat+info')] + cmd += (files_cmd + + [encodeArgument(o) for o in opts] + + [encodeFilename(self._ffmpeg_filename_argument(out_path), True)]) if self._downloader.params.get('verbose', False): self._downloader.to_screen('[debug] ffmpeg command line: %s' % shell_quote(cmd)) @@ -162,7 +242,8 @@ # Always use 'file:' because the filename may contain ':' (ffmpeg # interprets that as a protocol) or can start with '-' (-- is broken in # ffmpeg, see https://ffmpeg.org/trac/ffmpeg/ticket/2127 for details) - return 'file:' + fn + # Also leave '-' intact in order not to break streaming to stdout. + return 'file:' + fn if fn != '-' else fn class FFmpegExtractAudioPP(FFmpegPostProcessor): @@ -174,31 +255,6 @@ self._preferredquality = preferredquality self._nopostoverwrites = nopostoverwrites - def get_audio_codec(self, path): - - if not self.probe_available: - raise PostProcessingError('ffprobe or avprobe not found. 
Please install one.') - try: - cmd = [ - encodeFilename(self.probe_executable, True), - encodeArgument('-show_streams'), - encodeFilename(self._ffmpeg_filename_argument(path), True)] - if self._downloader.params.get('verbose', False): - self._downloader.to_screen('[debug] %s command line: %s' % (self.basename, shell_quote(cmd))) - handle = subprocess.Popen(cmd, stderr=compat_subprocess_get_DEVNULL(), stdout=subprocess.PIPE, stdin=subprocess.PIPE) - output = handle.communicate()[0] - if handle.wait() != 0: - return None - except (IOError, OSError): - return None - audio_codec = None - for line in output.decode('ascii', 'ignore').split('\n'): - if line.startswith('codec_name='): - audio_codec = line.split('=')[1].strip() - elif line.strip() == 'codec_type=audio' and audio_codec is not None: - return audio_codec - return None - def run_ffmpeg(self, path, out_path, codec, more_opts): if codec is None: acodec_opts = [] @@ -224,7 +280,7 @@ acodec = 'copy' extension = 'm4a' more_opts = ['-bsf:a', 'aac_adtstoasc'] - elif filecodec in ['aac', 'mp3', 'vorbis', 'opus']: + elif filecodec in ['aac', 'flac', 'mp3', 'vorbis', 'opus']: # Lossless if possible acodec = 'copy' extension = filecodec @@ -243,8 +299,8 @@ else: more_opts += ['-b:a', self._preferredquality + 'k'] else: - # We convert the audio (lossy) - acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'opus': 'opus', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec] + # We convert the audio (lossy if codec is lossy) + acodec = ACODECS[self._preferredcodec] extension = self._preferredcodec more_opts = [] if self._preferredquality is not None: @@ -266,9 +322,12 @@ prefix, sep, ext = path.rpartition('.') # not os.path.splitext, since the latter does not work on unicode in all setups new_path = prefix + sep + extension + information['filepath'] = new_path + information['ext'] = extension + # If we download foo.mp3 and convert it to... foo.mp3, then don't delete foo.mp3, silly. 
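
The codec choice above can be summarized as: copy the audio stream when the file already holds what was asked for, otherwise re-encode with the encoder named in ACODECS. A simplified sketch (the patch additionally special-cases m4a remuxing and the quality options):

    ACODECS = {'mp3': 'libmp3lame', 'aac': 'aac', 'flac': 'flac',
               'm4a': 'aac', 'opus': 'libopus', 'vorbis': 'libvorbis', 'wav': None}

    def choose_acodec(filecodec, preferred):
        # Lossless stream copy if the source already matches the request;
        # otherwise transcode (lossy only if the target codec is lossy)
        if preferred == 'best' or preferred == filecodec:
            return 'copy'
        return ACODECS[preferred]
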
- if (new_path == path or - (self._nopostoverwrites and os.path.exists(encodeFilename(new_path)))): + if (new_path == path + or (self._nopostoverwrites and os.path.exists(encodeFilename(new_path)))): self._downloader.to_screen('[ffmpeg] Post-process file %s exists, skipping' % new_path) return [], information @@ -287,9 +346,6 @@ new_path, time.time(), information['filetime'], errnote='Cannot update utime of audio file') - information['filepath'] = new_path - information['ext'] = extension - return [path], information @@ -318,17 +374,34 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): def run(self, information): - if information['ext'] not in ['mp4', 'mkv']: - self._downloader.to_screen('[ffmpeg] Subtitles can only be embedded in mp4 or mkv files') + if information['ext'] not in ('mp4', 'webm', 'mkv'): + self._downloader.to_screen('[ffmpeg] Subtitles can only be embedded in mp4, webm or mkv files') return [], information subtitles = information.get('requested_subtitles') if not subtitles: self._downloader.to_screen('[ffmpeg] There aren\'t any subtitles to embed') return [], information - sub_langs = list(subtitles.keys()) filename = information['filepath'] - sub_filenames = [subtitles_filename(filename, lang, sub_info['ext']) for lang, sub_info in subtitles.items()] + + ext = information['ext'] + sub_langs = [] + sub_filenames = [] + webm_vtt_warn = False + + for lang, sub_info in subtitles.items(): + sub_ext = sub_info['ext'] + if ext != 'webm' or ext == 'webm' and sub_ext == 'vtt': + sub_langs.append(lang) + sub_filenames.append(subtitles_filename(filename, lang, sub_ext)) + else: + if not webm_vtt_warn and ext == 'webm' and sub_ext != 'vtt': + webm_vtt_warn = True + self._downloader.to_screen('[ffmpeg] Only WebVTT subtitles can be embedded in webm files') + + if not sub_langs: + return [], information + input_files = [filename] + sub_filenames opts = [ @@ -337,14 +410,16 @@ # Don't copy the existing subtitles, we may be running the # postprocessor a second time '-map', '-0:s', + # Don't copy Apple TV chapters track, bin_data (see #19042, #19024, + # https://trac.ffmpeg.org/ticket/6016) + '-map', '-0:d', ] if information['ext'] == 'mp4': opts += ['-c:s', 'mov_text'] for (i, lang) in enumerate(sub_langs): opts.extend(['-map', '%d:0' % (i + 1)]) - lang_code = ISO639Utils.short2long(lang) - if lang_code is not None: - opts.extend(['-metadata:s:s:%d' % i, 'language=%s' % lang_code]) + lang_code = ISO639Utils.short2long(lang) or lang + opts.extend(['-metadata:s:s:%d' % i, 'language=%s' % lang_code]) temp_filename = prepend_extension(filename, 'temp') self._downloader.to_screen('[ffmpeg] Embedding subtitles in \'%s\'' % filename) @@ -358,23 +433,30 @@ class FFmpegMetadataPP(FFmpegPostProcessor): def run(self, info): metadata = {} - if info.get('title') is not None: - metadata['title'] = info['title'] - if info.get('upload_date') is not None: - metadata['date'] = info['upload_date'] - if info.get('artist') is not None: - metadata['artist'] = info['artist'] - elif info.get('uploader') is not None: - metadata['artist'] = info['uploader'] - elif info.get('uploader_id') is not None: - metadata['artist'] = info['uploader_id'] - if info.get('description') is not None: - metadata['description'] = info['description'] - metadata['comment'] = info['description'] - if info.get('webpage_url') is not None: - metadata['purl'] = info['webpage_url'] - if info.get('album') is not None: - metadata['album'] = info['album'] + + def add(meta_list, info_list=None): + if not info_list: + info_list = meta_list + if 
not isinstance(meta_list, (list, tuple)): + meta_list = (meta_list,) + if not isinstance(info_list, (list, tuple)): + info_list = (info_list,) + for info_f in info_list: + if info.get(info_f) is not None: + for meta_f in meta_list: + metadata[meta_f] = info[info_f] + break + + add('title', ('track', 'title')) + add('date', 'upload_date') + add(('description', 'comment'), 'description') + add('purl', 'webpage_url') + add('track', 'track_number') + add('artist', ('artist', 'creator', 'uploader', 'uploader_id')) + add('genre') + add('album') + add('album_artist') + add('disc', 'disc_number') if not metadata: self._downloader.to_screen('[ffmpeg] There isn\'t any metadata to add') @@ -382,21 +464,40 @@ filename = info['filepath'] temp_filename = prepend_extension(filename, 'temp') + in_filenames = [filename] + options = [] if info['ext'] == 'm4a': - options = ['-vn', '-acodec', 'copy'] + options.extend(['-vn', '-acodec', 'copy']) else: - options = ['-c', 'copy'] + options.extend(['-c', 'copy']) for (name, value) in metadata.items(): options.extend(['-metadata', '%s=%s' % (name, value)]) - # https://github.com/rg3/youtube-dl/issues/8350 - if info.get('protocol') == 'm3u8_native' or info.get('protocol') == 'm3u8' and self._downloader.params.get('hls_prefer_native', False): - options.extend(['-bsf:a', 'aac_adtstoasc']) + chapters = info.get('chapters', []) + if chapters: + metadata_filename = replace_extension(filename, 'meta') + with io.open(metadata_filename, 'wt', encoding='utf-8') as f: + def ffmpeg_escape(text): + return re.sub(r'(=|;|#|\\|\n)', r'\\\1', text) + + metadata_file_content = ';FFMETADATA1\n' + for chapter in chapters: + metadata_file_content += '[CHAPTER]\nTIMEBASE=1/1000\n' + metadata_file_content += 'START=%d\n' % (chapter['start_time'] * 1000) + metadata_file_content += 'END=%d\n' % (chapter['end_time'] * 1000) + chapter_title = chapter.get('title') + if chapter_title: + metadata_file_content += 'title=%s\n' % ffmpeg_escape(chapter_title) + f.write(metadata_file_content) + in_filenames.append(metadata_filename) + options.extend(['-map_metadata', '1']) self._downloader.to_screen('[ffmpeg] Adding metadata to \'%s\'' % filename) - self.run_ffmpeg(filename, temp_filename, options) + self.run_ffmpeg_multiple_files(in_filenames, temp_filename, options) + if chapters: + os.remove(metadata_filename) os.remove(encodeFilename(filename)) os.rename(encodeFilename(temp_filename), encodeFilename(filename)) return [], info @@ -467,6 +568,21 @@ return [], info +class FFmpegFixupM3u8PP(FFmpegPostProcessor): + def run(self, info): + filename = info['filepath'] + if self.get_audio_codec(filename) == 'aac': + temp_filename = prepend_extension(filename, 'temp') + + options = ['-c', 'copy', '-f', 'mp4', '-bsf:a', 'aac_adtstoasc'] + self._downloader.to_screen('[ffmpeg] Fixing malformed AAC bitstream in "%s"' % filename) + self.run_ffmpeg(filename, temp_filename, options) + + os.remove(encodeFilename(filename)) + os.rename(encodeFilename(temp_filename), encodeFilename(filename)) + return [], info + + class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor): def __init__(self, downloader=None, format=None): super(FFmpegSubtitlesConvertorPP, self).__init__(downloader) @@ -488,14 +604,13 @@ ext = sub['ext'] if ext == new_ext: self._downloader.to_screen( - '[ffmpeg] Subtitle file for %s is already in the requested' - 'format' % new_ext) + '[ffmpeg] Subtitle file for %s is already in the requested format' % new_ext) continue old_file = subtitles_filename(filename, lang, ext) sub_filenames.append(old_file) 
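
For a video with chapters, the temporary .meta file handed to ffmpeg via -map_metadata follows the FFMETADATA format; given a single 90-second chapter titled "Intro", the code above would write:

    ;FFMETADATA1
    [CHAPTER]
    TIMEBASE=1/1000
    START=0
    END=90000
    title=Intro
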
new_file = subtitles_filename(filename, lang, new_ext) - if ext == 'dfxp' or ext == 'ttml': + if ext in ('dfxp', 'ttml', 'tt'): self._downloader.report_warning( 'You have requested to convert dfxp (TTML) subtitles into another format, ' 'which results in style information loss') @@ -503,7 +618,7 @@ dfxp_file = old_file srt_file = subtitles_filename(filename, lang, 'srt') - with io.open(dfxp_file, 'rt', encoding='utf-8') as f: + with open(dfxp_file, 'rb') as f: srt_data = dfxp2srt(f.read()) with io.open(srt_file, 'wt', encoding='utf-8') as f: diff -Nru youtube-dl-2016.02.22/youtube_dl/postprocessor/__init__.py youtube-dl-2019.05.20/youtube_dl/postprocessor/__init__.py --- youtube-dl-2016.02.22/youtube_dl/postprocessor/__init__.py 2015-12-30 19:30:33.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/postprocessor/__init__.py 2019-06-07 18:49:07.000000000 +0000 @@ -6,6 +6,7 @@ FFmpegEmbedSubtitlePP, FFmpegExtractAudioPP, FFmpegFixupStretchedPP, + FFmpegFixupM3u8PP, FFmpegFixupM4aPP, FFmpegMergerPP, FFmpegMetadataPP, @@ -26,6 +27,7 @@ 'ExecAfterDownloadPP', 'FFmpegEmbedSubtitlePP', 'FFmpegExtractAudioPP', + 'FFmpegFixupM3u8PP', 'FFmpegFixupM4aPP', 'FFmpegFixupStretchedPP', 'FFmpegMergerPP', diff -Nru youtube-dl-2016.02.22/youtube_dl/postprocessor/metadatafromtitle.py youtube-dl-2019.05.20/youtube_dl/postprocessor/metadatafromtitle.py --- youtube-dl-2016.02.22/youtube_dl/postprocessor/metadatafromtitle.py 2016-02-14 22:26:17.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/postprocessor/metadatafromtitle.py 2019-06-07 18:49:07.000000000 +0000 @@ -3,21 +3,18 @@ import re from .common import PostProcessor -from ..utils import PostProcessingError - - -class MetadataFromTitlePPError(PostProcessingError): - pass class MetadataFromTitlePP(PostProcessor): def __init__(self, downloader, titleformat): super(MetadataFromTitlePP, self).__init__(downloader) self._titleformat = titleformat - self._titleregex = self.format_to_regex(titleformat) + self._titleregex = (self.format_to_regex(titleformat) + if re.search(r'%\(\w+\)s', titleformat) + else titleformat) def format_to_regex(self, fmt): - """ + r""" Converts a string like '%(title)s - %(artist)s' to a regex like @@ -31,17 +28,21 @@ regex += r'(?P<' + match.group(1) + '>.+)' lastpos = match.end() if lastpos < len(fmt): - regex += re.escape(fmt[lastpos:len(fmt)]) + regex += re.escape(fmt[lastpos:]) return regex def run(self, info): title = info['title'] match = re.match(self._titleregex, title) if match is None: - raise MetadataFromTitlePPError('Could not interpret title of video as "%s"' % self._titleformat) + self._downloader.to_screen( + '[fromtitle] Could not interpret title of video as "%s"' + % self._titleformat) + return [], info for attribute, value in match.groupdict().items(): - value = match.group(attribute) info[attribute] = value - self._downloader.to_screen('[fromtitle] parsed ' + attribute + ': ' + value) + self._downloader.to_screen( + '[fromtitle] parsed %s: %s' + % (attribute, value if value is not None else 'NA')) return [], info diff -Nru youtube-dl-2016.02.22/youtube_dl/postprocessor/xattrpp.py youtube-dl-2019.05.20/youtube_dl/postprocessor/xattrpp.py --- youtube-dl-2016.02.22/youtube_dl/postprocessor/xattrpp.py 2016-02-14 22:26:17.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/postprocessor/xattrpp.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,36 +1,15 @@ from __future__ import unicode_literals -import os -import subprocess -import sys -import errno - from .common import PostProcessor +from ..compat import compat_os_name 
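
The xattr-writing machinery that used to live inline in this postprocessor has moved to utils.write_xattr (imported just below), so the calling side reduces to something like this (path and attribute name are illustrative):

    from youtube_dl.utils import write_xattr, XAttrMetadataError, XAttrUnavailableError

    try:
        write_xattr('video.mp4', 'user.xdg.origin.url', b'https://example.com/watch')
    except (XAttrUnavailableError, XAttrMetadataError) as e:
        print('could not write xattr: %s' % e)
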
from ..utils import ( - check_executable, hyphenate_date, - version_tuple, - PostProcessingError, - encodeArgument, - encodeFilename, + write_xattr, + XAttrMetadataError, + XAttrUnavailableError, ) -class XAttrMetadataError(PostProcessingError): - def __init__(self, code=None, msg='Unknown error'): - super(XAttrMetadataError, self).__init__(msg) - self.code = code - - # Parsing code and msg - if (self.code in (errno.ENOSPC, errno.EDQUOT) or - 'No space left' in self.msg or 'Disk quota excedded' in self.msg): - self.reason = 'NO_SPACE' - elif self.code == errno.E2BIG or 'Argument list too long' in self.msg: - self.reason = 'VALUE_TOO_LONG' - else: - self.reason = 'NOT_SUPPORTED' - - class XAttrMetadataPP(PostProcessor): # @@ -47,88 +26,6 @@ def run(self, info): """ Set extended attributes on downloaded file (if xattr support is found). """ - # This mess below finds the best xattr tool for the job and creates a - # "write_xattr" function. - try: - # try the pyxattr module... - import xattr - - # Unicode arguments are not supported in python-pyxattr until - # version 0.5.0 - # See https://github.com/rg3/youtube-dl/issues/5498 - pyxattr_required_version = '0.5.0' - if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version): - self._downloader.report_warning( - 'python-pyxattr is detected but is too old. ' - 'youtube-dl requires %s or above while your version is %s. ' - 'Falling back to other xattr implementations' % ( - pyxattr_required_version, xattr.__version__)) - - raise ImportError - - def write_xattr(path, key, value): - try: - xattr.set(path, key, value) - except EnvironmentError as e: - raise XAttrMetadataError(e.errno, e.strerror) - - except ImportError: - if os.name == 'nt': - # Write xattrs to NTFS Alternate Data Streams: - # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29 - def write_xattr(path, key, value): - assert ':' not in key - assert os.path.exists(path) - - ads_fn = path + ':' + key - try: - with open(ads_fn, 'wb') as f: - f.write(value) - except EnvironmentError as e: - raise XAttrMetadataError(e.errno, e.strerror) - else: - user_has_setfattr = check_executable('setfattr', ['--version']) - user_has_xattr = check_executable('xattr', ['-h']) - - if user_has_setfattr or user_has_xattr: - - def write_xattr(path, key, value): - value = value.decode('utf-8') - if user_has_setfattr: - executable = 'setfattr' - opts = ['-n', key, '-v', value] - elif user_has_xattr: - executable = 'xattr' - opts = ['-w', key, value] - - cmd = ([encodeFilename(executable, True)] + - [encodeArgument(o) for o in opts] + - [encodeFilename(path, True)]) - - try: - p = subprocess.Popen( - cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) - except EnvironmentError as e: - raise XAttrMetadataError(e.errno, e.strerror) - stdout, stderr = p.communicate() - stderr = stderr.decode('utf-8', 'replace') - if p.returncode != 0: - raise XAttrMetadataError(p.returncode, stderr) - - else: - # On Unix, and can't find pyxattr, setfattr, or xattr. - if sys.platform.startswith('linux'): - self._downloader.report_error( - "Couldn't find a tool to set the xattrs. " - "Install either the python 'pyxattr' or 'xattr' " - "modules, or the GNU 'attr' package " - "(which contains the 'setfattr' tool).") - else: - self._downloader.report_error( - "Couldn't find a tool to set the xattrs. 
" - "Install either the python 'xattr' module, " - "or the 'xattr' binary.") - # Write the metadata to the file's xattrs self._downloader.to_screen('[metadata] Writing metadata to file\'s xattrs') @@ -145,6 +42,7 @@ 'user.dublincore.format': 'format', } + num_written = 0 for xattrname, infoname in xattr_mapping.items(): value = info.get(infoname) @@ -155,20 +53,25 @@ byte_value = value.encode('utf-8') write_xattr(filename, xattrname, byte_value) + num_written += 1 + + return [], info + except XAttrUnavailableError as e: + self._downloader.report_error(str(e)) return [], info except XAttrMetadataError as e: if e.reason == 'NO_SPACE': self._downloader.report_warning( - 'There\'s no disk space left or disk quota exceeded. ' + - 'Extended attributes are not written.') + 'There\'s no disk space left, disk quota exceeded or filesystem xattr limit exceeded. ' + + (('Some ' if num_written else '') + 'extended attributes are not written.').capitalize()) elif e.reason == 'VALUE_TOO_LONG': self._downloader.report_warning( 'Unable to write extended attributes due to too long values.') else: msg = 'This filesystem doesn\'t support extended attributes. ' - if os.name == 'nt': + if compat_os_name == 'nt': msg += 'You need to use NTFS.' else: msg += '(You may have to enable them in your /etc/fstab)' diff -Nru youtube-dl-2016.02.22/youtube_dl/socks.py youtube-dl-2019.05.20/youtube_dl/socks.py --- youtube-dl-2016.02.22/youtube_dl/socks.py 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/socks.py 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,273 @@ +# Public Domain SOCKS proxy protocol implementation +# Adapted from https://gist.github.com/bluec0re/cafd3764412967417fd3 + +from __future__ import unicode_literals + +# References: +# SOCKS4 protocol http://www.openssh.com/txt/socks4.protocol +# SOCKS4A protocol http://www.openssh.com/txt/socks4a.protocol +# SOCKS5 protocol https://tools.ietf.org/html/rfc1928 +# SOCKS5 username/password authentication https://tools.ietf.org/html/rfc1929 + +import collections +import socket + +from .compat import ( + compat_ord, + compat_struct_pack, + compat_struct_unpack, +) + +__author__ = 'Timo Schmid <coding@timoschmid.de>' + +SOCKS4_VERSION = 4 +SOCKS4_REPLY_VERSION = 0x00 +# Excerpt from SOCKS4A protocol: +# if the client cannot resolve the destination host's domain name to find its +# IP address, it should set the first three bytes of DSTIP to NULL and the last +# byte to a non-zero value. +SOCKS4_DEFAULT_DSTIP = compat_struct_pack('!BBBB', 0, 0, 0, 0xFF) + +SOCKS5_VERSION = 5 +SOCKS5_USER_AUTH_VERSION = 0x01 +SOCKS5_USER_AUTH_SUCCESS = 0x00 + + +class Socks4Command(object): + CMD_CONNECT = 0x01 + CMD_BIND = 0x02 + + +class Socks5Command(Socks4Command): + CMD_UDP_ASSOCIATE = 0x03 + + +class Socks5Auth(object): + AUTH_NONE = 0x00 + AUTH_GSSAPI = 0x01 + AUTH_USER_PASS = 0x02 + AUTH_NO_ACCEPTABLE = 0xFF # For server response + + +class Socks5AddressType(object): + ATYP_IPV4 = 0x01 + ATYP_DOMAINNAME = 0x03 + ATYP_IPV6 = 0x04 + + +class ProxyError(socket.error): + ERR_SUCCESS = 0x00 + + def __init__(self, code=None, msg=None): + if code is not None and msg is None: + msg = self.CODES.get(code) or 'unknown error' + super(ProxyError, self).__init__(code, msg) + + +class InvalidVersionError(ProxyError): + def __init__(self, expected_version, got_version): + msg = ('Invalid response version from server. 
Expected {0:02x} got ' + '{1:02x}'.format(expected_version, got_version)) + super(InvalidVersionError, self).__init__(0, msg) + + +class Socks4Error(ProxyError): + ERR_SUCCESS = 90 + + CODES = { + 91: 'request rejected or failed', + 92: 'request rejected because SOCKS server cannot connect to identd on the client', + 93: 'request rejected because the client program and identd report different user-ids' + } + + +class Socks5Error(ProxyError): + ERR_GENERAL_FAILURE = 0x01 + + CODES = { + 0x01: 'general SOCKS server failure', + 0x02: 'connection not allowed by ruleset', + 0x03: 'Network unreachable', + 0x04: 'Host unreachable', + 0x05: 'Connection refused', + 0x06: 'TTL expired', + 0x07: 'Command not supported', + 0x08: 'Address type not supported', + 0xFE: 'unknown username or invalid password', + 0xFF: 'all offered authentication methods were rejected' + } + + +class ProxyType(object): + SOCKS4 = 0 + SOCKS4A = 1 + SOCKS5 = 2 + + +Proxy = collections.namedtuple('Proxy', ( + 'type', 'host', 'port', 'username', 'password', 'remote_dns')) + + +class sockssocket(socket.socket): + def __init__(self, *args, **kwargs): + self._proxy = None + super(sockssocket, self).__init__(*args, **kwargs) + + def setproxy(self, proxytype, addr, port, rdns=True, username=None, password=None): + assert proxytype in (ProxyType.SOCKS4, ProxyType.SOCKS4A, ProxyType.SOCKS5) + + self._proxy = Proxy(proxytype, addr, port, username, password, rdns) + + def recvall(self, cnt): + data = b'' + while len(data) < cnt: + cur = self.recv(cnt - len(data)) + if not cur: + raise EOFError('{0} bytes missing'.format(cnt - len(data))) + data += cur + return data + + def _recv_bytes(self, cnt): + data = self.recvall(cnt) + return compat_struct_unpack('!{0}B'.format(cnt), data) + + @staticmethod + def _len_and_data(data): + return compat_struct_pack('!B', len(data)) + data + + def _check_response_version(self, expected_version, got_version): + if got_version != expected_version: + self.close() + raise InvalidVersionError(expected_version, got_version) + + def _resolve_address(self, destaddr, default, use_remote_dns): + try: + return socket.inet_aton(destaddr) + except socket.error: + if use_remote_dns and self._proxy.remote_dns: + return default + else: + return socket.inet_aton(socket.gethostbyname(destaddr)) + + def _setup_socks4(self, address, is_4a=False): + destaddr, port = address + + ipaddr = self._resolve_address(destaddr, SOCKS4_DEFAULT_DSTIP, use_remote_dns=is_4a) + + packet = compat_struct_pack('!BBH', SOCKS4_VERSION, Socks4Command.CMD_CONNECT, port) + ipaddr + + username = (self._proxy.username or '').encode('utf-8') + packet += username + b'\x00' + + if is_4a and self._proxy.remote_dns: + packet += destaddr.encode('utf-8') + b'\x00' + + self.sendall(packet) + + version, resp_code, dstport, dsthost = compat_struct_unpack('!BBHI', self.recvall(8)) + + self._check_response_version(SOCKS4_REPLY_VERSION, version) + + if resp_code != Socks4Error.ERR_SUCCESS: + self.close() + raise Socks4Error(resp_code) + + return (dsthost, dstport) + + def _setup_socks4a(self, address): + self._setup_socks4(address, is_4a=True) + + def _socks5_auth(self): + packet = compat_struct_pack('!B', SOCKS5_VERSION) + + auth_methods = [Socks5Auth.AUTH_NONE] + if self._proxy.username and self._proxy.password: + auth_methods.append(Socks5Auth.AUTH_USER_PASS) + + packet += compat_struct_pack('!B', len(auth_methods)) + packet += compat_struct_pack('!{0}B'.format(len(auth_methods)), *auth_methods) + + self.sendall(packet) + + version, method = 
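
From the caller's perspective the whole handshake is hidden behind connect(); a usage sketch (proxy address and port are illustrative):

    from youtube_dl.socks import ProxyType, sockssocket

    s = sockssocket()
    s.setproxy(ProxyType.SOCKS5, '127.0.0.1', 1080, rdns=True)
    # connect() first reaches the proxy, then performs the SOCKS5
    # greeting/auth/CONNECT exchange for the real destination
    s.connect(('example.com', 80))
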
self._recv_bytes(2) + + self._check_response_version(SOCKS5_VERSION, version) + + if method == Socks5Auth.AUTH_NO_ACCEPTABLE or ( + method == Socks5Auth.AUTH_USER_PASS and (not self._proxy.username or not self._proxy.password)): + self.close() + raise Socks5Error(Socks5Auth.AUTH_NO_ACCEPTABLE) + + if method == Socks5Auth.AUTH_USER_PASS: + username = self._proxy.username.encode('utf-8') + password = self._proxy.password.encode('utf-8') + packet = compat_struct_pack('!B', SOCKS5_USER_AUTH_VERSION) + packet += self._len_and_data(username) + self._len_and_data(password) + self.sendall(packet) + + version, status = self._recv_bytes(2) + + self._check_response_version(SOCKS5_USER_AUTH_VERSION, version) + + if status != SOCKS5_USER_AUTH_SUCCESS: + self.close() + raise Socks5Error(Socks5Error.ERR_GENERAL_FAILURE) + + def _setup_socks5(self, address): + destaddr, port = address + + ipaddr = self._resolve_address(destaddr, None, use_remote_dns=True) + + self._socks5_auth() + + reserved = 0 + packet = compat_struct_pack('!BBB', SOCKS5_VERSION, Socks5Command.CMD_CONNECT, reserved) + if ipaddr is None: + destaddr = destaddr.encode('utf-8') + packet += compat_struct_pack('!B', Socks5AddressType.ATYP_DOMAINNAME) + packet += self._len_and_data(destaddr) + else: + packet += compat_struct_pack('!B', Socks5AddressType.ATYP_IPV4) + ipaddr + packet += compat_struct_pack('!H', port) + + self.sendall(packet) + + version, status, reserved, atype = self._recv_bytes(4) + + self._check_response_version(SOCKS5_VERSION, version) + + if status != Socks5Error.ERR_SUCCESS: + self.close() + raise Socks5Error(status) + + if atype == Socks5AddressType.ATYP_IPV4: + destaddr = self.recvall(4) + elif atype == Socks5AddressType.ATYP_DOMAINNAME: + alen = compat_ord(self.recv(1)) + destaddr = self.recvall(alen) + elif atype == Socks5AddressType.ATYP_IPV6: + destaddr = self.recvall(16) + destport = compat_struct_unpack('!H', self.recvall(2))[0] + + return (destaddr, destport) + + def _make_proxy(self, connect_func, address): + if not self._proxy: + return connect_func(self, address) + + result = connect_func(self, (self._proxy.host, self._proxy.port)) + if result != 0 and result is not None: + return result + setup_funcs = { + ProxyType.SOCKS4: self._setup_socks4, + ProxyType.SOCKS4A: self._setup_socks4a, + ProxyType.SOCKS5: self._setup_socks5, + } + setup_funcs[self._proxy.type](address) + return result + + def connect(self, address): + self._make_proxy(socket.socket.connect, address) + + def connect_ex(self, address): + return self._make_proxy(socket.socket.connect_ex, address) diff -Nru youtube-dl-2016.02.22/youtube_dl/swfinterp.py youtube-dl-2019.05.20/youtube_dl/swfinterp.py --- youtube-dl-2016.02.22/youtube_dl/swfinterp.py 2016-01-14 09:35:12.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/swfinterp.py 2019-06-07 18:49:07.000000000 +0000 @@ -4,10 +4,12 @@ import io import zlib -from .compat import compat_str +from .compat import ( + compat_str, + compat_struct_unpack, +) from .utils import ( ExtractorError, - struct_unpack, ) @@ -23,17 +25,17 @@ file_contents[:1]) # Determine number of bits in framesize rectangle - framesize_nbits = struct_unpack('!B', content[:1])[0] >> 3 + framesize_nbits = compat_struct_unpack('!B', content[:1])[0] >> 3 framesize_len = (5 + 4 * framesize_nbits + 7) // 8 pos = framesize_len + 2 + 2 while pos < len(content): - header16 = struct_unpack('<H', content[pos:pos + 2])[0] + header16 = compat_struct_unpack('<H', content[pos:pos + 2])[0] pos += 2 tag_code = header16 >> 6 tag_len = header16 & 
0x3f if tag_len == 0x3f: - tag_len = struct_unpack('<I', content[pos:pos + 4])[0] + tag_len = compat_struct_unpack('<I', content[pos:pos + 4])[0] pos += 4 assert pos + tag_len <= len(content), \ ('Tag %d ends at %d+%d - that\'s longer than the file (%d)' @@ -101,7 +103,7 @@ for _ in range(5): buf = reader.read(1) assert len(buf) == 1 - b = struct_unpack('<B', buf)[0] + b = compat_struct_unpack('<B', buf)[0] res = res | ((b & 0x7f) << shift) if b & 0x80 == 0: break @@ -113,6 +115,8 @@ res = _read_int(reader) assert res & 0xf0000000 == 0 return res + + _u32 = _read_int @@ -127,7 +131,7 @@ bs = reader.read(3) assert len(bs) == 3 last_byte = b'\xff' if (ord(bs[2:3]) >= 0x80) else b'\x00' - return struct_unpack('<i', bs + last_byte)[0] + return compat_struct_unpack('<i', bs + last_byte)[0] def _read_string(reader): @@ -146,7 +150,7 @@ def _read_byte(reader): resb = _read_bytes(1, reader=reader) - res = struct_unpack('<B', resb)[0] + res = compat_struct_unpack('<B', resb)[0] return res @@ -174,6 +178,7 @@ return 'undefined' __repr__ = __str__ + undefined = _Undefined() diff -Nru youtube-dl-2016.02.22/youtube_dl/update.py youtube-dl-2019.05.20/youtube_dl/update.py --- youtube-dl-2016.02.22/youtube_dl/update.py 2016-02-14 22:26:17.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/update.py 2019-06-07 18:49:07.000000000 +0000 @@ -31,7 +31,7 @@ def update_self(to_screen, verbose, opener): """Update the program file with the latest version from the repository""" - UPDATE_URL = 'https://rg3.github.io/youtube-dl/update/' + UPDATE_URL = 'https://yt-dl.org/update/' VERSION_URL = UPDATE_URL + 'LATEST_VERSION' JSON_URL = UPDATE_URL + 'versions.json' UPDATES_RSA_KEY = (0x9d60ee4d8f805312fdb15a62f87b95bd66177b91df176765d13514a0f1754bcd2057295c5b6f1d35daa6742c3ffc9a82d3e118861c207995a8031e151d863c9927e304576bc80692bc8e094896fcf11b66f3e29e04e3a71e9a11558558acea1840aec37fc396fb6b65dc81a1c4144e03bd1c011de62e3f1357b327d08426fe93, 65537) @@ -83,11 +83,8 @@ print_notes(to_screen, versions_info['versions']) - filename = sys.argv[0] - # Py2EXE: Filename could be different - if hasattr(sys, 'frozen') and not os.path.isfile(filename): - if os.path.isfile(filename + '.exe'): - filename += '.exe' + # sys.executable is set to the full pathname of the exe-file for py2exe + filename = sys.executable if hasattr(sys, 'frozen') else sys.argv[0] if not os.access(filename, os.W_OK): to_screen('ERROR: no write permissions on %s' % filename) @@ -95,7 +92,7 @@ # Py2EXE if hasattr(sys, 'frozen'): - exe = os.path.abspath(filename) + exe = filename directory = os.path.dirname(exe) if not os.access(directory, os.W_OK): to_screen('ERROR: no write permissions on %s' % directory) diff -Nru youtube-dl-2016.02.22/youtube_dl/utils.py youtube-dl-2019.05.20/youtube_dl/utils.py --- youtube-dl-2016.02.22/youtube_dl/utils.py 2016-02-22 10:57:19.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/utils.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,5 +1,5 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals @@ -11,22 +11,22 @@ import ctypes import datetime import email.utils +import email.header import errno import functools import gzip -import itertools import io +import itertools import json import locale import math import operator import os -import pipes import platform +import random import re -import ssl import socket -import struct +import ssl import subprocess import sys import tempfile @@ -35,29 +35,54 @@ import zlib from .compat import ( + compat_HTMLParseError, + compat_HTMLParser, 
compat_basestring, compat_chr, + compat_cookiejar, + compat_ctypes_WINFUNCTYPE, compat_etree_fromstring, + compat_expanduser, compat_html_entities, + compat_html_entities_html5, compat_http_client, compat_kwargs, + compat_os_name, compat_parse_qs, - compat_socket_create_connection, + compat_shlex_quote, compat_str, + compat_struct_pack, + compat_struct_unpack, compat_urllib_error, compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, + compat_urllib_parse_unquote_plus, compat_urllib_request, compat_urlparse, - shlex_quote, + compat_xpath, +) + +from .socks import ( + ProxyType, + sockssocket, ) +def register_socks_protocols(): + # "Register" SOCKS protocols + # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904 + # URLs with protocols not in urlparse.uses_netloc are not handled correctly + for scheme in ('socks', 'socks4', 'socks4a', 'socks5'): + if scheme not in compat_urlparse.uses_netloc: + compat_urlparse.uses_netloc.append(scheme) + + # This is not clearly defined otherwise compiled_regex_type = type(re.compile('')) std_headers = { - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)', + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', @@ -65,12 +90,24 @@ } +USER_AGENTS = { + 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27', +} + + NO_DEFAULT = object() ENGLISH_MONTH_NAMES = [ 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'] +MONTH_NAMES = { + 'en': ENGLISH_MONTH_NAMES, + 'fr': [ + 'janvier', 'février', 'mars', 'avril', 'mai', 'juin', + 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'], +} + KNOWN_EXTENSIONS = ( 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac', 'flv', 'f4v', 'f4a', 'f4b', @@ -86,6 +123,69 @@ 'wav', 'f4f', 'f4m', 'm3u8', 'smil') +# needed for sanitizing filenames in restricted mode +ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ', + itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'], + 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y'))) + +DATE_FORMATS = ( + '%d %B %Y', + '%d %b %Y', + '%B %d %Y', + '%B %dst %Y', + '%B %dnd %Y', + '%B %dth %Y', + '%b %d %Y', + '%b %dst %Y', + '%b %dnd %Y', + '%b %dth %Y', + '%b %dst %Y %I:%M', + '%b %dnd %Y %I:%M', + '%b %dth %Y %I:%M', + '%Y %m %d', + '%Y-%m-%d', + '%Y/%m/%d', + '%Y/%m/%d %H:%M', + '%Y/%m/%d %H:%M:%S', + '%Y-%m-%d %H:%M', + '%Y-%m-%d %H:%M:%S', + '%Y-%m-%d %H:%M:%S.%f', + '%d.%m.%Y %H:%M', + '%d.%m.%Y %H.%M', + '%Y-%m-%dT%H:%M:%SZ', + '%Y-%m-%dT%H:%M:%S.%fZ', + '%Y-%m-%dT%H:%M:%S.%f0Z', + '%Y-%m-%dT%H:%M:%S', + '%Y-%m-%dT%H:%M:%S.%f', + '%Y-%m-%dT%H:%M', + '%b %d %Y at %H:%M', + '%b %d %Y at %H:%M:%S', + '%B %d %Y at %H:%M', + '%B %d %Y at %H:%M:%S', +) + +DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS) +DATE_FORMATS_DAY_FIRST.extend([ + '%d-%m-%Y', + '%d.%m.%Y', + '%d.%m.%y', + '%d/%m/%Y', + '%d/%m/%y', + '%d/%m/%Y %H:%M:%S', +]) + +DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS) +DATE_FORMATS_MONTH_FIRST.extend([ + '%m-%d-%Y', + '%m.%d.%Y', + '%m/%d/%Y', + '%m/%d/%y', + '%m/%d/%Y %H:%M:%S', +]) + +PACKED_CODES_RE = 
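
The DATE_FORMATS tables above are consumed elsewhere in utils by trying each format in turn until one parses; schematically:

    import datetime

    def try_formats(date_str, formats):
        for fmt in formats:
            try:
                return datetime.datetime.strptime(date_str, fmt)
            except ValueError:
                pass

    try_formats('2019-05-20 18:49:07', ('%d.%m.%Y %H:%M', '%Y-%m-%d %H:%M:%S'))
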
r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)" +JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>' + def preferredencoding(): """Get preferred encoding. @@ -160,18 +260,11 @@ def find_xpath_attr(node, xpath, key, val=None): """ Find the xpath xpath[@key=val] """ assert re.match(r'^[a-zA-Z_-]+$', key) - if val: - assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val) expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val)) return node.find(expr) else: def find_xpath_attr(node, xpath, key, val=None): - # Here comes the crazy part: In 2.6, if the xpath is a unicode, - # .//node does not match if a node is a direct child of . ! - if isinstance(xpath, compat_str): - xpath = xpath.encode('ascii') - - for f in node.findall(xpath): + for f in node.findall(compat_xpath(xpath)): if key not in f.attrib: continue if val is None or f.attrib.get(key) == val: @@ -196,9 +289,7 @@ def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT): def _find_xpath(xpath): - if sys.version_info < (2, 7): # Crazy 2.6 - xpath = xpath.encode('ascii') - return node.find(xpath) + return node.find(compat_xpath(xpath)) if isinstance(xpath, (str, compat_str)): n = _find_xpath(xpath) @@ -252,27 +343,83 @@ return get_element_by_attribute('id', id, html) -def get_element_by_attribute(attribute, value, html): +def get_element_by_class(class_name, html): + """Return the content of the first tag with the specified class in the passed HTML document""" + retval = get_elements_by_class(class_name, html) + return retval[0] if retval else None + + +def get_element_by_attribute(attribute, value, html, escape_value=True): + retval = get_elements_by_attribute(attribute, value, html, escape_value) + return retval[0] if retval else None + + +def get_elements_by_class(class_name, html): + """Return the content of all tags with the specified class in the passed HTML document as a list""" + return get_elements_by_attribute( + 'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name), + html, escape_value=False) + + +def get_elements_by_attribute(attribute, value, html, escape_value=True): """Return the content of the tag with the specified attribute in the passed HTML document""" - m = re.search(r'''(?xs) + value = re.escape(value) if escape_value else value + + retlist = [] + for m in re.finditer(r'''(?xs) <([a-zA-Z0-9:._-]+) - (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*? + (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*? \s+%s=['"]?%s['"]? - (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*? + (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*? \s*> (?P<content>.*?) 
</\1> - ''' % (re.escape(attribute), re.escape(value)), html) + ''' % (re.escape(attribute), value), html): + res = m.group('content') - if not m: - return None - res = m.group('content') + if res.startswith('"') or res.startswith("'"): + res = res[1:-1] - if res.startswith('"') or res.startswith("'"): - res = res[1:-1] + retlist.append(unescapeHTML(res)) - return unescapeHTML(res) + return retlist + + +class HTMLAttributeParser(compat_HTMLParser): + """Trivial HTML parser to gather the attributes for a single element""" + def __init__(self): + self.attrs = {} + compat_HTMLParser.__init__(self) + + def handle_starttag(self, tag, attrs): + self.attrs = dict(attrs) + + +def extract_attributes(html_element): + """Given a string for an HTML element such as + <el + a="foo" B="bar" c="&98;az" d=boz + empty= noval entity="&" + sq='"' dq="'" + > + Decode and return a dictionary of attributes. + { + 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz', + 'empty': '', 'noval': None, 'entity': '&', + 'sq': '"', 'dq': '\'' + }. + NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions, + but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5. + """ + parser = HTMLAttributeParser() + try: + parser.feed(html_element) + parser.close() + # Older Python may throw HTMLParseError in case of malformed HTML + except compat_HTMLParseError: + pass + return parser.attrs def clean_html(html): @@ -283,8 +430,8 @@ # Newline vs <br /> html = html.replace('\n', ' ') - html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html) - html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html) + html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html) + html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html) # Strip html tags html = re.sub('<.*?>', '', html) # Replace html entities @@ -336,9 +483,12 @@ def sanitize_filename(s, restricted=False, is_id=False): """Sanitizes a string so it could be used as part of a filename. If restricted is set, use a stricter subset of allowed characters. - Set is_id if this is not an arbitrary string, but an ID that should be kept if possible + Set is_id if this is not an arbitrary string, but an ID that should be kept + if possible. """ def replace_insane(char): + if restricted and char in ACCENT_CHARS: + return ACCENT_CHARS[char] if char == '?' 
or ord(char) < 32 or ord(char) == 127: return '' elif char == '"': @@ -382,18 +532,38 @@ if drive_or_unc: norm_path.pop(0) sanitized_path = [ - path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part) + path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part) for path_part in norm_path] if drive_or_unc: sanitized_path.insert(0, drive_or_unc + os.path.sep) return os.path.join(*sanitized_path) -# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of -# unwanted failures due to missing protocol +def sanitize_url(url): + # Prepend protocol-less URLs with `http:` scheme in order to mitigate + # the number of unwanted failures due to missing protocol + if url.startswith('//'): + return 'http:%s' % url + # Fix some common typos seen so far + COMMON_TYPOS = ( + # https://github.com/ytdl-org/youtube-dl/issues/15649 + (r'^httpss://', r'https://'), + # https://bx1.be/lives/direct-tv/ + (r'^rmtp([es]?)://', r'rtmp\1://'), + ) + for mistake, fixup in COMMON_TYPOS: + if re.match(mistake, url): + return re.sub(mistake, fixup, url) + return url + + def sanitized_Request(url, *args, **kwargs): - return compat_urllib_request.Request( - 'http:%s' % url if url.startswith('//') else url, *args, **kwargs) + return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs) + + +def expand_path(s): + """Expand shell variables and ~""" + return os.path.expandvars(compat_expanduser(s)) def orderedSet(iterable): @@ -405,12 +575,19 @@ return res -def _htmlentity_transform(entity): +def _htmlentity_transform(entity_with_semicolon): """Transforms an HTML entity to a character.""" + entity = entity_with_semicolon[:-1] + # Known non-numeric HTML entity if entity in compat_html_entities.name2codepoint: return compat_chr(compat_html_entities.name2codepoint[entity]) + # TODO: HTML5 allows entities without a semicolon. For example, + # 'Éric' should be decoded as 'Éric'. + if entity_with_semicolon in compat_html_entities_html5: + return compat_html_entities_html5[entity_with_semicolon] + mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity) if mobj is not None: numstr = mobj.group(1) @@ -419,7 +596,7 @@ numstr = '0%s' % numstr else: base = 10 - # See https://github.com/rg3/youtube-dl/issues/7518 + # See https://github.com/ytdl-org/youtube-dl/issues/7518 try: return compat_chr(int(numstr, base)) except ValueError: @@ -435,7 +612,7 @@ assert type(s) == compat_str return re.sub( - r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s) + r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s) def get_subprocess_encoding(): @@ -467,6 +644,10 @@ if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5: return s + # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible + if sys.platform.startswith('java'): + return s + return s.encode(get_subprocess_encoding(), 'ignore') @@ -549,7 +730,12 @@ return msg -class ExtractorError(Exception): +class YoutubeDLError(Exception): + """Base exception for YoutubeDL errors.""" + pass + + +class ExtractorError(YoutubeDLError): """Error during info extraction.""" def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None): @@ -590,7 +776,19 @@ pass -class DownloadError(Exception): +class GeoRestrictedError(ExtractorError): + """Geographic restriction Error exception. 
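
Given the patterns in sanitize_url above, its expected effect on protocol-less and typo'd URLs is:

    # assuming sanitize_url as defined in this hunk
    sanitize_url('//example.com/video')         # -> 'http://example.com/video'
    sanitize_url('httpss://example.com/video')  # -> 'https://example.com/video'
    sanitize_url('rmtpe://example.com/live')    # -> 'rtmpe://example.com/live'
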
+ + This exception may be thrown when a video is not available from your + geographic location due to geographic restrictions imposed by a website. + """ + def __init__(self, msg, countries=None): + super(GeoRestrictedError, self).__init__(msg, expected=True) + self.msg = msg + self.countries = countries + + +class DownloadError(YoutubeDLError): """Download Error exception. This exception may be thrown by FileDownloader objects if they are not @@ -604,7 +802,7 @@ self.exc_info = exc_info -class SameFileError(Exception): +class SameFileError(YoutubeDLError): """Same File exception. This exception will be thrown by FileDownloader objects if they detect @@ -613,7 +811,7 @@ pass -class PostProcessingError(Exception): +class PostProcessingError(YoutubeDLError): """Post Processing exception. This exception may be raised by PostProcessor's .run() method to @@ -621,15 +819,16 @@ """ def __init__(self, msg): + super(PostProcessingError, self).__init__(msg) self.msg = msg -class MaxDownloadsReached(Exception): +class MaxDownloadsReached(YoutubeDLError): """ --max-downloads limit has been reached. """ pass -class UnavailableVideoError(Exception): +class UnavailableVideoError(YoutubeDLError): """Unavailable Format exception. This exception will be thrown when a video is requested @@ -638,7 +837,7 @@ pass -class ContentTooShortError(Exception): +class ContentTooShortError(YoutubeDLError): """Content Too Short exception. This exception may be raised by FileDownloader objects when a file they @@ -647,26 +846,87 @@ """ def __init__(self, downloaded, expected): + super(ContentTooShortError, self).__init__( + 'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected) + ) # Both in bytes self.downloaded = downloaded self.expected = expected +class XAttrMetadataError(YoutubeDLError): + def __init__(self, code=None, msg='Unknown error'): + super(XAttrMetadataError, self).__init__(msg) + self.code = code + self.msg = msg + + # Parsing code and msg + if (self.code in (errno.ENOSPC, errno.EDQUOT) + or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg): + self.reason = 'NO_SPACE' + elif self.code == errno.E2BIG or 'Argument list too long' in self.msg: + self.reason = 'VALUE_TOO_LONG' + else: + self.reason = 'NOT_SUPPORTED' + + +class XAttrUnavailableError(YoutubeDLError): + pass + + def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs): # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting # expected HTTP responses to meet HTTP/1.0 or later (see also - # https://github.com/rg3/youtube-dl/issues/6727) + # https://github.com/ytdl-org/youtube-dl/issues/6727) if sys.version_info < (3, 0): - kwargs[b'strict'] = True - hc = http_class(*args, **kwargs) + kwargs['strict'] = True + hc = http_class(*args, **compat_kwargs(kwargs)) source_address = ydl_handler._params.get('source_address') + if source_address is not None: + # This is to work around _create_connection() from socket where it will try all + # address data from getaddrinfo() including IPv6. This filters the result from + # getaddrinfo() based on the source_address value. + # This is based on the cpython socket.create_connection() function. + # https://github.com/python/cpython/blob/master/Lib/socket.py#L691 + def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None): + host, port = address + err = None + addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM) + af = socket.AF_INET if '.' 
in source_address[0] else socket.AF_INET6 + ip_addrs = [addr for addr in addrs if addr[0] == af] + if addrs and not ip_addrs: + ip_version = 'v4' if af == socket.AF_INET else 'v6' + raise socket.error( + "No remote IP%s addresses available for connect, can't use '%s' as source address" + % (ip_version, source_address[0])) + for res in ip_addrs: + af, socktype, proto, canonname, sa = res + sock = None + try: + sock = socket.socket(af, socktype, proto) + if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT: + sock.settimeout(timeout) + sock.bind(source_address) + sock.connect(sa) + err = None # Explicitly break reference cycle + return sock + except socket.error as _: + err = _ + if sock is not None: + sock.close() + if err is not None: + raise err + else: + raise socket.error('getaddrinfo returns an empty list') + if hasattr(hc, '_create_connection'): + hc._create_connection = _create_connection sa = (source_address, 0) if hasattr(hc, 'source_address'): # Python 2.7+ hc.source_address = sa else: # Python 2.6 def _hc_connect(self, *args, **kwargs): - sock = compat_socket_create_connection( + sock = _create_connection( (self.host, self.port), self.timeout, sa) if is_https: self.sock = ssl.wrap_socket( @@ -712,8 +972,15 @@ self._params = params def http_open(self, req): + conn_class = compat_http_client.HTTPConnection + + socks_proxy = req.headers.get('Ytdl-socks-proxy') + if socks_proxy: + conn_class = make_socks_conn_class(conn_class, socks_proxy) + del req.headers['Ytdl-socks-proxy'] + return self.do_open(functools.partial( - _create_http_connection, self, compat_http_client.HTTPConnection, False), + _create_http_connection, self, conn_class, False), req) @staticmethod @@ -723,14 +990,6 @@ except zlib.error: return zlib.decompress(data) - @staticmethod - def addinfourl_wrapper(stream, headers, url, code): - if hasattr(compat_urllib_request.addinfourl, 'getcode'): - return compat_urllib_request.addinfourl(stream, headers, url, code) - ret = compat_urllib_request.addinfourl(stream, headers, url) - ret.code = code - return ret - def http_request(self, req): # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not # always respected by websites, some tend to give out URLs with non percent-encoded @@ -745,12 +1004,7 @@ # Substitute URL if any change after escaping if url != url_escaped: - req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request - new_req = req_type( - url_escaped, data=req.data, headers=req.headers, - origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) - new_req.timeout = req.timeout - req = new_req + req = update_Request(req, url=url_escaped) for h, v in std_headers.items(): # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275 @@ -787,26 +1041,30 @@ break else: raise original_ioerror - resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code) + resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code) resp.msg = old_resp.msg del resp.headers['Content-encoding'] # deflate if resp.headers.get('Content-encoding', '') == 'deflate': gz = io.BytesIO(self.deflate(resp.read())) - resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code) + resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code) resp.msg = old_resp.msg del resp.headers['Content-encoding'] # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see - # 
https://github.com/rg3/youtube-dl/issues/6457). + # https://github.com/ytdl-org/youtube-dl/issues/6457). if 300 <= resp.code < 400: location = resp.headers.get('Location') if location: # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3 if sys.version_info >= (3, 0): location = location.encode('iso-8859-1').decode('utf-8') + else: + location = location.decode('utf-8') location_escaped = escape_url(location) if location != location_escaped: del resp.headers['Location'] + if sys.version_info < (3, 0): + location_escaped = location_escaped.encode('utf-8') resp.headers['Location'] = location_escaped return resp @@ -814,6 +1072,49 @@ https_response = http_response +def make_socks_conn_class(base_class, socks_proxy): + assert issubclass(base_class, ( + compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection)) + + url_components = compat_urlparse.urlparse(socks_proxy) + if url_components.scheme.lower() == 'socks5': + socks_type = ProxyType.SOCKS5 + elif url_components.scheme.lower() in ('socks', 'socks4'): + socks_type = ProxyType.SOCKS4 + elif url_components.scheme.lower() == 'socks4a': + socks_type = ProxyType.SOCKS4A + + def unquote_if_non_empty(s): + if not s: + return s + return compat_urllib_parse_unquote_plus(s) + + proxy_args = ( + socks_type, + url_components.hostname, url_components.port or 1080, + True, # Remote DNS + unquote_if_non_empty(url_components.username), + unquote_if_non_empty(url_components.password), + ) + + class SocksConnection(base_class): + def connect(self): + self.sock = sockssocket() + self.sock.setproxy(*proxy_args) + if type(self.timeout) in (int, float): + self.sock.settimeout(self.timeout) + self.sock.connect((self.host, self.port)) + + if isinstance(self, compat_http_client.HTTPSConnection): + if hasattr(self, '_context'): # Python > 2.6 + self.sock = self._context.wrap_socket( + self.sock, server_hostname=self.host) + else: + self.sock = ssl.wrap_socket(self.sock) + + return SocksConnection + + class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler): def __init__(self, params, https_conn_class=None, *args, **kwargs): compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs) @@ -822,15 +1123,66 @@ def https_open(self, req): kwargs = {} + conn_class = self._https_conn_class + if hasattr(self, '_context'): # python > 2.6 kwargs['context'] = self._context if hasattr(self, '_check_hostname'): # python 3.x kwargs['check_hostname'] = self._check_hostname + + socks_proxy = req.headers.get('Ytdl-socks-proxy') + if socks_proxy: + conn_class = make_socks_conn_class(conn_class, socks_proxy) + del req.headers['Ytdl-socks-proxy'] + return self.do_open(functools.partial( - _create_http_connection, self, self._https_conn_class, True), + _create_http_connection, self, conn_class, True), req, **kwargs) +class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar): + _HTTPONLY_PREFIX = '#HttpOnly_' + + def save(self, filename=None, ignore_discard=False, ignore_expires=False): + # Store session cookies with `expires` set to 0 instead of an empty + # string + for cookie in self: + if cookie.expires is None: + cookie.expires = 0 + compat_cookiejar.MozillaCookieJar.save(self, filename, ignore_discard, ignore_expires) + + def load(self, filename=None, ignore_discard=False, ignore_expires=False): + """Load cookies from a file.""" + if filename is None: + if self.filename is not None: + filename = self.filename + else: + raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT) + + cf = io.StringIO() + with open(filename) as f: + for 
line in f: + if line.startswith(self._HTTPONLY_PREFIX): + line = line[len(self._HTTPONLY_PREFIX):] + cf.write(compat_str(line)) + cf.seek(0) + self._really_load(cf, filename, ignore_discard, ignore_expires) + # Session cookies are denoted by either `expires` field set to + # an empty string or 0. MozillaCookieJar only recognizes the former + # (see [1]). So we need to force the latter to be recognized as session + # cookies on our own. + # Session cookies may be important for cookies-based authentication, + # e.g. usually, when the user does not check the 'Remember me' check box + # while logging in on a site, some important cookies are stored as session + # cookies so that not recognizing them will result in failed login. + # 1. https://bugs.python.org/issue17164 + for cookie in self: + # Treat `expires=0` cookies as session cookies + if cookie.expires == 0: + cookie.expires = None + cookie.discard = True + + class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor): def __init__(self, cookiejar=None): compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar) @@ -838,7 +1190,7 @@ def http_response(self, request, response): # Python 2 will choke on next HTTP request in row if there are non-ASCII # characters in Set-Cookie HTTP header of last response (see - # https://github.com/rg3/youtube-dl/issues/6769). + # https://github.com/ytdl-org/youtube-dl/issues/6769). # In order to at least prevent crashing we will percent encode Set-Cookie # header before HTTPCookieProcessor starts processing it. # if sys.version_info < (3, 0) and response.headers: @@ -855,6 +1207,24 @@ https_response = http_response +def extract_timezone(date_str): + m = re.search( + r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)', + date_str) + if not m: + timezone = datetime.timedelta() + else: + date_str = date_str[:-len(m.group('tz'))] + if not m.group('sign'): + timezone = datetime.timedelta() + else: + sign = 1 if m.group('sign') == '+' else -1 + timezone = datetime.timedelta( + hours=sign * int(m.group('hours')), + minutes=sign * int(m.group('minutes'))) + return timezone, date_str + + def parse_iso8601(date_str, delimiter='T', timezone=None): """ Return a UNIX timestamp from the given date """ @@ -864,20 +1234,8 @@ date_str = re.sub(r'\.[0-9]+', '', date_str) if timezone is None: - m = re.search( - r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)', - date_str) - if not m: - timezone = datetime.timedelta() - else: - date_str = date_str[:-len(m.group(0))] - if not m.group('sign'): - timezone = datetime.timedelta() - else: - sign = 1 if m.group('sign') == '+' else -1 - timezone = datetime.timedelta( - hours=sign * int(m.group('hours')), - minutes=sign * int(m.group('minutes'))) + timezone, date_str = extract_timezone(date_str) + try: date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter) dt = datetime.datetime.strptime(date_str, date_format) - timezone @@ -886,6 +1244,10 @@ pass +def date_formats(day_first=True): + return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST + + def unified_strdate(date_str, day_first=True): """Return a string with the date in the format YYYYMMDD""" @@ -894,52 +1256,11 @@ upload_date = None # Replace commas date_str = date_str.replace(',', ' ') - # %z (UTC offset) is only supported in python>=3.2 - if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str): - date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str) # Remove AM/PM + timezone date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', 
date_str) + _, date_str = extract_timezone(date_str) - format_expressions = [ - '%d %B %Y', - '%d %b %Y', - '%B %d %Y', - '%b %d %Y', - '%b %dst %Y %I:%M%p', - '%b %dnd %Y %I:%M%p', - '%b %dth %Y %I:%M%p', - '%Y %m %d', - '%Y-%m-%d', - '%Y/%m/%d', - '%Y/%m/%d %H:%M:%S', - '%Y-%m-%d %H:%M:%S', - '%Y-%m-%d %H:%M:%S.%f', - '%d.%m.%Y %H:%M', - '%d.%m.%Y %H.%M', - '%Y-%m-%dT%H:%M:%SZ', - '%Y-%m-%dT%H:%M:%S.%fZ', - '%Y-%m-%dT%H:%M:%S.%f0Z', - '%Y-%m-%dT%H:%M:%S', - '%Y-%m-%dT%H:%M:%S.%f', - '%Y-%m-%dT%H:%M', - ] - if day_first: - format_expressions.extend([ - '%d-%m-%Y', - '%d.%m.%Y', - '%d/%m/%Y', - '%d/%m/%y', - '%d/%m/%Y %H:%M:%S', - ]) - else: - format_expressions.extend([ - '%m-%d-%Y', - '%m.%d.%Y', - '%m/%d/%Y', - '%m/%d/%y', - '%m/%d/%Y %H:%M:%S', - ]) - for expression in format_expressions: + for expression in date_formats(day_first): try: upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d') except ValueError: @@ -947,13 +1268,49 @@ if upload_date is None: timetuple = email.utils.parsedate_tz(date_str) if timetuple: - upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d') + try: + upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d') + except ValueError: + pass if upload_date is not None: return compat_str(upload_date) +def unified_timestamp(date_str, day_first=True): + if date_str is None: + return None + + date_str = re.sub(r'[,|]', '', date_str) + + pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0 + timezone, date_str = extract_timezone(date_str) + + # Remove AM/PM + timezone + date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str) + + # Remove unrecognized timezones from ISO 8601 alike timestamps + m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str) + if m: + date_str = date_str[:-len(m.group('tz'))] + + # Python only supports microseconds, so remove nanoseconds + m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str) + if m: + date_str = m.group(1) + + for expression in date_formats(day_first): + try: + dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta) + return calendar.timegm(dt.timetuple()) + except ValueError: + pass + timetuple = email.utils.parsedate_tz(date_str) + if timetuple: + return calendar.timegm(timetuple) + pm_delta * 3600 + + def determine_ext(url, default_ext='unknown_video'): - if url is None: + if url is None or '.' 
not in url: return default_ext guess = url.partition('?')[0].rpartition('.')[2] if re.match(r'^[A-Za-z0-9]+$', guess): @@ -978,7 +1335,7 @@ return today if date_str == 'yesterday': return today - datetime.timedelta(days=1) - match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str) + match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str) if match is not None: sign = match.group('sign') time = int(match.group('time')) @@ -1073,31 +1430,31 @@ if fileno not in WIN_OUTPUT_IDS: return False - GetStdHandle = ctypes.WINFUNCTYPE( + GetStdHandle = compat_ctypes_WINFUNCTYPE( ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)( - (b'GetStdHandle', ctypes.windll.kernel32)) + ('GetStdHandle', ctypes.windll.kernel32)) h = GetStdHandle(WIN_OUTPUT_IDS[fileno]) - WriteConsoleW = ctypes.WINFUNCTYPE( + WriteConsoleW = compat_ctypes_WINFUNCTYPE( ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR, ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD), - ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32)) + ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32)) written = ctypes.wintypes.DWORD(0) - GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32)) + GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32)) FILE_TYPE_CHAR = 0x0002 FILE_TYPE_REMOTE = 0x8000 - GetConsoleMode = ctypes.WINFUNCTYPE( + GetConsoleMode = compat_ctypes_WINFUNCTYPE( ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.POINTER(ctypes.wintypes.DWORD))( - (b'GetConsoleMode', ctypes.windll.kernel32)) + ('GetConsoleMode', ctypes.windll.kernel32)) INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value def not_a_console(handle): if handle == INVALID_HANDLE_VALUE or handle is None: return True - return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or - GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0) + return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR + or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0) if not_a_console(h): return False @@ -1133,8 +1490,8 @@ if _windows_write_string(s, out): return - if ('b' in getattr(out, 'mode', '') or - sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr + if ('b' in getattr(out, 'mode', '') + or sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr byt = s.encode(encoding or preferredencoding(), 'ignore') out.write(byt) elif hasattr(out, 'buffer'): @@ -1158,7 +1515,7 @@ def intlist_to_bytes(xs): if not xs: return b'' - return struct_pack('%dB' % len(xs), *xs) + return compat_struct_pack('%dB' % len(xs), *xs) # Cross-platform file locking @@ -1217,13 +1574,23 @@ raise OSError('Unlocking file failed: %r' % ctypes.FormatError()) else: - import fcntl + # Some platforms, such as Jython, are missing fcntl + try: + import fcntl - def _lock_file(f, exclusive): - fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH) + def _lock_file(f, exclusive): + fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH) - def _unlock_file(f): - fcntl.flock(f, fcntl.LOCK_UN) + def _unlock_file(f): + fcntl.flock(f, fcntl.LOCK_UN) + except ImportError: + UNSUPPORTED_MSG = 'file locking is not supported on this platform' + + def _lock_file(f, exclusive): + raise IOError(UNSUPPORTED_MSG) + + def _unlock_file(f): + raise IOError(UNSUPPORTED_MSG) class 
locked_file(object): @@ -1269,14 +1636,16 @@ if isinstance(a, bytes): # We may get a filename encoded with 'encodeFilename' a = a.decode(encoding) - quoted_args.append(pipes.quote(a)) + quoted_args.append(compat_shlex_quote(a)) return ' '.join(quoted_args) def smuggle_url(url, data): """ Pass additional data in a URL for internal use. """ - sdata = compat_urllib_parse.urlencode( + url, idata = unsmuggle_url(url, {}) + data.update(idata) + sdata = compat_urllib_parse_urlencode( {'__youtubedl_smuggle': json.dumps(data)}) return url + '#' + sdata @@ -1304,6 +1673,17 @@ return '%.2f%s' % (converted, suffix) +def lookup_unit_table(unit_table, s): + units_re = '|'.join(re.escape(u) for u in unit_table) + m = re.match( + r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s) + if not m: + return None + num_str = m.group('num').replace(',', '.') + mult = unit_table[m.group('unit')] + return int(float(num_str) * mult) + + def parse_filesize(s): if s is None: return None @@ -1313,56 +1693,126 @@ _UNIT_TABLE = { 'B': 1, 'b': 1, + 'bytes': 1, 'KiB': 1024, 'KB': 1000, 'kB': 1024, 'Kb': 1000, + 'kb': 1000, + 'kilobytes': 1000, + 'kibibytes': 1024, 'MiB': 1024 ** 2, 'MB': 1000 ** 2, 'mB': 1024 ** 2, 'Mb': 1000 ** 2, + 'mb': 1000 ** 2, + 'megabytes': 1000 ** 2, + 'mebibytes': 1024 ** 2, 'GiB': 1024 ** 3, 'GB': 1000 ** 3, 'gB': 1024 ** 3, 'Gb': 1000 ** 3, + 'gb': 1000 ** 3, + 'gigabytes': 1000 ** 3, + 'gibibytes': 1024 ** 3, 'TiB': 1024 ** 4, 'TB': 1000 ** 4, 'tB': 1024 ** 4, 'Tb': 1000 ** 4, + 'tb': 1000 ** 4, + 'terabytes': 1000 ** 4, + 'tebibytes': 1024 ** 4, 'PiB': 1024 ** 5, 'PB': 1000 ** 5, 'pB': 1024 ** 5, 'Pb': 1000 ** 5, + 'pb': 1000 ** 5, + 'petabytes': 1000 ** 5, + 'pebibytes': 1024 ** 5, 'EiB': 1024 ** 6, 'EB': 1000 ** 6, 'eB': 1024 ** 6, 'Eb': 1000 ** 6, + 'eb': 1000 ** 6, + 'exabytes': 1000 ** 6, + 'exbibytes': 1024 ** 6, 'ZiB': 1024 ** 7, 'ZB': 1000 ** 7, 'zB': 1024 ** 7, 'Zb': 1000 ** 7, + 'zb': 1000 ** 7, + 'zettabytes': 1000 ** 7, + 'zebibytes': 1024 ** 7, 'YiB': 1024 ** 8, 'YB': 1000 ** 8, 'yB': 1024 ** 8, 'Yb': 1000 ** 8, + 'yb': 1000 ** 8, + 'yottabytes': 1000 ** 8, + 'yobibytes': 1024 ** 8, } - units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE) - m = re.match( - r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s) - if not m: + return lookup_unit_table(_UNIT_TABLE, s) + + +def parse_count(s): + if s is None: return None - num_str = m.group('num').replace(',', '.') - mult = _UNIT_TABLE[m.group('unit')] - return int(float(num_str) * mult) + s = s.strip() + if re.match(r'^[\d,.]+$', s): + return str_to_int(s) -def month_by_name(name): + _UNIT_TABLE = { + 'k': 1000, + 'K': 1000, + 'm': 1000 ** 2, + 'M': 1000 ** 2, + 'kk': 1000 ** 2, + 'KK': 1000 ** 2, + } + + return lookup_unit_table(_UNIT_TABLE, s) + + +def parse_resolution(s): + if s is None: + return {} + + mobj = re.search(r'\b(?P<w>\d+)\s*[xX×]\s*(?P<h>\d+)\b', s) + if mobj: + return { + 'width': int(mobj.group('w')), + 'height': int(mobj.group('h')), + } + + mobj = re.search(r'\b(\d+)[pPiI]\b', s) + if mobj: + return {'height': int(mobj.group(1))} + + mobj = re.search(r'\b([48])[kK]\b', s) + if mobj: + return {'height': int(mobj.group(1)) * 540} + + return {} + + +def parse_bitrate(s): + if not isinstance(s, compat_str): + return + mobj = re.search(r'\b(\d+)\s*kbps', s) + if mobj: + return int(mobj.group(1)) + + +def month_by_name(name, lang='en'): """ Return the number of a month by (locale-independently) English name """ + month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en']) + try: - return 
ENGLISH_MONTH_NAMES.index(name) + 1 + return month_names.index(name) + 1 except ValueError: return None @@ -1387,10 +1837,21 @@ def setproctitle(title): assert isinstance(title, compat_str) + + # ctypes in Jython is not complete + # http://bugs.jython.org/issue2148 + if sys.platform.startswith('java'): + return + try: libc = ctypes.cdll.LoadLibrary('libc.so.6') except OSError: return + except TypeError: + # LoadLibrary in Windows Python 2.7.13 only expects + # a bytestring, but since unicode_literals turns + # every string into a unicode string, it fails. + return title_bytes = title.encode('utf-8') buf = ctypes.create_string_buffer(len(title_bytes)) buf.value = title_bytes @@ -1401,15 +1862,11 @@ def remove_start(s, start): - if s.startswith(start): - return s[len(start):] - return s + return s[len(start):] if s is not None and s.startswith(start) else s def remove_end(s, end): - if s.endswith(end): - return s[:-len(end)] - return s + return s[:-len(end)] if s is not None and s.endswith(end) else s def remove_quotes(s): @@ -1426,11 +1883,35 @@ return path.strip('/').split('/')[-1] +def base_url(url): + return re.match(r'https?://[^?#&]+/', url).group() + + +def urljoin(base, path): + if isinstance(path, bytes): + path = path.decode('utf-8') + if not isinstance(path, compat_str) or not path: + return None + if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path): + return path + if isinstance(base, bytes): + base = base.decode('utf-8') + if not isinstance(base, compat_str) or not re.match( + r'^(?:https?:)?//', base): + return None + return compat_urlparse.urljoin(base, path) + + class HEADRequest(compat_urllib_request.Request): def get_method(self): return 'HEAD' +class PUTRequest(compat_urllib_request.Request): + def get_method(self): + return 'PUT' + + def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1): if get_attr: if v is not None: @@ -1441,7 +1922,7 @@ return default try: return int(v) * invscale // scale - except ValueError: + except (ValueError, TypeError): return default @@ -1462,54 +1943,81 @@ return default try: return float(v) * invscale / scale - except ValueError: + except (ValueError, TypeError): return default +def bool_or_none(v, default=None): + return v if isinstance(v, bool) else default + + +def strip_or_none(v, default=None): + return v.strip() if isinstance(v, compat_str) else default + + +def url_or_none(url): + if not url or not isinstance(url, compat_str): + return None + url = url.strip() + return url if re.match(r'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//', url) else None + + def parse_duration(s): if not isinstance(s, compat_basestring): return None s = s.strip() - m = re.match( - r'''(?ix)(?:P?T)? - (?: - (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*| - (?P<only_hours>[0-9.]+)\s*(?:hours?)| - - \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*| - (?: + days, hours, mins, secs, ms = [None] * 5 + m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s) + if m: + days, hours, mins, secs, ms = m.groups() + else: + m = re.match( + r'''(?ix)(?:P? (?: - (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)? - (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s* + [0-9]+\s*y(?:ears?)?\s* )? - (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s* - )? - (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)? 
- )$''', s) - if not m: - return None - res = 0 - if m.group('only_mins'): - return float_or_none(m.group('only_mins'), invscale=60) - if m.group('only_hours'): - return float_or_none(m.group('only_hours'), invscale=60 * 60) - if m.group('secs'): - res += int(m.group('secs')) - if m.group('mins_reversed'): - res += int(m.group('mins_reversed')) * 60 - if m.group('mins'): - res += int(m.group('mins')) * 60 - if m.group('hours'): - res += int(m.group('hours')) * 60 * 60 - if m.group('hours_reversed'): - res += int(m.group('hours_reversed')) * 60 * 60 - if m.group('days'): - res += int(m.group('days')) * 24 * 60 * 60 - if m.group('ms'): - res += float(m.group('ms')) - return res + (?: + [0-9]+\s*m(?:onths?)?\s* + )? + (?: + [0-9]+\s*w(?:eeks?)?\s* + )? + (?: + (?P<days>[0-9]+)\s*d(?:ays?)?\s* + )? + T)? + (?: + (?P<hours>[0-9]+)\s*h(?:ours?)?\s* + )? + (?: + (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s* + )? + (?: + (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s* + )?Z?$''', s) + if m: + days, hours, mins, secs, ms = m.groups() + else: + m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s) + if m: + hours, mins = m.groups() + else: + return None + + duration = 0 + if secs: + duration += float(secs) + if mins: + duration += float(mins) * 60 + if hours: + duration += float(hours) * 60 * 60 + if days: + duration += float(days) * 24 * 60 * 60 + if ms: + duration += float(ms) + return duration def prepend_extension(filename, ext, expected_real_ext=None): @@ -1542,8 +2050,12 @@ """ Returns the version of the specified executable, or False if the executable is not present """ try: + # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers + # SIGTTOU if youtube-dl is run in the background. 
+ # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656 out, _ = subprocess.Popen( [encodeArgument(exe)] + args, + stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate() except OSError: return False @@ -1570,9 +2082,12 @@ class OnDemandPagedList(PagedList): - def __init__(self, pagefunc, pagesize): + def __init__(self, pagefunc, pagesize, use_cache=True): self._pagefunc = pagefunc self._pagesize = pagesize + self._use_cache = use_cache + if use_cache: + self._cache = {} def getslice(self, start=0, end=None): res = [] @@ -1582,7 +2097,13 @@ if start >= nextfirstid: continue - page_results = list(self._pagefunc(pagenum)) + page_results = None + if self._use_cache: + page_results = self._cache.get(pagenum) + if page_results is None: + page_results = list(self._pagefunc(pagenum)) + if self._use_cache: + self._cache[pagenum] = page_results startv = ( start % self._pagesize @@ -1668,29 +2189,13 @@ """Escape URL as suggested by RFC 3986""" url_parsed = compat_urllib_parse_urlparse(url) return url_parsed._replace( + netloc=url_parsed.netloc.encode('idna').decode('ascii'), path=escape_rfc3986(url_parsed.path), params=escape_rfc3986(url_parsed.params), query=escape_rfc3986(url_parsed.query), fragment=escape_rfc3986(url_parsed.fragment) ).geturl() -try: - struct.pack('!I', 0) -except TypeError: - # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument - def struct_pack(spec, *args): - if isinstance(spec, compat_str): - spec = spec.encode('ascii') - return struct.pack(spec, *args) - - def struct_unpack(spec, *args): - if isinstance(spec, compat_str): - spec = spec.encode('ascii') - return struct.unpack(spec, *args) -else: - struct_pack = struct.pack - struct_unpack = struct.unpack - def read_batch_urls(batch_fd): def fixup(url): @@ -1709,13 +2214,89 @@ def urlencode_postdata(*args, **kargs): - return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii') + return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii') + + +def update_url_query(url, query): + if not query: + return url + parsed_url = compat_urlparse.urlparse(url) + qs = compat_parse_qs(parsed_url.query) + qs.update(query) + return compat_urlparse.urlunparse(parsed_url._replace( + query=compat_urllib_parse_urlencode(qs, True))) + + +def update_Request(req, url=None, data=None, headers={}, query={}): + req_headers = req.headers.copy() + req_headers.update(headers) + req_data = data or req.data + req_url = update_url_query(url or req.get_full_url(), query) + req_get_method = req.get_method() + if req_get_method == 'HEAD': + req_type = HEADRequest + elif req_get_method == 'PUT': + req_type = PUTRequest + else: + req_type = compat_urllib_request.Request + new_req = req_type( + req_url, data=req_data, headers=req_headers, + origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) + if hasattr(req, 'timeout'): + new_req.timeout = req.timeout + return new_req + + +def _multipart_encode_impl(data, boundary): + content_type = 'multipart/form-data; boundary=%s' % boundary + + out = b'' + for k, v in data.items(): + out += b'--' + boundary.encode('ascii') + b'\r\n' + if isinstance(k, compat_str): + k = k.encode('utf-8') + if isinstance(v, compat_str): + v = v.encode('utf-8') + # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578 + # suggests sending UTF-8 directly. 
Firefox sends UTF-8, too + content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n' + if boundary.encode('ascii') in content: + raise ValueError('Boundary overlaps with data') + out += content + + out += b'--' + boundary.encode('ascii') + b'--\r\n' + + return out, content_type -def encode_dict(d, encoding='utf-8'): - def encode(v): - return v.encode(encoding) if isinstance(v, compat_basestring) else v - return dict((encode(k), encode(v)) for k, v in d.items()) +def multipart_encode(data, boundary=None): + ''' + Encode a dict to RFC 7578-compliant form-data + + data: + A dict where keys and values can be either Unicode or bytes-like + objects. + boundary: + If a Unicode object is specified, it is used as the boundary. Otherwise + a random boundary is generated. + + Reference: https://tools.ietf.org/html/rfc7578 + ''' + has_specified_boundary = boundary is not None + + while True: + if boundary is None: + boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff)) + + try: + out, content_type = _multipart_encode_impl(data, boundary) + break + except ValueError: + if has_specified_boundary: + raise + boundary = None + + return out, content_type def dict_get(d, key_or_keys, default=None, skip_false_values=True): @@ -1728,6 +2309,33 @@ return d.get(key_or_keys, default) +def try_get(src, getter, expected_type=None): + if not isinstance(getter, (list, tuple)): + getter = [getter] + for get in getter: + try: + v = get(src) + except (AttributeError, KeyError, TypeError, IndexError): + pass + else: + if expected_type is None or isinstance(v, expected_type): + return v + + +def merge_dicts(*dicts): + merged = {} + for a_dict in dicts: + for k, v in a_dict.items(): + if v is None: + continue + if (k not in merged + or (isinstance(v, compat_str) and v + and isinstance(merged[k], compat_str) + and not merged[k])): + merged[k] = v + return merged + + def encode_compat_str(string, encoding=preferredencoding(), errors='strict'): return string if isinstance(string, compat_str) else compat_str(string, encoding, errors) @@ -1741,41 +2349,81 @@ } +TV_PARENTAL_GUIDELINES = { + 'TV-Y': 0, + 'TV-Y7': 7, + 'TV-G': 0, + 'TV-PG': 0, + 'TV-14': 14, + 'TV-MA': 17, +} + + def parse_age_limit(s): - if s is None: + if type(s) == int: + return s if 0 <= s <= 21 else None + if not isinstance(s, compat_basestring): return None m = re.match(r'^(?P<age>\d{1,2})\+?$', s) - return int(m.group('age')) if m else US_RATINGS.get(s) + if m: + return int(m.group('age')) + if s in US_RATINGS: + return US_RATINGS[s] + m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s) + if m: + return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)] + return None def strip_jsonp(code): return re.sub( - r'(?s)^[a-zA-Z0-9_.]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code) + r'''(?sx)^ + (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*) + (?:\s*&&\s*(?P=func_name))? + \s*\(\s*(?P<callback_data>.*)\);? 
+ \s*?(?://[^\n]*)*$''', + r'\g<callback_data>', code) def js_to_json(code): + COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*' + SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE) + INTEGER_TABLE = ( + (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16), + (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8), + ) + def fix_kv(m): v = m.group(0) if v in ('true', 'false', 'null'): return v - if v.startswith('"'): - v = re.sub(r"\\'", "'", v[1:-1]) - elif v.startswith("'"): - v = v[1:-1] - v = re.sub(r"\\\\|\\'|\"", lambda m: { - '\\\\': '\\\\', - "\\'": "'", + elif v.startswith('/*') or v.startswith('//') or v == ',': + return "" + + if v[0] in ("'", '"'): + v = re.sub(r'(?s)\\.|"', lambda m: { '"': '\\"', - }[m.group(0)], v) + "\\'": "'", + '\\\n': '', + '\\x': '\\u00', + }.get(m.group(0), m.group(0)), v[1:-1]) + + for regex, base in INTEGER_TABLE: + im = re.match(regex, v) + if im: + i = int(im.group(1), base) + return '"%d":' % i if v.endswith(':') else '%d' % i + return '"%s"' % v - res = re.sub(r'''(?x) - "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"| - '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'| - [a-zA-Z_][.a-zA-Z_0-9]* - ''', fix_kv, code) - res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res) - return res + return re.sub(r'''(?sx) + "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"| + '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'| + {comment}|,(?={skip}[\]}}])| + (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*| + \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?| + [0-9]+(?={skip}:) + '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code) def qualities(quality_ids): @@ -1823,7 +2471,7 @@ def args_to_str(args): # Get a short string representation for a subprocess command - return ' '.join(shlex_quote(a) for a in args) + return ' '.join(compat_shlex_quote(a) for a in args) def error_to_compat_str(err): @@ -1836,33 +2484,80 @@ def mimetype2ext(mt): + if mt is None: + return None + ext = { 'audio/mp4': 'm4a', + # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. 
Here use .mp3 as + # it's the most popular one + 'audio/mpeg': 'mp3', }.get(mt) if ext is not None: return ext _, _, res = mt.rpartition('/') + res = res.split(';')[0].strip().lower() return { '3gpp': '3gp', 'smptett+xml': 'tt', - 'srt': 'srt', 'ttaf+xml': 'dfxp', 'ttml+xml': 'ttml', - 'vtt': 'vtt', 'x-flv': 'flv', 'x-mp4-fragmented': 'mp4', + 'x-ms-sami': 'sami', 'x-ms-wmv': 'wmv', + 'mpegurl': 'm3u8', + 'x-mpegurl': 'm3u8', + 'vnd.apple.mpegurl': 'm3u8', + 'dash+xml': 'mpd', + 'f4m+xml': 'f4m', + 'hds+xml': 'f4m', + 'vnd.ms-sstr+xml': 'ism', + 'quicktime': 'mov', + 'mp2t': 'ts', }.get(res, res) +def parse_codecs(codecs_str): + # http://tools.ietf.org/html/rfc6381 + if not codecs_str: + return {} + splited_codecs = list(filter(None, map( + lambda str: str.strip(), codecs_str.strip().strip(',').split(',')))) + vcodec, acodec = None, None + for full_codec in splited_codecs: + codec = full_codec.split('.')[0] + if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1', 'av01'): + if not vcodec: + vcodec = full_codec + elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'): + if not acodec: + acodec = full_codec + else: + write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr) + if not vcodec and not acodec: + if len(splited_codecs) == 2: + return { + 'vcodec': vcodec, + 'acodec': acodec, + } + elif len(splited_codecs) == 1: + return { + 'vcodec': 'none', + 'acodec': vcodec, + } + else: + return { + 'vcodec': vcodec or 'none', + 'acodec': acodec or 'none', + } + return {} + + def urlhandle_detect_ext(url_handle): - try: - url_handle.headers - getheader = lambda h: url_handle.headers[h] - except AttributeError: # Python < 3 - getheader = url_handle.info().getheader + getheader = url_handle.headers.get cd = getheader('Content-Disposition') if cd: @@ -1953,6 +2648,7 @@ \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s* (?: (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)| + (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)| (?P<strval>(?![0-9.])[a-z0-9A-Z]*) ) \s*$ @@ -1960,11 +2656,22 @@ m = operator_rex.search(filter_part) if m: op = COMPARISON_OPERATORS[m.group('op')] - if m.group('strval') is not None: + actual_value = dct.get(m.group('key')) + if (m.group('quotedstrval') is not None + or m.group('strval') is not None + # If the original field is a string and matching comparisonvalue is + # a number we should respect the origin of the original field + # and process comparison value as a string (see + # https://github.com/ytdl-org/youtube-dl/issues/11082). + or actual_value is not None and m.group('intval') is not None + and isinstance(actual_value, compat_str)): if m.group('op') not in ('=', '!='): raise ValueError( 'Operator %s does not support string values!' 
% m.group('op')) - comparison_value = m.group('strval') + comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval') + quote = m.group('quote') + if quote is not None: + comparison_value = comparison_value.replace(r'\%s' % quote, quote) else: try: comparison_value = int(m.group('intval')) @@ -1976,14 +2683,13 @@ raise ValueError( 'Invalid integer value %r in filter part %r' % ( m.group('intval'), filter_part)) - actual_value = dct.get(m.group('key')) if actual_value is None: return m.group('none_inclusive') return op(actual_value, comparison_value) UNARY_OPERATORS = { - '': lambda v: v is not None, - '!': lambda v: v is None, + '': lambda v: (v is True) if isinstance(v, bool) else (v is not None), + '!': lambda v: (v is False) if isinstance(v, bool) else (v is None), } operator_rex = re.compile(r'''(?x)\s* (?P<op>%s)\s*(?P<key>[a-z_]+) @@ -2033,26 +2739,102 @@ def dfxp2srt(dfxp_data): + ''' + @param dfxp_data A bytes-like object containing DFXP data + @returns A unicode object containing converted SRT data + ''' + LEGACY_NAMESPACES = ( + (b'http://www.w3.org/ns/ttml', [ + b'http://www.w3.org/2004/11/ttaf1', + b'http://www.w3.org/2006/04/ttaf1', + b'http://www.w3.org/2006/10/ttaf1', + ]), + (b'http://www.w3.org/ns/ttml#styling', [ + b'http://www.w3.org/ns/ttml#style', + ]), + ) + + SUPPORTED_STYLING = [ + 'color', + 'fontFamily', + 'fontSize', + 'fontStyle', + 'fontWeight', + 'textDecoration' + ] + _x = functools.partial(xpath_with_ns, ns_map={ + 'xml': 'http://www.w3.org/XML/1998/namespace', 'ttml': 'http://www.w3.org/ns/ttml', - 'ttaf1': 'http://www.w3.org/2006/10/ttaf1', + 'tts': 'http://www.w3.org/ns/ttml#styling', }) + styles = {} + default_style = {} + class TTMLPElementParser(object): - out = '' + _out = '' + _unclosed_elements = [] + _applied_styles = [] def start(self, tag, attrib): - if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'): - self.out += '\n' + if tag in (_x('ttml:br'), 'br'): + self._out += '\n' + else: + unclosed_elements = [] + style = {} + element_style_id = attrib.get('style') + if default_style: + style.update(default_style) + if element_style_id: + style.update(styles.get(element_style_id, {})) + for prop in SUPPORTED_STYLING: + prop_val = attrib.get(_x('tts:' + prop)) + if prop_val: + style[prop] = prop_val + if style: + font = '' + for k, v in sorted(style.items()): + if self._applied_styles and self._applied_styles[-1].get(k) == v: + continue + if k == 'color': + font += ' color="%s"' % v + elif k == 'fontSize': + font += ' size="%s"' % v + elif k == 'fontFamily': + font += ' face="%s"' % v + elif k == 'fontWeight' and v == 'bold': + self._out += '<b>' + unclosed_elements.append('b') + elif k == 'fontStyle' and v == 'italic': + self._out += '<i>' + unclosed_elements.append('i') + elif k == 'textDecoration' and v == 'underline': + self._out += '<u>' + unclosed_elements.append('u') + if font: + self._out += '<font' + font + '>' + unclosed_elements.append('font') + applied_style = {} + if self._applied_styles: + applied_style.update(self._applied_styles[-1]) + applied_style.update(style) + self._applied_styles.append(applied_style) + self._unclosed_elements.append(unclosed_elements) def end(self, tag): - pass + if tag not in (_x('ttml:br'), 'br'): + unclosed_elements = self._unclosed_elements.pop() + for element in reversed(unclosed_elements): + self._out += '</%s>' % element + if unclosed_elements and self._applied_styles: + self._applied_styles.pop() def data(self, data): - self.out += data + self._out += data def close(self): - return 
self.out.strip() + return self._out.strip() def parse_node(node): target = TTMLPElementParser() @@ -2060,13 +2842,47 @@ parser.feed(xml.etree.ElementTree.tostring(node)) return parser.close() - dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8')) + for k, v in LEGACY_NAMESPACES: + for ns in v: + dfxp_data = dfxp_data.replace(ns, k) + + dfxp = compat_etree_fromstring(dfxp_data) out = [] - paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p') + paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p') if not paras: raise ValueError('Invalid dfxp/TTML subtitle') + repeat = False + while True: + for style in dfxp.findall(_x('.//ttml:style')): + style_id = style.get('id') or style.get(_x('xml:id')) + if not style_id: + continue + parent_style_id = style.get('style') + if parent_style_id: + if parent_style_id not in styles: + repeat = True + continue + styles[style_id] = styles[parent_style_id].copy() + for prop in SUPPORTED_STYLING: + prop_val = style.get(_x('tts:' + prop)) + if prop_val: + styles.setdefault(style_id, {})[prop] = prop_val + if repeat: + repeat = False + else: + break + + for p in ('body', 'div'): + ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p]) + if ele is None: + continue + style = styles.get(ele.get('style')) + if not style: + continue + default_style.update(style) + for para, index in zip(paras, itertools.count(1)): begin_time = parse_dfxp_time_expr(para.attrib.get('begin')) end_time = parse_dfxp_time_expr(para.attrib.get('end')) @@ -2088,11 +2904,15 @@ def cli_option(params, command_option, param): param = params.get(param) + if param: + param = compat_str(param) return [command_option, param] if param is not None else [] def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None): param = params.get(param) + if param is None: + return [] assert isinstance(param, bool) if separator: return [command_option + separator + (true_value if param else false_value)] @@ -2172,6 +2992,7 @@ 'gv': 'glv', 'ha': 'hau', 'he': 'heb', + 'iw': 'heb', # Replaced by he in 1989 revision 'hi': 'hin', 'ho': 'hmo', 'hr': 'hrv', @@ -2181,6 +3002,7 @@ 'hz': 'her', 'ia': 'ina', 'id': 'ind', + 'in': 'ind', # Replaced by id in 1989 revision 'ie': 'ile', 'ig': 'ibo', 'ii': 'iii', @@ -2295,6 +3117,7 @@ 'wo': 'wol', 'xh': 'xho', 'yi': 'yid', + 'ji': 'yid', # Replaced by yi in 1989 revision 'yo': 'yor', 'za': 'zha', 'zh': 'zho', @@ -2574,6 +3397,263 @@ return cls._country_map.get(code.upper()) +class GeoUtils(object): + # Major IPv4 address blocks per country + _country_ip_map = { + 'AD': '85.94.160.0/19', + 'AE': '94.200.0.0/13', + 'AF': '149.54.0.0/17', + 'AG': '209.59.64.0/18', + 'AI': '204.14.248.0/21', + 'AL': '46.99.0.0/16', + 'AM': '46.70.0.0/15', + 'AO': '105.168.0.0/13', + 'AP': '159.117.192.0/21', + 'AR': '181.0.0.0/12', + 'AS': '202.70.112.0/20', + 'AT': '84.112.0.0/13', + 'AU': '1.128.0.0/11', + 'AW': '181.41.0.0/18', + 'AZ': '5.191.0.0/16', + 'BA': '31.176.128.0/17', + 'BB': '65.48.128.0/17', + 'BD': '114.130.0.0/16', + 'BE': '57.0.0.0/8', + 'BF': '129.45.128.0/17', + 'BG': '95.42.0.0/15', + 'BH': '37.131.0.0/17', + 'BI': '154.117.192.0/18', + 'BJ': '137.255.0.0/16', + 'BL': '192.131.134.0/24', + 'BM': '196.12.64.0/18', + 'BN': '156.31.0.0/16', + 'BO': '161.56.0.0/16', + 'BQ': '161.0.80.0/20', + 'BR': '152.240.0.0/12', + 'BS': '24.51.64.0/18', + 'BT': '119.2.96.0/19', + 'BW': '168.167.0.0/16', + 'BY': '178.120.0.0/13', + 'BZ': '179.42.192.0/18', + 'CA': '99.224.0.0/11', + 'CD': 
'41.243.0.0/16', + 'CF': '196.32.200.0/21', + 'CG': '197.214.128.0/17', + 'CH': '85.0.0.0/13', + 'CI': '154.232.0.0/14', + 'CK': '202.65.32.0/19', + 'CL': '152.172.0.0/14', + 'CM': '165.210.0.0/15', + 'CN': '36.128.0.0/10', + 'CO': '181.240.0.0/12', + 'CR': '201.192.0.0/12', + 'CU': '152.206.0.0/15', + 'CV': '165.90.96.0/19', + 'CW': '190.88.128.0/17', + 'CY': '46.198.0.0/15', + 'CZ': '88.100.0.0/14', + 'DE': '53.0.0.0/8', + 'DJ': '197.241.0.0/17', + 'DK': '87.48.0.0/12', + 'DM': '192.243.48.0/20', + 'DO': '152.166.0.0/15', + 'DZ': '41.96.0.0/12', + 'EC': '186.68.0.0/15', + 'EE': '90.190.0.0/15', + 'EG': '156.160.0.0/11', + 'ER': '196.200.96.0/20', + 'ES': '88.0.0.0/11', + 'ET': '196.188.0.0/14', + 'EU': '2.16.0.0/13', + 'FI': '91.152.0.0/13', + 'FJ': '144.120.0.0/16', + 'FM': '119.252.112.0/20', + 'FO': '88.85.32.0/19', + 'FR': '90.0.0.0/9', + 'GA': '41.158.0.0/15', + 'GB': '25.0.0.0/8', + 'GD': '74.122.88.0/21', + 'GE': '31.146.0.0/16', + 'GF': '161.22.64.0/18', + 'GG': '62.68.160.0/19', + 'GH': '45.208.0.0/14', + 'GI': '85.115.128.0/19', + 'GL': '88.83.0.0/19', + 'GM': '160.182.0.0/15', + 'GN': '197.149.192.0/18', + 'GP': '104.250.0.0/19', + 'GQ': '105.235.224.0/20', + 'GR': '94.64.0.0/13', + 'GT': '168.234.0.0/16', + 'GU': '168.123.0.0/16', + 'GW': '197.214.80.0/20', + 'GY': '181.41.64.0/18', + 'HK': '113.252.0.0/14', + 'HN': '181.210.0.0/16', + 'HR': '93.136.0.0/13', + 'HT': '148.102.128.0/17', + 'HU': '84.0.0.0/14', + 'ID': '39.192.0.0/10', + 'IE': '87.32.0.0/12', + 'IL': '79.176.0.0/13', + 'IM': '5.62.80.0/20', + 'IN': '117.192.0.0/10', + 'IO': '203.83.48.0/21', + 'IQ': '37.236.0.0/14', + 'IR': '2.176.0.0/12', + 'IS': '82.221.0.0/16', + 'IT': '79.0.0.0/10', + 'JE': '87.244.64.0/18', + 'JM': '72.27.0.0/17', + 'JO': '176.29.0.0/16', + 'JP': '126.0.0.0/8', + 'KE': '105.48.0.0/12', + 'KG': '158.181.128.0/17', + 'KH': '36.37.128.0/17', + 'KI': '103.25.140.0/22', + 'KM': '197.255.224.0/20', + 'KN': '198.32.32.0/19', + 'KP': '175.45.176.0/22', + 'KR': '175.192.0.0/10', + 'KW': '37.36.0.0/14', + 'KY': '64.96.0.0/15', + 'KZ': '2.72.0.0/13', + 'LA': '115.84.64.0/18', + 'LB': '178.135.0.0/16', + 'LC': '192.147.231.0/24', + 'LI': '82.117.0.0/19', + 'LK': '112.134.0.0/15', + 'LR': '41.86.0.0/19', + 'LS': '129.232.0.0/17', + 'LT': '78.56.0.0/13', + 'LU': '188.42.0.0/16', + 'LV': '46.109.0.0/16', + 'LY': '41.252.0.0/14', + 'MA': '105.128.0.0/11', + 'MC': '88.209.64.0/18', + 'MD': '37.246.0.0/16', + 'ME': '178.175.0.0/17', + 'MF': '74.112.232.0/21', + 'MG': '154.126.0.0/17', + 'MH': '117.103.88.0/21', + 'MK': '77.28.0.0/15', + 'ML': '154.118.128.0/18', + 'MM': '37.111.0.0/17', + 'MN': '49.0.128.0/17', + 'MO': '60.246.0.0/16', + 'MP': '202.88.64.0/20', + 'MQ': '109.203.224.0/19', + 'MR': '41.188.64.0/18', + 'MS': '208.90.112.0/22', + 'MT': '46.11.0.0/16', + 'MU': '105.16.0.0/12', + 'MV': '27.114.128.0/18', + 'MW': '105.234.0.0/16', + 'MX': '187.192.0.0/11', + 'MY': '175.136.0.0/13', + 'MZ': '197.218.0.0/15', + 'NA': '41.182.0.0/16', + 'NC': '101.101.0.0/18', + 'NE': '197.214.0.0/18', + 'NF': '203.17.240.0/22', + 'NG': '105.112.0.0/12', + 'NI': '186.76.0.0/15', + 'NL': '145.96.0.0/11', + 'NO': '84.208.0.0/13', + 'NP': '36.252.0.0/15', + 'NR': '203.98.224.0/19', + 'NU': '49.156.48.0/22', + 'NZ': '49.224.0.0/14', + 'OM': '5.36.0.0/15', + 'PA': '186.72.0.0/15', + 'PE': '186.160.0.0/14', + 'PF': '123.50.64.0/18', + 'PG': '124.240.192.0/19', + 'PH': '49.144.0.0/13', + 'PK': '39.32.0.0/11', + 'PL': '83.0.0.0/11', + 'PM': '70.36.0.0/20', + 'PR': '66.50.0.0/16', + 'PS': '188.161.0.0/16', + 'PT': 
'85.240.0.0/13', + 'PW': '202.124.224.0/20', + 'PY': '181.120.0.0/14', + 'QA': '37.210.0.0/15', + 'RE': '139.26.0.0/16', + 'RO': '79.112.0.0/13', + 'RS': '178.220.0.0/14', + 'RU': '5.136.0.0/13', + 'RW': '105.178.0.0/15', + 'SA': '188.48.0.0/13', + 'SB': '202.1.160.0/19', + 'SC': '154.192.0.0/11', + 'SD': '154.96.0.0/13', + 'SE': '78.64.0.0/12', + 'SG': '152.56.0.0/14', + 'SI': '188.196.0.0/14', + 'SK': '78.98.0.0/15', + 'SL': '197.215.0.0/17', + 'SM': '89.186.32.0/19', + 'SN': '41.82.0.0/15', + 'SO': '197.220.64.0/19', + 'SR': '186.179.128.0/17', + 'SS': '105.235.208.0/21', + 'ST': '197.159.160.0/19', + 'SV': '168.243.0.0/16', + 'SX': '190.102.0.0/20', + 'SY': '5.0.0.0/16', + 'SZ': '41.84.224.0/19', + 'TC': '65.255.48.0/20', + 'TD': '154.68.128.0/19', + 'TG': '196.168.0.0/14', + 'TH': '171.96.0.0/13', + 'TJ': '85.9.128.0/18', + 'TK': '27.96.24.0/21', + 'TL': '180.189.160.0/20', + 'TM': '95.85.96.0/19', + 'TN': '197.0.0.0/11', + 'TO': '175.176.144.0/21', + 'TR': '78.160.0.0/11', + 'TT': '186.44.0.0/15', + 'TV': '202.2.96.0/19', + 'TW': '120.96.0.0/11', + 'TZ': '156.156.0.0/14', + 'UA': '93.72.0.0/13', + 'UG': '154.224.0.0/13', + 'US': '3.0.0.0/8', + 'UY': '167.56.0.0/13', + 'UZ': '82.215.64.0/18', + 'VA': '212.77.0.0/19', + 'VC': '24.92.144.0/20', + 'VE': '186.88.0.0/13', + 'VG': '172.103.64.0/18', + 'VI': '146.226.0.0/16', + 'VN': '14.160.0.0/11', + 'VU': '202.80.32.0/20', + 'WF': '117.20.32.0/21', + 'WS': '202.4.32.0/19', + 'YE': '134.35.0.0/16', + 'YT': '41.242.116.0/22', + 'ZA': '41.0.0.0/11', + 'ZM': '165.56.0.0/13', + 'ZW': '41.85.192.0/19', + } + + @classmethod + def random_ipv4(cls, code_or_block): + if len(code_or_block) == 2: + block = cls._country_ip_map.get(code_or_block.upper()) + if not block: + return None + else: + block = code_or_block + addr, preflen = block.split('/') + addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0] + addr_max = addr_min | (0xffffffff >> int(preflen)) + return compat_str(socket.inet_ntoa( + compat_struct_pack('!L', random.randint(addr_min, addr_max)))) + + class PerRequestProxyHandler(compat_urllib_request.ProxyHandler): def __init__(self, proxies=None): # Set default handlers @@ -2581,7 +3661,7 @@ setattr(self, '%s_open' % type, lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open: meth(r, proxy, type)) - return compat_urllib_request.ProxyHandler.__init__(self, proxies) + compat_urllib_request.ProxyHandler.__init__(self, proxies) def proxy_open(self, req, proxy, type): req_proxy = req.headers.get('Ytdl-request-proxy') @@ -2591,10 +3671,65 @@ if proxy == '__noproxy__': return None # No Proxy + if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'): + req.add_header('Ytdl-socks-proxy', proxy) + # youtube-dl's http/https handlers take care of wrapping the socket with SOCKS + return None return compat_urllib_request.ProxyHandler.proxy_open( self, req, proxy, type) +# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is +# released into the Public Domain +# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387 + +def long_to_bytes(n, blocksize=0): + """long_to_bytes(n:long, blocksize:int) : string + Convert a long integer to a byte string. + + If optional blocksize is given and greater than zero, pad the front of the + byte string with binary zeros so that the length is a multiple of + blocksize. 
+ """ + # after much testing, this algorithm was deemed to be the fastest + s = b'' + n = int(n) + while n > 0: + s = compat_struct_pack('>I', n & 0xffffffff) + s + n = n >> 32 + # strip off leading zeros + for i in range(len(s)): + if s[i] != b'\000'[0]: + break + else: + # only happens when n == 0 + s = b'\000' + i = 0 + s = s[i:] + # add back some pad bytes. this could be done more efficiently w.r.t. the + # de-padding being done above, but sigh... + if blocksize > 0 and len(s) % blocksize: + s = (blocksize - len(s) % blocksize) * b'\000' + s + return s + + +def bytes_to_long(s): + """bytes_to_long(string) : long + Convert a byte string to a long integer. + + This is (essentially) the inverse of long_to_bytes(). + """ + acc = 0 + length = len(s) + if length % 4: + extra = (4 - length % 4) + s = b'\000' * extra + s + length = length + extra + for i in range(0, length, 4): + acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0] + return acc + + def ohdave_rsa_encrypt(data, exponent, modulus): ''' Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/ @@ -2610,3 +3745,270 @@ payload = int(binascii.hexlify(data[::-1]), 16) encrypted = pow(payload, exponent, modulus) return '%x' % encrypted + + +def pkcs1pad(data, length): + """ + Padding input data with PKCS#1 scheme + + @param {int[]} data input data + @param {int} length target length + @returns {int[]} padded data + """ + if len(data) > length - 11: + raise ValueError('Input data too long for PKCS#1 padding') + + pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)] + return [0, 2] + pseudo_random + [0] + data + + +def encode_base_n(num, n, table=None): + FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' + if not table: + table = FULL_TABLE[:n] + + if n > len(table): + raise ValueError('base %d exceeds table length %d' % (n, len(table))) + + if num == 0: + return table[0] + + ret = '' + while num: + ret = table[num % n] + ret + num = num // n + return ret + + +def decode_packed_codes(code): + mobj = re.search(PACKED_CODES_RE, code) + obfucasted_code, base, count, symbols = mobj.groups() + base = int(base) + count = int(count) + symbols = symbols.split('|') + symbol_table = {} + + while count: + count -= 1 + base_n_count = encode_base_n(count, base) + symbol_table[base_n_count] = symbols[count] or base_n_count + + return re.sub( + r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)], + obfucasted_code) + + +def parse_m3u8_attributes(attrib): + info = {} + for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib): + if val.startswith('"'): + val = val[1:-1] + info[key] = val + return info + + +def urshift(val, n): + return val >> n if val >= 0 else (val + 0x100000000) >> n + + +# Based on png2str() written by @gdkchan and improved by @yokrysty +# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706 +def decode_png(png_data): + # Reference: https://www.w3.org/TR/PNG/ + header = png_data[8:] + + if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR': + raise IOError('Not a valid PNG file.') + + int_map = {1: '>B', 2: '>H', 4: '>I'} + unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0] + + chunks = [] + + while header: + length = unpack_integer(header[:4]) + header = header[4:] + + chunk_type = header[:4] + header = header[4:] + + chunk_data = header[:length] + header = header[length:] + + header = header[4:] # Skip CRC + + chunks.append({ + 'type': chunk_type, + 'length': length, + 
'data': chunk_data + }) + + ihdr = chunks[0]['data'] + + width = unpack_integer(ihdr[:4]) + height = unpack_integer(ihdr[4:8]) + + idat = b'' + + for chunk in chunks: + if chunk['type'] == b'IDAT': + idat += chunk['data'] + + if not idat: + raise IOError('Unable to read PNG data.') + + decompressed_data = bytearray(zlib.decompress(idat)) + + stride = width * 3 + pixels = [] + + def _get_pixel(idx): + x = idx % stride + y = idx // stride + return pixels[y][x] + + for y in range(height): + basePos = y * (1 + stride) + filter_type = decompressed_data[basePos] + + current_row = [] + + pixels.append(current_row) + + for x in range(stride): + color = decompressed_data[1 + basePos + x] + basex = y * stride + x + left = 0 + up = 0 + + if x > 2: + left = _get_pixel(basex - 3) + if y > 0: + up = _get_pixel(basex - stride) + + if filter_type == 1: # Sub + color = (color + left) & 0xff + elif filter_type == 2: # Up + color = (color + up) & 0xff + elif filter_type == 3: # Average + color = (color + ((left + up) >> 1)) & 0xff + elif filter_type == 4: # Paeth + a = left + b = up + c = 0 + + if x > 2 and y > 0: + c = _get_pixel(basex - stride - 3) + + p = a + b - c + + pa = abs(p - a) + pb = abs(p - b) + pc = abs(p - c) + + if pa <= pb and pa <= pc: + color = (color + a) & 0xff + elif pb <= pc: + color = (color + b) & 0xff + else: + color = (color + c) & 0xff + + current_row.append(color) + + return width, height, pixels + + +def write_xattr(path, key, value): + # This mess below finds the best xattr tool for the job + try: + # try the pyxattr module... + import xattr + + if hasattr(xattr, 'set'): # pyxattr + # Unicode arguments are not supported in python-pyxattr until + # version 0.5.0 + # See https://github.com/ytdl-org/youtube-dl/issues/5498 + pyxattr_required_version = '0.5.0' + if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version): + # TODO: fallback to CLI tools + raise XAttrUnavailableError( + 'python-pyxattr is detected but is too old. ' + 'youtube-dl requires %s or above while your version is %s. 
' + 'Falling back to other xattr implementations' % ( + pyxattr_required_version, xattr.__version__)) + + setxattr = xattr.set + else: # xattr + setxattr = xattr.setxattr + + try: + setxattr(path, key, value) + except EnvironmentError as e: + raise XAttrMetadataError(e.errno, e.strerror) + + except ImportError: + if compat_os_name == 'nt': + # Write xattrs to NTFS Alternate Data Streams: + # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29 + assert ':' not in key + assert os.path.exists(path) + + ads_fn = path + ':' + key + try: + with open(ads_fn, 'wb') as f: + f.write(value) + except EnvironmentError as e: + raise XAttrMetadataError(e.errno, e.strerror) + else: + user_has_setfattr = check_executable('setfattr', ['--version']) + user_has_xattr = check_executable('xattr', ['-h']) + + if user_has_setfattr or user_has_xattr: + + value = value.decode('utf-8') + if user_has_setfattr: + executable = 'setfattr' + opts = ['-n', key, '-v', value] + elif user_has_xattr: + executable = 'xattr' + opts = ['-w', key, value] + + cmd = ([encodeFilename(executable, True)] + + [encodeArgument(o) for o in opts] + + [encodeFilename(path, True)]) + + try: + p = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) + except EnvironmentError as e: + raise XAttrMetadataError(e.errno, e.strerror) + stdout, stderr = p.communicate() + stderr = stderr.decode('utf-8', 'replace') + if p.returncode != 0: + raise XAttrMetadataError(p.returncode, stderr) + + else: + # On Unix, and can't find pyxattr, setfattr, or xattr. + if sys.platform.startswith('linux'): + raise XAttrUnavailableError( + "Couldn't find a tool to set the xattrs. " + "Install either the python 'pyxattr' or 'xattr' " + "modules, or the GNU 'attr' package " + "(which contains the 'setfattr' tool).") + else: + raise XAttrUnavailableError( + "Couldn't find a tool to set the xattrs. 
" + "Install either the python 'xattr' module, " + "or the 'xattr' binary.") + + +def random_birthday(year_field, month_field, day_field): + start_date = datetime.date(1950, 1, 1) + end_date = datetime.date(1995, 12, 31) + offset = random.randint(0, (end_date - start_date).days) + random_date = start_date + datetime.timedelta(offset) + return { + year_field: str(random_date.year), + month_field: str(random_date.month), + day_field: str(random_date.day), + } diff -Nru youtube-dl-2016.02.22/youtube_dl/version.py youtube-dl-2019.05.20/youtube_dl/version.py --- youtube-dl-2016.02.22/youtube_dl/version.py 2016-02-22 10:57:28.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/version.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.02.22' +__version__ = '2019.05.20' diff -Nru youtube-dl-2016.02.22/youtube_dl/YoutubeDL.py youtube-dl-2019.05.20/youtube_dl/YoutubeDL.py --- youtube-dl-2016.02.22/youtube_dl/YoutubeDL.py 2016-02-14 22:26:17.000000000 +0000 +++ youtube-dl-2019.05.20/youtube_dl/YoutubeDL.py 2019-06-07 18:49:07.000000000 +0000 @@ -1,10 +1,11 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import absolute_import, unicode_literals import collections import contextlib +import copy import datetime import errno import fileinput @@ -23,17 +24,18 @@ import time import tokenize import traceback +import random -if os.name == 'nt': - import ctypes +from string import ascii_letters from .compat import ( compat_basestring, compat_cookiejar, - compat_expanduser, compat_get_terminal_size, compat_http_client, compat_kwargs, + compat_numeric_types, + compat_os_name, compat_str, compat_tokenize_tokenize, compat_urllib_error, @@ -41,6 +43,8 @@ compat_urllib_request_DataHandler, ) from .utils import ( + age_restricted, + args_to_str, ContentTooShortError, date_from_str, DateRange, @@ -51,42 +55,51 @@ encode_compat_str, encodeFilename, error_to_compat_str, + expand_path, ExtractorError, format_bytes, formatSeconds, + GeoRestrictedError, + int_or_none, + ISO3166Utils, locked_file, make_HTTPS_handler, MaxDownloadsReached, + orderedSet, PagedList, parse_filesize, PerRequestProxyHandler, - PostProcessingError, platform_name, + PostProcessingError, preferredencoding, + prepend_extension, + register_socks_protocols, render_table, + replace_extension, SameFileError, sanitize_filename, sanitize_path, + sanitize_url, sanitized_Request, std_headers, + str_or_none, subtitles_filename, UnavailableVideoError, url_basename, version_tuple, write_json_file, write_string, + YoutubeDLCookieJar, YoutubeDLCookieProcessor, YoutubeDLHandler, - prepend_extension, - replace_extension, - args_to_str, - age_restricted, ) from .cache import Cache -from .extractor import get_info_extractor, gen_extractors +from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER +from .extractor.openload import PhantomJSwrapper from .downloader import get_suitable_downloader from .downloader.rtmp import rtmpdump_version from .postprocessor import ( + FFmpegFixupM3u8PP, FFmpegFixupM4aPP, FFmpegFixupStretchedPP, FFmpegMergerPP, @@ -95,6 +108,9 @@ ) from .version import __version__ +if compat_os_name == 'nt': + import ctypes + class YoutubeDL(object): """YoutubeDL class. @@ -126,6 +142,9 @@ username: Username for authentication purposes. password: Password for authentication purposes. videopassword: Password for accessing a video. + ap_mso: Adobe Pass multiple-system operator identifier. 
+ ap_username: Multiple-system operator account username. + ap_password: Multiple-system operator account password. usenetrc: Use netrc for authentication instead. verbose: Print additional info to stdout. quiet: Do not print messages to stdout. @@ -151,6 +170,7 @@ playlistend: Playlist item to end at. playlist_items: Specific indices of playlist to download. playlistreverse: Download playlist items in reverse order. + playlistrandom: Download playlist items in random order. matchtitle: Download only matching titles. rejecttitle: Reject downloads for matching titles. logger: Log messages to a logging.Logger instance. @@ -192,8 +212,8 @@ prefer_insecure: Use HTTP instead of HTTPS to retrieve information. At the moment, this is only supported by YouTube. proxy: URL of the proxy server to use - cn_verification_proxy: URL of the proxy to use for IP address verification - on Chinese sites. (Experimental) + geo_verification_proxy: URL of the proxy to use for IP address verification + on geo-restricted sites. socket_timeout: Time to wait for unresponsive hosts, in seconds bidi_workaround: Work around buggy terminals without bidirectional text support, using fribidi @@ -241,10 +261,19 @@ - "warn": only emit a warning - "detect_or_warn": check whether we can do anything about it, warn otherwise (default) - source_address: (Experimental) Client-side IP address to bind to. + source_address: Client-side IP address to bind to. call_home: Boolean, true iff we are allowed to contact the youtube-dl servers for debugging. - sleep_interval: Number of seconds to sleep before each download. + sleep_interval: Number of seconds to sleep before each download when + used alone or a lower bound of a range for randomized + sleep before each download (minimum possible number + of seconds to sleep) when used along with + max_sleep_interval. + max_sleep_interval:Upper bound of a range for randomized sleep before each + download (maximum possible number of seconds to sleep). + Must only be used along with sleep_interval. + Actual sleep time will be a random float from range + [sleep_interval; max_sleep_interval]. listformats: Print an overview of available video formats and exit. list_thumbnails: Print a table of all thumbnails and exit. match_filter: A function that gets called with the info_dict of @@ -253,25 +282,56 @@ If it returns None, the video is downloaded. match_filter_func in utils.py is one example for this. no_color: Do not emit color codes in output. + geo_bypass: Bypass geographic restriction by faking the X-Forwarded-For + HTTP header + geo_bypass_country: + Two-letter ISO 3166-1 alpha-2 country code that will be used for + explicit geographic restriction bypassing by faking the + X-Forwarded-For HTTP header + geo_bypass_ip_block: + IP range in CIDR notation that will be used similarly to + geo_bypass_country The following options determine which downloader is picked: external_downloader: Executable of the external downloader to call. None or unset for standard (built-in) downloader. - hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv. + hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv + if True; use ffmpeg/avconv if False; if None, use the + downloader suggested by the extractor. 
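To make the sleep_interval/max_sleep_interval semantics documented above concrete, here is a minimal standalone Python sketch of the described behaviour; it is not the downloader's actual code, and the function name and params dict are illustrative only.

    import random
    import time

    def sleep_before_download(params):
        # sleep_interval alone means a fixed sleep; together with
        # max_sleep_interval, sleep a random float drawn from
        # [sleep_interval; max_sleep_interval].
        lower = params.get('sleep_interval')
        if lower is None:
            return
        upper = params.get('max_sleep_interval') or lower
        time.sleep(random.uniform(lower, upper))

    sleep_before_download({'sleep_interval': 2, 'max_sleep_interval': 5})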
The following parameters are not used by YoutubeDL itself; they are used by the downloader (see youtube_dl/downloader/common.py): nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test, noresizebuffer, retries, continuedl, noprogress, consoletitle, - xattr_set_filesize, external_downloader_args, hls_use_mpegts. + xattr_set_filesize, external_downloader_args, hls_use_mpegts, + http_chunk_size. The following options are used by the post processors: - prefer_ffmpeg: If True, use ffmpeg instead of avconv if both are available, - otherwise prefer avconv. + prefer_ffmpeg: If False, use avconv instead of ffmpeg if both are available, + otherwise prefer ffmpeg. + ffmpeg_location: Location of the ffmpeg/avconv binary; either the path + to the binary or its containing directory. postprocessor_args: A list of additional command-line arguments for the postprocessor. + + The following options are used by the Youtube extractor: + youtube_include_dash_manifest: If True (default), DASH manifests and related + data will be downloaded and processed by the extractor. + You can reduce network I/O by disabling it if you don't + care about DASH. """ + _NUMERIC_FIELDS = set(( + 'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx', + 'timestamp', 'upload_year', 'upload_month', 'upload_day', + 'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count', + 'average_rating', 'comment_count', 'age_limit', + 'start_time', 'end_time', + 'chapter_number', 'season_number', 'episode_number', + 'track_number', 'disc_number', 'release_year', + 'playlist_index', + )) + params = None _ies = [] _pps = [] @@ -298,6 +358,21 @@ self.params.update(params) self.cache = Cache(self) + def check_deprecated(param, option, suggestion): + if self.params.get(param) is not None: + self.report_warning( + '%s is deprecated. Use %s instead.' % (option, suggestion)) + return True + return False + + if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'): + if self.params.get('geo_verification_proxy') is None: + self.params['geo_verification_proxy'] = self.params['cn_verification_proxy'] + + check_deprecated('autonumber_size', '--autonumber-size', 'output template with %(autonumber)0Nd, where N is the number of digits') + check_deprecated('autonumber', '--auto-number', '-o "%(autonumber)s-%(title)s.%(ext)s"') + check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"') + if params.get('bidi_workaround', False): try: import pty @@ -320,15 +395,15 @@ ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs) self._output_channel = os.fdopen(master, 'rb') except OSError as ose: - if ose.errno == 2: + if ose.errno == errno.ENOENT: self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround. Make sure that fribidi is an executable file in one of the directories in your $PATH.') else: raise - if (sys.version_info >= (3,) and sys.platform != 'win32' and - sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and - not params.get('restrictfilenames', False)): - # On Python 3, the Unicode filesystem API will throw errors (#1474) + if (sys.platform != 'win32' + and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] + and not params.get('restrictfilenames', False)): + # Unicode filesystem API will throw errors (#1474, #13027) self.report_warning( 'Assuming --restrict-filenames since file system encoding ' 'cannot encode all characters. 
' @@ -356,6 +431,8 @@ for ph in self.params.get('progress_hooks', []): self.add_progress_hook(ph) + register_socks_protocols() + def warn_if_short_id(self, argv): # short YouTube ID starting with dash? idxs = [ @@ -363,9 +440,9 @@ if re.match(r'^-[0-9A-Za-z_-]{10}$', a)] if idxs: correct_argv = ( - ['youtube-dl'] + - [a for i, a in enumerate(argv) if i not in idxs] + - ['--'] + [argv[i] for i in idxs] + ['youtube-dl'] + + [a for i, a in enumerate(argv) if i not in idxs] + + ['--'] + [argv[i] for i in idxs] ) self.report_warning( 'Long argument string detected. ' @@ -375,8 +452,9 @@ def add_info_extractor(self, ie): """Add an InfoExtractor object to the end of the list.""" self._ies.append(ie) - self._ies_instances[ie.ie_key()] = ie - ie.set_downloader(self) + if not isinstance(ie, type): + self._ies_instances[ie.ie_key()] = ie + ie.set_downloader(self) def get_info_extractor(self, ie_key): """ @@ -394,7 +472,7 @@ """ Add the InfoExtractors returned by gen_extractors to the end of the list """ - for ie in gen_extractors(): + for ie in gen_extractor_classes(): self.add_info_extractor(ie) def add_post_processor(self, pp): @@ -450,24 +528,29 @@ def to_console_title(self, message): if not self.params.get('consoletitle', False): return - if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow(): - # c_wchar_p() might not be necessary if `message` is - # already of type unicode() - ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message)) + if compat_os_name == 'nt': + if ctypes.windll.kernel32.GetConsoleWindow(): + # c_wchar_p() might not be necessary if `message` is + # already of type unicode() + ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message)) elif 'TERM' in os.environ: self._write_string('\033]0;%s\007' % message, self._screen_file) def save_console_title(self): if not self.params.get('consoletitle', False): return - if 'TERM' in os.environ: + if self.params.get('simulate', False): + return + if compat_os_name != 'nt' and 'TERM' in os.environ: # Save the title on stack self._write_string('\033[22;0t', self._screen_file) def restore_console_title(self): if not self.params.get('consoletitle', False): return - if 'TERM' in os.environ: + if self.params.get('simulate', False): + return + if compat_os_name != 'nt' and 'TERM' in os.environ: # Restore the title from stack self._write_string('\033[23;0t', self._screen_file) @@ -479,7 +562,7 @@ self.restore_console_title() if self.params.get('cookiefile') is not None: - self.cookiejar.save() + self.cookiejar.save(ignore_discard=True, ignore_expires=True) def trouble(self, message=None, tb=None): """Determine action to take when a download problem appears. @@ -521,7 +604,7 @@ else: if self.params.get('no_warnings'): return - if not self.params.get('no_color') and self._err_file.isatty() and os.name != 'nt': + if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt': _msg_header = '\033[0;33mWARNING:\033[0m' else: _msg_header = 'WARNING:' @@ -533,7 +616,7 @@ Do the same as trouble, but prefixes the message with 'ERROR:', colored in red if stderr is a tty file. 
''' - if not self.params.get('no_color') and self._err_file.isatty() and os.name != 'nt': + if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt': _msg_header = '\033[0;31mERROR:\033[0m' else: _msg_header = 'ERROR:' @@ -556,30 +639,76 @@ autonumber_size = self.params.get('autonumber_size') if autonumber_size is None: autonumber_size = 5 - autonumber_templ = '%0' + str(autonumber_size) + 'd' - template_dict['autonumber'] = autonumber_templ % self._num_downloads - if template_dict.get('playlist_index') is not None: - template_dict['playlist_index'] = '%0*d' % (len(str(template_dict['n_entries'])), template_dict['playlist_index']) + template_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads if template_dict.get('resolution') is None: if template_dict.get('width') and template_dict.get('height'): template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height']) elif template_dict.get('height'): template_dict['resolution'] = '%sp' % template_dict['height'] elif template_dict.get('width'): - template_dict['resolution'] = '?x%d' % template_dict['width'] + template_dict['resolution'] = '%dx?' % template_dict['width'] sanitize = lambda k, v: sanitize_filename( compat_str(v), restricted=self.params.get('restrictfilenames'), - is_id=(k == 'id')) - template_dict = dict((k, sanitize(k, v)) + is_id=(k == 'id' or k.endswith('_id'))) + template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v)) for k, v in template_dict.items() - if v is not None) + if v is not None and not isinstance(v, (list, tuple, dict))) template_dict = collections.defaultdict(lambda: 'NA', template_dict) outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL) - tmpl = compat_expanduser(outtmpl) - filename = tmpl % template_dict + + # For fields playlist_index and autonumber convert all occurrences + # of %(field)s to %(field)0Nd for backward compatibility + field_size_compat_map = { + 'playlist_index': len(str(template_dict['n_entries'])), + 'autonumber': autonumber_size, + } + FIELD_SIZE_COMPAT_RE = r'(?<!%)%\((?P<field>autonumber|playlist_index)\)s' + mobj = re.search(FIELD_SIZE_COMPAT_RE, outtmpl) + if mobj: + outtmpl = re.sub( + FIELD_SIZE_COMPAT_RE, + r'%%(\1)0%dd' % field_size_compat_map[mobj.group('field')], + outtmpl) + + # Missing numeric fields used together with integer presentation types + # in format specification will break the argument substitution since + # string 'NA' is returned for missing fields. We will patch output + # template for missing fields to meet string presentation type. + for numeric_field in self._NUMERIC_FIELDS: + if numeric_field not in template_dict: + # As of [1] format syntax is: + # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type + # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting + FORMAT_RE = r'''(?x) + (?<!%) + % + \({0}\) # mapping key + (?:[#0\-+ ]+)? # conversion flags (optional) + (?:\d+)? # minimum field width (optional) + (?:\.\d+)? # precision (optional) + [hlL]? # length modifier (optional) + [diouxXeEfFgGcrs%] # conversion type + ''' + outtmpl = re.sub( + FORMAT_RE.format(numeric_field), + r'%({0})s'.format(numeric_field), outtmpl) + + # expand_path translates '%%' into '%' and '$$' into '$' + # correspondingly that is not what we want since we need to keep + # '%%' intact for template dict substitution step. Working around + # with boundary-alike separator hack. 
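# A worked example of the hack described above (hypothetical template and
# separator): with outtmpl '100%% of %(title)s.%(ext)s' and sep 'XYZ', the
# replace below yields '100%XYZ% of %(title)s.%(ext)s', which expand_path
# leaves untouched; stripping 'XYZ' afterwards restores the literal '100%%'
# for the '% template_dict' substitution step.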
+ sep = ''.join([random.choice(ascii_letters) for _ in range(32)]) + outtmpl = outtmpl.replace('%%', '%{0}%'.format(sep)).replace('$$', '${0}$'.format(sep)) + + # outtmpl should be expand_path'ed before template dict substitution + # because meta fields may contain env variables we don't want to + # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and + # title "Hello $PATH", we don't want `$PATH` to be expanded. + filename = expand_path(outtmpl).replace(sep, '') % template_dict + # Temporary fix for #4787 # 'Treat' all problem characters by passing filename through preferredencoding # to workaround encoding issues with subprocess on python2 @ Windows @@ -658,6 +787,7 @@ if not ie.suitable(url): continue + ie = self.get_info_extractor(ie.ie_key()) if not ie.working(): self.report_warning('The program functionality for this site has been marked as broken, ' 'and will probably not work.') @@ -677,6 +807,14 @@ return self.process_ie_result(ie_result, download, extra_info) else: return ie_result + except GeoRestrictedError as e: + msg = e.msg + if e.countries: + msg += '\nThis video is available in %s.' % ', '.join( + map(ISO3166Utils.short2full, e.countries)) + msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.' + self.report_error(msg) + break except ExtractorError as e: # An error we somewhat expected self.report_error(compat_str(e), e.format_traceback()) break @@ -710,9 +848,10 @@ result_type = ie_result.get('_type', 'video') if result_type in ('url', 'url_transparent'): + ie_result['url'] = sanitize_url(ie_result['url']) extract_flat = self.params.get('extract_flat', False) - if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or - extract_flat is True): + if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) + or extract_flat is True): if self.params.get('forcejson', False): self.to_stdout(json.dumps(ie_result)) return ie_result @@ -733,19 +872,32 @@ ie_result['url'], ie_key=ie_result.get('ie_key'), extra_info=extra_info, download=False, process=False) + # extract_info may return None when ignoreerrors is enabled and + # extraction failed with an error, don't crash and return early + # in this case + if not info: + return info + force_properties = dict( (k, v) for k, v in ie_result.items() if v is not None) - for f in ('_type', 'url', 'ie_key'): + for f in ('_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'): if f in force_properties: del force_properties[f] new_result = info.copy() new_result.update(force_properties) - assert new_result.get('_type') != 'url_transparent' + # Extracted info may not be a video result (i.e. + # info.get('_type', 'video') != video) but rather an url or + # url_transparent. In such cases outer metadata (from ie_result) + # should be propagated to inner one (info). For this to happen + # _type of info should be overridden with url_transparent. This + # fixes issue from https://github.com/ytdl-org/youtube-dl/pull/11163. 
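# A worked example of the propagation described above (hypothetical data):
# with an outer result {'_type': 'url_transparent', 'url': 'u1', 'title': 'T'}
# and an inner extraction returning {'_type': 'url', 'url': 'u2', 'id': '42'},
# new_result keeps the inner url and id, gains 'title' from the outer result,
# and is promoted to url_transparent below so the outer metadata keeps
# propagating through the next round of extraction.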
+ if new_result.get('_type') == 'url': + new_result['_type'] = 'url_transparent' return self.process_ie_result( new_result, download=download, extra_info=extra_info) - elif result_type == 'playlist' or result_type == 'multi_video': + elif result_type in ('playlist', 'multi_video'): # We process each entry in the playlist playlist = ie_result.get('title') or ie_result.get('id') self.to_screen('[download] Downloading playlist: %s' % playlist) @@ -769,15 +921,25 @@ yield int(item) else: yield int(string_segment) - playlistitems = iter_playlistitems(playlistitems_str) + playlistitems = orderedSet(iter_playlistitems(playlistitems_str)) ie_entries = ie_result['entries'] + + def make_playlistitems_entries(list_ie_entries): + num_entries = len(list_ie_entries) + return [ + list_ie_entries[i - 1] for i in playlistitems + if -num_entries <= i - 1 < num_entries] + + def report_download(num_entries): + self.to_screen( + '[%s] playlist %s: Downloading %d videos' % + (ie_result['extractor'], playlist, num_entries)) + if isinstance(ie_entries, list): n_all_entries = len(ie_entries) if playlistitems: - entries = [ - ie_entries[i - 1] for i in playlistitems - if -n_all_entries <= i - 1 < n_all_entries] + entries = make_playlistitems_entries(ie_entries) else: entries = ie_entries[playliststart:playlistend] n_entries = len(entries) @@ -795,31 +957,38 @@ entries = ie_entries.getslice( playliststart, playlistend) n_entries = len(entries) - self.to_screen( - '[%s] playlist %s: Downloading %d videos' % - (ie_result['extractor'], playlist, n_entries)) + report_download(n_entries) else: # iterable if playlistitems: - entry_list = list(ie_entries) - entries = [entry_list[i - 1] for i in playlistitems] + entries = make_playlistitems_entries(list(itertools.islice( + ie_entries, 0, max(playlistitems)))) else: entries = list(itertools.islice( ie_entries, playliststart, playlistend)) n_entries = len(entries) - self.to_screen( - '[%s] playlist %s: Downloading %d videos' % - (ie_result['extractor'], playlist, n_entries)) + report_download(n_entries) if self.params.get('playlistreverse', False): entries = entries[::-1] + if self.params.get('playlistrandom', False): + random.shuffle(entries) + + x_forwarded_for = ie_result.get('__x_forwarded_for_ip') + for i, entry in enumerate(entries, 1): self.to_screen('[download] Downloading video %s of %s' % (i, n_entries)) + # This __x_forwarded_for_ip thing is a bit ugly but requires + # minimal changes + if x_forwarded_for: + entry['__x_forwarded_for_ip'] = x_forwarded_for extra = { 'n_entries': n_entries, 'playlist': playlist, 'playlist_id': ie_result.get('id'), 'playlist_title': ie_result.get('title'), + 'playlist_uploader': ie_result.get('uploader'), + 'playlist_uploader_id': ie_result.get('uploader_id'), 'playlist_index': i + playliststart, 'extractor': ie_result['extractor'], 'webpage_url': ie_result['webpage_url'], @@ -875,7 +1044,7 @@ '!=': operator.ne, } operator_rex = re.compile(r'''(?x)\s* - (?P<key>width|height|tbr|abr|vbr|asr|filesize|fps) + (?P<key>width|height|tbr|abr|vbr|asr|filesize|filesize_approx|fps) \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s* (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?) $ @@ -897,21 +1066,24 @@ if not m: STR_OPERATORS = { '=': operator.eq, - '!=': operator.ne, '^=': lambda attr, value: attr.startswith(value), '$=': lambda attr, value: attr.endswith(value), '*=': lambda attr, value: value in attr, } str_operator_rex = re.compile(r'''(?x) - \s*(?P<key>ext|acodec|vcodec|container|protocol) - \s*(?P<op>%s)(?P<none_inclusive>\s*\?)? 
+ \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id) + \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)? \s*(?P<value>[a-zA-Z0-9._-]+) \s*$ ''' % '|'.join(map(re.escape, STR_OPERATORS.keys()))) m = str_operator_rex.search(filter_spec) if m: comparison_value = m.group('value') - op = STR_OPERATORS[m.group('op')] + str_op = STR_OPERATORS[m.group('op')] + if m.group('negation'): + op = lambda attr, value: not str_op(attr, value) + else: + op = str_op if not m: raise ValueError('Invalid filter specification %r' % filter_spec) @@ -923,6 +1095,30 @@ return op(actual_value, comparison_value) return _filter + def _default_format_spec(self, info_dict, download=True): + + def can_merge(): + merger = FFmpegMergerPP(self) + return merger.available and merger.can_merge() + + def prefer_best(): + if self.params.get('simulate', False): + return False + if not download: + return False + if self.params.get('outtmpl', DEFAULT_OUTTMPL) == '-': + return True + if info_dict.get('is_live'): + return True + if not can_merge(): + return True + return False + + req_format_list = ['bestvideo+bestaudio', 'best'] + if prefer_best(): + req_format_list.reverse() + return '/'.join(req_format_list) + def build_format_selector(self, format_spec): def syntax_error(note, start): message = ( @@ -1035,9 +1231,9 @@ if isinstance(selector, list): fs = [_build_selector_function(s) for s in selector] - def selector_function(formats): + def selector_function(ctx): for f in fs: - for format in f(formats): + for format in f(ctx): yield format return selector_function elif selector.type == GROUP: @@ -1045,17 +1241,17 @@ elif selector.type == PICKFIRST: fs = [_build_selector_function(s) for s in selector.selector] - def selector_function(formats): + def selector_function(ctx): for f in fs: - picked_formats = list(f(formats)) + picked_formats = list(f(ctx)) if picked_formats: return picked_formats return [] elif selector.type == SINGLE: format_spec = selector.selector - def selector_function(formats): - formats = list(formats) + def selector_function(ctx): + formats = list(ctx['formats']) if not formats: return if format_spec == 'all': @@ -1068,9 +1264,10 @@ if f.get('vcodec') != 'none' and f.get('acodec') != 'none'] if audiovideo_formats: yield audiovideo_formats[format_idx] - # for audio only (soundcloud) or video only (imgur) urls, select the best/worst audio format - elif (all(f.get('acodec') != 'none' for f in formats) or - all(f.get('vcodec') != 'none' for f in formats)): + # for extractors with incomplete formats (audio only (soundcloud) + # or video only (imgur)) we will fallback to best/worst + # {video,audio}-only format + elif ctx['incomplete_formats']: yield formats[format_idx] elif format_spec == 'bestaudio': audio_formats = [ @@ -1144,17 +1341,18 @@ } video_selector, audio_selector = map(_build_selector_function, selector.selector) - def selector_function(formats): - formats = list(formats) - for pair in itertools.product(video_selector(formats), audio_selector(formats)): + def selector_function(ctx): + for pair in itertools.product( + video_selector(copy.deepcopy(ctx)), audio_selector(copy.deepcopy(ctx))): yield _merge(pair) filters = [self._build_format_filter(f) for f in selector.filters] - def final_selector(formats): + def final_selector(ctx): + ctx_copy = copy.deepcopy(ctx) for _filter in filters: - formats = list(filter(_filter, formats)) - return selector_function(formats) + ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats'])) + return selector_function(ctx_copy) return final_selector 
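# Minimal sketch of how a selector built here is now invoked (hypothetical
# data; see the ctx construction further below):
#   ctx = {'formats': extracted_formats, 'incomplete_formats': False}
#   formats_to_download = list(final_selector(ctx))
# Because every branch works on copy.deepcopy(ctx), filters applied while
# picking the video half of 'bestvideo+bestaudio' cannot remove formats
# from the list seen by the audio half.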
stream = io.BytesIO(format_spec.encode('utf-8')) @@ -1197,6 +1395,11 @@ if cookies: res['Cookie'] = cookies + if 'X-Forwarded-For' not in res: + x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip') + if x_forwarded_for_ip: + res['X-Forwarded-For'] = x_forwarded_for_ip + return res def _calc_cookies(self, info_dict): @@ -1212,6 +1415,29 @@ if 'title' not in info_dict: raise ExtractorError('Missing "title" field in extractor result') + def report_force_conversion(field, field_not, conversion): + self.report_warning( + '"%s" field is not %s - forcing %s conversion, there is an error in extractor' + % (field, field_not, conversion)) + + def sanitize_string_field(info, string_field): + field = info.get(string_field) + if field is None or isinstance(field, compat_str): + return + report_force_conversion(string_field, 'a string', 'string') + info[string_field] = compat_str(field) + + def sanitize_numeric_fields(info): + for numeric_field in self._NUMERIC_FIELDS: + field = info.get(numeric_field) + if field is None or isinstance(field, compat_numeric_types): + continue + report_force_conversion(numeric_field, 'numeric', 'int') + info[numeric_field] = int_or_none(field) + + sanitize_string_field(info_dict, 'id') + sanitize_numeric_fields(info_dict) + if 'playlist' not in info_dict: # It isn't part of a playlist info_dict['playlist'] = None @@ -1224,15 +1450,25 @@ info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}] if thumbnails: thumbnails.sort(key=lambda t: ( - t.get('preference'), t.get('width'), t.get('height'), - t.get('id'), t.get('url'))) + t.get('preference') if t.get('preference') is not None else -1, + t.get('width') if t.get('width') is not None else -1, + t.get('height') if t.get('height') is not None else -1, + t.get('id') if t.get('id') is not None else '', t.get('url'))) for i, t in enumerate(thumbnails): + t['url'] = sanitize_url(t['url']) if t.get('width') and t.get('height'): t['resolution'] = '%dx%d' % (t['width'], t['height']) if t.get('id') is None: t['id'] = '%d' % i - if thumbnails and 'thumbnail' not in info_dict: + if self.params.get('list_thumbnails'): + self.list_thumbnails(info_dict) + return + + thumbnail = info_dict.get('thumbnail') + if thumbnail: + info_dict['thumbnail'] = sanitize_url(thumbnail) + elif thumbnails: info_dict['thumbnail'] = thumbnails[-1]['url'] if 'display_id' not in info_dict and 'id' in info_dict: @@ -1253,21 +1489,28 @@ if info_dict.get('%s_number' % field) is not None and not info_dict.get(field): info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field]) + for cc_kind in ('subtitles', 'automatic_captions'): + cc = info_dict.get(cc_kind) + if cc: + for _, subtitle in cc.items(): + for subtitle_format in subtitle: + if subtitle_format.get('url'): + subtitle_format['url'] = sanitize_url(subtitle_format['url']) + if subtitle_format.get('ext') is None: + subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower() + + automatic_captions = info_dict.get('automatic_captions') subtitles = info_dict.get('subtitles') - if subtitles: - for _, subtitle in subtitles.items(): - for subtitle_format in subtitle: - if 'ext' not in subtitle_format: - subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower() if self.params.get('listsubtitles', False): if 'automatic_captions' in info_dict: - self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions') + self.list_subtitles( + info_dict['id'], automatic_captions, 'automatic captions') self.list_subtitles(info_dict['id'], 
subtitles, 'subtitles') return + info_dict['requested_subtitles'] = self.process_subtitles( - info_dict['id'], subtitles, - info_dict.get('automatic_captions')) + info_dict['id'], subtitles, automatic_captions) # We now pick which formats have to be downloaded if info_dict.get('formats') is None: @@ -1279,18 +1522,32 @@ if not formats: raise ExtractorError('No video formats found!') + def is_wellformed(f): + url = f.get('url') + if not url: + self.report_warning( + '"url" field is missing or empty - skipping format, ' + 'there is an error in extractor') + return False + if isinstance(url, bytes): + sanitize_string_field(f, 'url') + return True + + # Filter out malformed formats for better extraction robustness + formats = list(filter(is_wellformed, formats)) + formats_dict = {} # We check that all the formats have the format and format_id fields for i, format in enumerate(formats): - if 'url' not in format: - raise ExtractorError('Missing "url" key in result (index %d)' % i) - - if format.get('format_id') is None: + sanitize_string_field(format, 'format_id') + sanitize_numeric_fields(format) + format['url'] = sanitize_url(format['url']) + if not format.get('format_id'): format['format_id'] = compat_str(i) else: # Sanitize format_id from characters used in format selector expression - format['format_id'] = re.sub('[\s,/+\[\]()]', '_', format['format_id']) + format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id']) format_id = format['format_id'] if format_id not in formats_dict: formats_dict[format_id] = [] @@ -1310,17 +1567,20 @@ note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '', ) # Automatically determine file extension if missing - if 'ext' not in format: + if format.get('ext') is None: format['ext'] = determine_ext(format['url']).lower() # Automatically determine protocol if missing (useful for format # selection purposes) - if 'protocol' not in format: + if format.get('protocol') is None: format['protocol'] = determine_protocol(format) # Add HTTP headers, so that external programs can use them from the # json output full_format_info = info_dict.copy() full_format_info.update(format) format['http_headers'] = self._calc_headers(full_format_info) + # Remove private housekeeping stuff + if '__x_forwarded_for_ip' in info_dict: + del info_dict['__x_forwarded_for_ip'] # TODO Central sorting goes here @@ -1333,22 +1593,42 @@ if self.params.get('listformats'): self.list_formats(info_dict) return - if self.params.get('list_thumbnails'): - self.list_thumbnails(info_dict) - return req_format = self.params.get('format') if req_format is None: - req_format_list = [] - if (self.params.get('outtmpl', DEFAULT_OUTTMPL) != '-' and - not info_dict.get('is_live')): - merger = FFmpegMergerPP(self) - if merger.available and merger.can_merge(): - req_format_list.append('bestvideo+bestaudio') - req_format_list.append('best') - req_format = '/'.join(req_format_list) + req_format = self._default_format_spec(info_dict, download=download) + if self.params.get('verbose'): + self.to_stdout('[debug] Default format spec: %s' % req_format) + format_selector = self.build_format_selector(req_format) - formats_to_download = list(format_selector(formats)) + + # While in format selection we may need to have an access to the original + # format set in order to calculate some metrics or do some processing. + # For now we need to be able to guess whether original formats provided + # by extractor are incomplete or not (i.e. 
whether extractor provides only + # video-only or audio-only formats) for proper formats selection for + # extractors with such incomplete formats (see + # https://github.com/ytdl-org/youtube-dl/pull/5556). + # Since formats may be filtered during format selection and may not match + # the original formats the results may be incorrect. Thus original formats + # or pre-calculated metrics should be passed to format selection routines + # as well. + # We will pass a context object containing all necessary additional data + # instead of just formats. + # This fixes incorrect format selection issue (see + # https://github.com/ytdl-org/youtube-dl/issues/10083). + incomplete_formats = ( + # All formats are video-only or + all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats) + # all formats are audio-only + or all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats)) + + ctx = { + 'formats': formats, + 'incomplete_formats': incomplete_formats, + } + + formats_to_download = list(format_selector(ctx)) if not formats_to_download: raise ExtractorError('requested format not available', expected=True) @@ -1471,12 +1751,17 @@ if filename is None: return - try: - dn = os.path.dirname(sanitize_path(encodeFilename(filename))) - if dn and not os.path.exists(dn): - os.makedirs(dn) - except (OSError, IOError) as err: - self.report_error('unable to create directory ' + error_to_compat_str(err)) + def ensure_dir_exists(path): + try: + dn = os.path.dirname(path) + if dn and not os.path.exists(dn): + os.makedirs(dn) + return True + except (OSError, IOError) as err: + self.report_error('unable to create directory ' + error_to_compat_str(err)) + return False + + if not ensure_dir_exists(sanitize_path(encodeFilename(filename))): return if self.params.get('writedescription', False): @@ -1519,27 +1804,30 @@ ie = self.get_info_extractor(info_dict['extractor_key']) for sub_lang, sub_info in subtitles.items(): sub_format = sub_info['ext'] - if sub_info.get('data') is not None: - sub_data = sub_info['data'] + sub_filename = subtitles_filename(filename, sub_lang, sub_format) + if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)): + self.to_screen('[info] Video subtitle %s.%s is already present' % (sub_lang, sub_format)) else: - try: - sub_data = ie._download_webpage( - sub_info['url'], info_dict['id'], note=False) - except ExtractorError as err: - self.report_warning('Unable to download subtitle for "%s": %s' % - (sub_lang, error_to_compat_str(err.cause))) - continue - try: - sub_filename = subtitles_filename(filename, sub_lang, sub_format) - if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)): - self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format)) + self.to_screen('[info] Writing video subtitles to: ' + sub_filename) + if sub_info.get('data') is not None: + try: + # Use newline='' to prevent conversion of newline characters + # See https://github.com/ytdl-org/youtube-dl/issues/10268 + with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile: + subfile.write(sub_info['data']) + except (OSError, IOError): + self.report_error('Cannot write subtitles file ' + sub_filename) + return else: - self.to_screen('[info] Writing video subtitles to: ' + sub_filename) - with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile: - subfile.write(sub_data) - except (OSError, IOError): - self.report_error('Cannot write subtitles file 
' + sub_filename) - return + try: + sub_data = ie._request_webpage( + sub_info['url'], info_dict['id'], note=False).read() + with io.open(encodeFilename(sub_filename), 'wb') as subfile: + subfile.write(sub_data) + except (ExtractorError, IOError, OSError, ValueError) as err: + self.report_warning('Unable to download subtitle for "%s": %s' % + (sub_lang, error_to_compat_str(err))) + continue if self.params.get('writeinfojson', False): infofn = replace_extension(filename, 'info.json', info_dict.get('ext')) @@ -1580,10 +1868,10 @@ def compatible_formats(formats): video, audio = formats # Check extension - video_ext, audio_ext = audio.get('ext'), video.get('ext') + video_ext, audio_ext = video.get('ext'), audio.get('ext') if video_ext and audio_ext: COMPATIBLE_EXTS = ( - ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v'), + ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma'), ('webm') ) for exts in COMPATIBLE_EXTS: @@ -1612,8 +1900,11 @@ for f in requested_formats: new_info = dict(info_dict) new_info.update(f) - fname = self.prepare_filename(new_info) - fname = prepend_extension(fname, 'f%s' % f['format_id'], new_info['ext']) + fname = prepend_extension( + self.prepare_filename(new_info), + 'f%s' % f['format_id'], new_info['ext']) + if not ensure_dir_exists(fname): + return downloaded.append(fname) partial_success = dl(fname, new_info) success = success and partial_success @@ -1623,7 +1914,7 @@ # Just a single file success = dl(filename, info_dict) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self.report_error('unable to download video data: %s' % str(err)) + self.report_error('unable to download video data: %s' % error_to_compat_str(err)) return except (OSError, IOError) as err: raise UnavailableVideoError(err) @@ -1631,12 +1922,14 @@ self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded)) return - if success: + if success and filename != '-': # Fixup content fixup_policy = self.params.get('fixup') if fixup_policy is None: fixup_policy = 'detect_or_warn' + INSTALL_FFMPEG_MESSAGE = 'Install ffmpeg or avconv to fix this automatically.' + stretched_ratio = info_dict.get('stretched_ratio') if stretched_ratio is not None and stretched_ratio != 1: if fixup_policy == 'warn': @@ -1649,15 +1942,18 @@ info_dict['__postprocessors'].append(stretched_pp) else: self.report_warning( - '%s: Non-uniform pixel ratio (%s). Install ffmpeg or avconv to fix this automatically.' % ( - info_dict['id'], stretched_ratio)) + '%s: Non-uniform pixel ratio (%s). %s' + % (info_dict['id'], stretched_ratio, INSTALL_FFMPEG_MESSAGE)) else: assert fixup_policy in ('ignore', 'never') - if info_dict.get('requested_formats') is None and info_dict.get('container') == 'm4a_dash': + if (info_dict.get('requested_formats') is None + and info_dict.get('container') == 'm4a_dash'): if fixup_policy == 'warn': - self.report_warning('%s: writing DASH m4a. Only some players support this container.' % ( - info_dict['id'])) + self.report_warning( + '%s: writing DASH m4a. ' + 'Only some players support this container.' + % info_dict['id']) elif fixup_policy == 'detect_or_warn': fixup_pp = FFmpegFixupM4aPP(self) if fixup_pp.available: @@ -1665,8 +1961,27 @@ info_dict['__postprocessors'].append(fixup_pp) else: self.report_warning( - '%s: writing DASH m4a. Only some players support this container. Install ffmpeg or avconv to fix this automatically.' % ( - info_dict['id'])) + '%s: writing DASH m4a. 
' + 'Only some players support this container. %s' + % (info_dict['id'], INSTALL_FFMPEG_MESSAGE)) + else: + assert fixup_policy in ('ignore', 'never') + + if (info_dict.get('protocol') == 'm3u8_native' + or info_dict.get('protocol') == 'm3u8' + and self.params.get('hls_prefer_native')): + if fixup_policy == 'warn': + self.report_warning('%s: malformed AAC bitstream detected.' % ( + info_dict['id'])) + elif fixup_policy == 'detect_or_warn': + fixup_pp = FFmpegFixupM3u8PP(self) + if fixup_pp.available: + info_dict.setdefault('__postprocessors', []) + info_dict['__postprocessors'].append(fixup_pp) + else: + self.report_warning( + '%s: malformed AAC bitstream detected. %s' + % (info_dict['id'], INSTALL_FFMPEG_MESSAGE)) else: assert fixup_policy in ('ignore', 'never') @@ -1680,9 +1995,10 @@ def download(self, url_list): """Download a given list of URLs.""" outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL) - if (len(url_list) > 1 and - '%' not in outtmpl and - self.params.get('max_downloads') != 1): + if (len(url_list) > 1 + and outtmpl != '-' + and '%' not in outtmpl + and self.params.get('max_downloads') != 1): raise SameFileError(outtmpl) for url in url_list: @@ -1747,15 +2063,24 @@ self.report_warning('Unable to remove downloaded original file') def _make_archive_id(self, info_dict): + video_id = info_dict.get('id') + if not video_id: + return # Future-proof against any change in case # and backwards compatibility with prior versions - extractor = info_dict.get('extractor_key') - if extractor is None: - if 'id' in info_dict: - extractor = info_dict.get('ie_key') # key in a playlist + extractor = info_dict.get('extractor_key') or info_dict.get('ie_key') # key in a playlist if extractor is None: - return None # Incomplete video information - return extractor.lower() + ' ' + info_dict['id'] + url = str_or_none(info_dict.get('url')) + if not url: + return + # Try to find matching extractor for the URL and take its ie_key + for ie in self._ies: + if ie.suitable(url): + extractor = ie.ie_key() + break + else: + return + return extractor.lower() + ' ' + video_id def in_download_archive(self, info_dict): fn = self.params.get('download_archive') @@ -1763,7 +2088,7 @@ return False vid_id = self._make_archive_id(info_dict) - if vid_id is None: + if not vid_id: return False # Incomplete video information try: @@ -1809,7 +2134,7 @@ if fdict.get('language'): if res: res += ' ' - res += '[%s]' % fdict['language'] + res += '[%s] ' % fdict['language'] if fdict.get('format_note') is not None: res += fdict['format_note'] + ' ' if fdict.get('tbr') is not None: @@ -1818,8 +2143,8 @@ if res: res += ', ' res += '%s container' % fdict['container'] - if (fdict.get('vcodec') is not None and - fdict.get('vcodec') != 'none'): + if (fdict.get('vcodec') is not None + and fdict.get('vcodec') != 'none'): if res: res += ', ' res += fdict['vcodec'] @@ -1830,7 +2155,9 @@ if fdict.get('vbr') is not None: res += '%4dk' % fdict['vbr'] if fdict.get('fps') is not None: - res += ', %sfps' % fdict['fps'] + if res: + res += ', ' + res += '%sfps' % fdict['fps'] if fdict.get('acodec') is not None: if res: res += ', ' @@ -1873,13 +2200,8 @@ def list_thumbnails(self, info_dict): thumbnails = info_dict.get('thumbnails') if not thumbnails: - tn_url = info_dict.get('thumbnail') - if tn_url: - thumbnails = [{'id': '0', 'url': tn_url}] - else: - self.to_screen( - '[info] No thumbnails present for %s' % info_dict['id']) - return + self.to_screen('[info] No thumbnails present for %s' % info_dict['id']) + return self.to_screen( '[info] 
Thumbnails for %s:' % info_dict['id']) @@ -1909,7 +2231,7 @@ return if type('') is not compat_str: - # Python 2.6 on SLES11 SP1 (https://github.com/rg3/youtube-dl/issues/3326) + # Python 2.6 on SLES11 SP1 (https://github.com/ytdl-org/youtube-dl/issues/3326) self.report_warning( 'Your Python is broken! Update to a newer and supported version') @@ -1924,6 +2246,8 @@ write_string(encoding_str, encoding=None) self._write_string('[debug] youtube-dl version ' + __version__ + '\n') + if _LAZY_LOADER: + self._write_string('[debug] Lazy loading extractors enabled' + '\n') try: sp = subprocess.Popen( ['git', 'rev-parse', '--short', 'HEAD'], @@ -1938,11 +2262,20 @@ sys.exc_clear() except Exception: pass - self._write_string('[debug] Python version %s - %s\n' % ( - platform.python_version(), platform_name())) + + def python_implementation(): + impl_name = platform.python_implementation() + if impl_name == 'PyPy' and hasattr(sys, 'pypy_version_info'): + return impl_name + ' version %d.%d.%d' % sys.pypy_version_info[:3] + return impl_name + + self._write_string('[debug] Python version %s (%s) - %s\n' % ( + platform.python_version(), python_implementation(), + platform_name())) exe_versions = FFmpegPostProcessor.get_versions(self) exe_versions['rtmpdump'] = rtmpdump_version() + exe_versions['phantomjs'] = PhantomJSwrapper._version() exe_str = ', '.join( '%s %s' % (exe, v) for exe, v in sorted(exe_versions.items()) @@ -1979,10 +2312,10 @@ if opts_cookiefile is None: self.cookiejar = compat_cookiejar.CookieJar() else: - self.cookiejar = compat_cookiejar.MozillaCookieJar( - opts_cookiefile) + opts_cookiefile = expand_path(opts_cookiefile) + self.cookiejar = YoutubeDLCookieJar(opts_cookiefile) if os.access(opts_cookiefile, os.R_OK): - self.cookiejar.load() + self.cookiejar.load(ignore_discard=True, ignore_expires=True) cookie_processor = YoutubeDLCookieProcessor(self.cookiejar) if opts_proxy is not None: @@ -1992,7 +2325,7 @@ proxies = {'http': opts_proxy, 'https': opts_proxy} else: proxies = compat_urllib_request.getproxies() - # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805) + # Set HTTPS proxy to HTTP one if given (https://github.com/ytdl-org/youtube-dl/issues/805) if 'http' in proxies and 'https' not in proxies: proxies['https'] = proxies['http'] proxy_handler = PerRequestProxyHandler(proxies) @@ -2005,7 +2338,7 @@ # When passing our own FileHandler instance, build_opener won't add the # default FileHandler and allows us to disable the file protocol, which # can be used for malicious purposes (see - # https://github.com/rg3/youtube-dl/issues/8227) + # https://github.com/ytdl-org/youtube-dl/issues/8227) file_handler = compat_urllib_request.FileHandler() def file_open(*args, **kwargs): @@ -2017,7 +2350,7 @@ # Delete the default user-agent header, which would otherwise apply in # cases where our custom HTTP handler doesn't come into play - # (See https://github.com/rg3/youtube-dl/issues/1309 for details) + # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details) opener.addheaders = [] self._opener = opener Binary files /tmp/tmpkXQ9ha/LpnaFicRQ_/youtube-dl-2016.02.22/youtube-dl and /tmp/tmpkXQ9ha/Fefc0pUly5/youtube-dl-2019.05.20/youtube-dl differ diff -Nru youtube-dl-2016.02.22/youtube-dl.1 youtube-dl-2019.05.20/youtube-dl.1 --- youtube-dl-2016.02.22/youtube-dl.1 2016-02-22 10:57:41.000000000 +0000 +++ youtube-dl-2019.05.20/youtube-dl.1 1970-01-01 00:00:00.000000000 +0000 @@ -1,2001 +0,0 @@ -.\" Automatically generated by Pandoc 1.16.0.2 -.\" -.TH 
"YOUTUBE\-DL" "1" "" "" "" -.hy -.SH NAME -.PP -youtube\-dl \- download videos from youtube.com or other video platforms -.SH SYNOPSIS -.PP -\f[B]youtube\-dl\f[] [OPTIONS] URL [URL...] -.SH DESCRIPTION -.PP -\f[B]youtube\-dl\f[] is a small command\-line program to download videos -from YouTube.com and a few more sites. -It requires the Python interpreter, version 2.6, 2.7, or 3.2+, and it is -not platform specific. -It should work on your Unix box, on Windows or on Mac OS X. -It is released to the public domain, which means you can modify it, -redistribute it or use it however you like. -.SH OPTIONS -.TP -.B \-h, \-\-help -Print this help text and exit -.RS -.RE -.TP -.B \-\-version -Print program version and exit -.RS -.RE -.TP -.B \-U, \-\-update -Update this program to latest version. -Make sure that you have sufficient permissions (run with sudo if needed) -.RS -.RE -.TP -.B \-i, \-\-ignore\-errors -Continue on download errors, for example to skip unavailable videos in a -playlist -.RS -.RE -.TP -.B \-\-abort\-on\-error -Abort downloading of further videos (in the playlist or the command -line) if an error occurs -.RS -.RE -.TP -.B \-\-dump\-user\-agent -Display the current browser identification -.RS -.RE -.TP -.B \-\-list\-extractors -List all supported extractors -.RS -.RE -.TP -.B \-\-extractor\-descriptions -Output descriptions of all supported extractors -.RS -.RE -.TP -.B \-\-force\-generic\-extractor -Force extraction to use the generic extractor -.RS -.RE -.TP -.B \-\-default\-search \f[I]PREFIX\f[] -Use this prefix for unqualified URLs. -For example "gvsearch2:" downloads two videos from google videos for -youtube\-dl "large apple". -Use the value "auto" to let youtube\-dl guess ("auto_warning" to emit a -warning when guessing). -"error" just throws an error. -The default value "fixup_error" repairs broken URLs, but emits an error -if this is not possible instead of searching. -.RS -.RE -.TP -.B \-\-ignore\-config -Do not read configuration files. -When given in the global configuration file /etc /youtube\-dl.conf: Do -not read the user configuration in ~/.config/youtube\- dl/config -(%APPDATA%/youtube\-dl/config.txt on Windows) -.RS -.RE -.TP -.B \-\-flat\-playlist -Do not extract the videos of a playlist, only list them. -.RS -.RE -.TP -.B \-\-no\-color -Do not emit color codes in output -.RS -.RE -.SS Network Options: -.TP -.B \-\-proxy \f[I]URL\f[] -Use the specified HTTP/HTTPS proxy. -Pass in an empty string (\-\-proxy "") for direct connection -.RS -.RE -.TP -.B \-\-socket\-timeout \f[I]SECONDS\f[] -Time to wait before giving up, in seconds -.RS -.RE -.TP -.B \-\-source\-address \f[I]IP\f[] -Client\-side IP address to bind to (experimental) -.RS -.RE -.TP -.B \-4, \-\-force\-ipv4 -Make all connections via IPv4 (experimental) -.RS -.RE -.TP -.B \-6, \-\-force\-ipv6 -Make all connections via IPv6 (experimental) -.RS -.RE -.TP -.B \-\-cn\-verification\-proxy \f[I]URL\f[] -Use this proxy to verify the IP address for some Chinese sites. -The default proxy specified by \-\-proxy (or none, if the options is not -present) is used for the actual downloading. -(experimental) -.RS -.RE -.SS Video Selection: -.TP -.B \-\-playlist\-start \f[I]NUMBER\f[] -Playlist video to start at (default is 1) -.RS -.RE -.TP -.B \-\-playlist\-end \f[I]NUMBER\f[] -Playlist video to end at (default is last) -.RS -.RE -.TP -.B \-\-playlist\-items \f[I]ITEM_SPEC\f[] -Playlist video items to download. 
-Specify indices of the videos in the playlist separated by commas like: -"\-\-playlist\-items 1,2,5,8" if you want to download videos indexed 1, -2, 5, 8 in the playlist. -You can specify range: "\-\-playlist\-items 1\-3,7,10\-13", it will -download the videos at index 1, 2, 3, 7, 10, 11, 12 and 13. -.RS -.RE -.TP -.B \-\-match\-title \f[I]REGEX\f[] -Download only matching titles (regex or caseless sub\-string) -.RS -.RE -.TP -.B \-\-reject\-title \f[I]REGEX\f[] -Skip download for matching titles (regex or caseless sub\-string) -.RS -.RE -.TP -.B \-\-max\-downloads \f[I]NUMBER\f[] -Abort after downloading NUMBER files -.RS -.RE -.TP -.B \-\-min\-filesize \f[I]SIZE\f[] -Do not download any videos smaller than SIZE (e.g. -50k or 44.6m) -.RS -.RE -.TP -.B \-\-max\-filesize \f[I]SIZE\f[] -Do not download any videos larger than SIZE (e.g. -50k or 44.6m) -.RS -.RE -.TP -.B \-\-date \f[I]DATE\f[] -Download only videos uploaded in this date -.RS -.RE -.TP -.B \-\-datebefore \f[I]DATE\f[] -Download only videos uploaded on or before this date (i.e. -inclusive) -.RS -.RE -.TP -.B \-\-dateafter \f[I]DATE\f[] -Download only videos uploaded on or after this date (i.e. -inclusive) -.RS -.RE -.TP -.B \-\-min\-views \f[I]COUNT\f[] -Do not download any videos with less than COUNT views -.RS -.RE -.TP -.B \-\-max\-views \f[I]COUNT\f[] -Do not download any videos with more than COUNT views -.RS -.RE -.TP -.B \-\-match\-filter \f[I]FILTER\f[] -Generic video filter (experimental). -Specify any key (see help for \-o for a list of available keys) to match -if the key is present, !key to check if the key is not present,key > -NUMBER (like "comment_count > 12", also works with >=, <, <=, !=, =) to -compare against a number, and & to require multiple matches. -Values which are not known are excluded unless you put a question mark -(?) after the operator.For example, to only match videos that have been -liked more than 100 times and disliked less than 50 times (or the -dislike functionality is not available at the given service), but who -also have a description, use \-\-match\-filter "like_count > 100 & -dislike_count <? -50 & description" . -.RS -.RE -.TP -.B \-\-no\-playlist -Download only the video, if the URL refers to a video and a playlist. -.RS -.RE -.TP -.B \-\-yes\-playlist -Download the playlist, if the URL refers to a video and a playlist. -.RS -.RE -.TP -.B \-\-age\-limit \f[I]YEARS\f[] -Download only videos suitable for the given age -.RS -.RE -.TP -.B \-\-download\-archive \f[I]FILE\f[] -Download only videos not listed in the archive file. -Record the IDs of all downloaded videos in it. -.RS -.RE -.TP -.B \-\-include\-ads -Download advertisements as well (experimental) -.RS -.RE -.SS Download Options: -.TP -.B \-r, \-\-rate\-limit \f[I]LIMIT\f[] -Maximum download rate in bytes per second (e.g. -50K or 4.2M) -.RS -.RE -.TP -.B \-R, \-\-retries \f[I]RETRIES\f[] -Number of retries (default is 10), or "infinite". -.RS -.RE -.TP -.B \-\-buffer\-size \f[I]SIZE\f[] -Size of download buffer (e.g. -1024 or 16K) (default is 1024) -.RS -.RE -.TP -.B \-\-no\-resize\-buffer -Do not automatically adjust the buffer size. -By default, the buffer size is automatically resized from an initial -value of SIZE. 
-.RS
-.RE
-.TP
-.B \-\-playlist\-reverse
-Download playlist videos in reverse order
-.RS
-.RE
-.TP
-.B \-\-xattr\-set\-filesize
-Set file xattribute ytdl.filesize with expected filesize (experimental)
-.RS
-.RE
-.TP
-.B \-\-hls\-prefer\-native
-Use the native HLS downloader instead of ffmpeg (experimental)
-.RS
-.RE
-.TP
-.B \-\-hls\-use\-mpegts
-Use the mpegts container for HLS videos, allowing you to play the video
-while downloading (some players may not be able to play it)
-.RS
-.RE
-.TP
-.B \-\-external\-downloader \f[I]COMMAND\f[]
-Use the specified external downloader.
-Currently supports aria2c, axel, curl, httpie, wget
-.RS
-.RE
-.TP
-.B \-\-external\-downloader\-args \f[I]ARGS\f[]
-Give these arguments to the external downloader
-.RS
-.RE
-.SS Filesystem Options:
-.TP
-.B \-a, \-\-batch\-file \f[I]FILE\f[]
-File containing URLs to download (\[aq]\-\[aq] for stdin)
-.RS
-.RE
-.TP
-.B \-\-id
-Use only video ID in file name
-.RS
-.RE
-.TP
-.B \-o, \-\-output \f[I]TEMPLATE\f[]
-Output filename template.
-Use %(title)s to get the title, %(uploader)s for the uploader name,
-%(uploader_id)s for the uploader nickname if different, %(autonumber)s
-to get an automatically incremented number, %(ext)s for the filename
-extension, %(format)s for the format description (like "22 \- 1280x720"
-or "HD"), %(format_id)s for the unique id of the format (like
-YouTube\[aq]s itags: "137"), %(upload_date)s for the upload date
-(YYYYMMDD), %(extractor)s for the provider (youtube, metacafe, etc),
-%(id)s for the video id, %(playlist_title)s, %(playlist_id)s, or
-%(playlist)s (=title if present, ID otherwise) for the playlist the
-video is in, %(playlist_index)s for the position in the playlist.
-%(height)s and %(width)s for the height and width of the video format.
-%(resolution)s for a textual description of the resolution of the video
-format.
-%% for a literal percent.
-Use \- to output to stdout.
-Can also be used to download to a different directory, for example with
-\-o \[aq]/my/downloads/%(uploader)s/%(title)s\-%(id)s.%(ext)s\[aq].
-.RS
-.RE
-.TP
-.B \-\-autonumber\-size \f[I]NUMBER\f[]
-Specify the number of digits in %(autonumber)s when it is present in the
-output filename template or the \-\-auto\-number option is given
-.RS
-.RE
-.TP
-.B \-\-restrict\-filenames
-Restrict filenames to only ASCII characters, and avoid "&" and spaces in
-filenames
-.RS
-.RE
-.TP
-.B \-A, \-\-auto\-number
-[deprecated; use \-o "%(autonumber)s\-%(title)s.%(ext)s" ] Number
-downloaded files starting from 00000
-.RS
-.RE
-.TP
-.B \-t, \-\-title
-[deprecated] Use title in file name (default)
-.RS
-.RE
-.TP
-.B \-l, \-\-literal
-[deprecated] Alias of \-\-title
-.RS
-.RE
-.TP
-.B \-w, \-\-no\-overwrites
-Do not overwrite files
-.RS
-.RE
-.TP
-.B \-c, \-\-continue
-Force resume of partially downloaded files.
-By default, youtube\-dl will resume downloads if possible.
-.RS -.RE -.TP -.B \-\-no\-continue -Do not resume partially downloaded files (restart from beginning) -.RS -.RE -.TP -.B \-\-no\-part -Do not use .part files \- write directly into output file -.RS -.RE -.TP -.B \-\-no\-mtime -Do not use the Last\-modified header to set the file modification time -.RS -.RE -.TP -.B \-\-write\-description -Write video description to a .description file -.RS -.RE -.TP -.B \-\-write\-info\-json -Write video metadata to a .info.json file -.RS -.RE -.TP -.B \-\-write\-annotations -Write video annotations to a .annotations.xml file -.RS -.RE -.TP -.B \-\-load\-info \f[I]FILE\f[] -JSON file containing the video information (created with the -"\-\-write\-info\-json" option) -.RS -.RE -.TP -.B \-\-cookies \f[I]FILE\f[] -File to read cookies from and dump cookie jar in -.RS -.RE -.TP -.B \-\-cache\-dir \f[I]DIR\f[] -Location in the filesystem where youtube\-dl can store some downloaded -information permanently. -By default $XDG_CACHE_HOME /youtube\-dl or ~/.cache/youtube\-dl . -At the moment, only YouTube player files (for videos with obfuscated -signatures) are cached, but that may change. -.RS -.RE -.TP -.B \-\-no\-cache\-dir -Disable filesystem caching -.RS -.RE -.TP -.B \-\-rm\-cache\-dir -Delete all filesystem cache files -.RS -.RE -.SS Thumbnail images: -.TP -.B \-\-write\-thumbnail -Write thumbnail image to disk -.RS -.RE -.TP -.B \-\-write\-all\-thumbnails -Write all thumbnail image formats to disk -.RS -.RE -.TP -.B \-\-list\-thumbnails -Simulate and list all available thumbnail formats -.RS -.RE -.SS Verbosity / Simulation Options: -.TP -.B \-q, \-\-quiet -Activate quiet mode -.RS -.RE -.TP -.B \-\-no\-warnings -Ignore warnings -.RS -.RE -.TP -.B \-s, \-\-simulate -Do not download the video and do not write anything to disk -.RS -.RE -.TP -.B \-\-skip\-download -Do not download the video -.RS -.RE -.TP -.B \-g, \-\-get\-url -Simulate, quiet but print URL -.RS -.RE -.TP -.B \-e, \-\-get\-title -Simulate, quiet but print title -.RS -.RE -.TP -.B \-\-get\-id -Simulate, quiet but print id -.RS -.RE -.TP -.B \-\-get\-thumbnail -Simulate, quiet but print thumbnail URL -.RS -.RE -.TP -.B \-\-get\-description -Simulate, quiet but print video description -.RS -.RE -.TP -.B \-\-get\-duration -Simulate, quiet but print video length -.RS -.RE -.TP -.B \-\-get\-filename -Simulate, quiet but print output filename -.RS -.RE -.TP -.B \-\-get\-format -Simulate, quiet but print output format -.RS -.RE -.TP -.B \-j, \-\-dump\-json -Simulate, quiet but print JSON information. -See \-\-output for a description of available keys. -.RS -.RE -.TP -.B \-J, \-\-dump\-single\-json -Simulate, quiet but print JSON information for each command\-line -argument. -If the URL refers to a playlist, dump the whole playlist information in -a single line. -.RS -.RE -.TP -.B \-\-print\-json -Be quiet and print the video information as JSON (video is still being -downloaded). 
-.RS -.RE -.TP -.B \-\-newline -Output progress bar as new lines -.RS -.RE -.TP -.B \-\-no\-progress -Do not print progress bar -.RS -.RE -.TP -.B \-\-console\-title -Display progress in console titlebar -.RS -.RE -.TP -.B \-v, \-\-verbose -Print various debugging information -.RS -.RE -.TP -.B \-\-dump\-pages -Print downloaded pages encoded using base64 to debug problems (very -verbose) -.RS -.RE -.TP -.B \-\-write\-pages -Write downloaded intermediary pages to files in the current directory to -debug problems -.RS -.RE -.TP -.B \-\-print\-traffic -Display sent and read HTTP traffic -.RS -.RE -.TP -.B \-C, \-\-call\-home -Contact the youtube\-dl server for debugging -.RS -.RE -.TP -.B \-\-no\-call\-home -Do NOT contact the youtube\-dl server for debugging -.RS -.RE -.SS Workarounds: -.TP -.B \-\-encoding \f[I]ENCODING\f[] -Force the specified encoding (experimental) -.RS -.RE -.TP -.B \-\-no\-check\-certificate -Suppress HTTPS certificate validation -.RS -.RE -.TP -.B \-\-prefer\-insecure -Use an unencrypted connection to retrieve information about the video. -(Currently supported only for YouTube) -.RS -.RE -.TP -.B \-\-user\-agent \f[I]UA\f[] -Specify a custom user agent -.RS -.RE -.TP -.B \-\-referer \f[I]URL\f[] -Specify a custom referer, use if the video access is restricted to one -domain -.RS -.RE -.TP -.B \-\-add\-header \f[I]FIELD:VALUE\f[] -Specify a custom HTTP header and its value, separated by a colon -\[aq]:\[aq]. -You can use this option multiple times -.RS -.RE -.TP -.B \-\-bidi\-workaround -Work around terminals that lack bidirectional text support. -Requires bidiv or fribidi executable in PATH -.RS -.RE -.TP -.B \-\-sleep\-interval \f[I]SECONDS\f[] -Number of seconds to sleep before each download. -.RS -.RE -.SS Video Format Options: -.TP -.B \-f, \-\-format \f[I]FORMAT\f[] -Video format code, see the "FORMAT SELECTION" for all the info -.RS -.RE -.TP -.B \-\-all\-formats -Download all available video formats -.RS -.RE -.TP -.B \-\-prefer\-free\-formats -Prefer free video formats unless a specific one is requested -.RS -.RE -.TP -.B \-F, \-\-list\-formats -List all available formats of requested videos -.RS -.RE -.TP -.B \-\-youtube\-skip\-dash\-manifest -Do not download the DASH manifests and related data on YouTube videos -.RS -.RE -.TP -.B \-\-merge\-output\-format \f[I]FORMAT\f[] -If a merge is required (e.g. -bestvideo+bestaudio), output to given container format. -One of mkv, mp4, ogg, webm, flv. -Ignored if no merge is required -.RS -.RE -.SS Subtitle Options: -.TP -.B \-\-write\-sub -Write subtitle file -.RS -.RE -.TP -.B \-\-write\-auto\-sub -Write automatically generated subtitle file (YouTube only) -.RS -.RE -.TP -.B \-\-all\-subs -Download all the available subtitles of the video -.RS -.RE -.TP -.B \-\-list\-subs -List all available subtitles for the video -.RS -.RE -.TP -.B \-\-sub\-format \f[I]FORMAT\f[] -Subtitle format, accepts formats preference, for example: "srt" or -"ass/srt/best" -.RS -.RE -.TP -.B \-\-sub\-lang \f[I]LANGS\f[] -Languages of the subtitles to download (optional) separated by commas, -use \-\-list\- subs for available language tags -.RS -.RE -.SS Authentication Options: -.TP -.B \-u, \-\-username \f[I]USERNAME\f[] -Login with this account ID -.RS -.RE -.TP -.B \-p, \-\-password \f[I]PASSWORD\f[] -Account password. -If this option is left out, youtube\-dl will ask interactively. 
-.RS -.RE -.TP -.B \-2, \-\-twofactor \f[I]TWOFACTOR\f[] -Two\-factor auth code -.RS -.RE -.TP -.B \-n, \-\-netrc -Use .netrc authentication data -.RS -.RE -.TP -.B \-\-video\-password \f[I]PASSWORD\f[] -Video password (vimeo, smotri, youku) -.RS -.RE -.SS Post\-processing Options: -.TP -.B \-x, \-\-extract\-audio -Convert video files to audio\-only files (requires ffmpeg or avconv and -ffprobe or avprobe) -.RS -.RE -.TP -.B \-\-audio\-format \f[I]FORMAT\f[] -Specify audio format: "best", "aac", "vorbis", "mp3", "m4a", "opus", or -"wav"; "best" by default -.RS -.RE -.TP -.B \-\-audio\-quality \f[I]QUALITY\f[] -Specify ffmpeg/avconv audio quality, insert a value between 0 (better) -and 9 (worse) for VBR or a specific bitrate like 128K (default 5) -.RS -.RE -.TP -.B \-\-recode\-video \f[I]FORMAT\f[] -Encode the video to another format if necessary (currently supported: -mp4|flv|ogg|webm|mkv|avi) -.RS -.RE -.TP -.B \-\-postprocessor\-args \f[I]ARGS\f[] -Give these arguments to the postprocessor -.RS -.RE -.TP -.B \-k, \-\-keep\-video -Keep the video file on disk after the post\- processing; the video is -erased by default -.RS -.RE -.TP -.B \-\-no\-post\-overwrites -Do not overwrite post\-processed files; the post\-processed files are -overwritten by default -.RS -.RE -.TP -.B \-\-embed\-subs -Embed subtitles in the video (only for mkv and mp4 videos) -.RS -.RE -.TP -.B \-\-embed\-thumbnail -Embed thumbnail in the audio as cover art -.RS -.RE -.TP -.B \-\-add\-metadata -Write metadata to the video file -.RS -.RE -.TP -.B \-\-metadata\-from\-title \f[I]FORMAT\f[] -Parse additional metadata like song title / artist from the video title. -The format syntax is the same as \-\-output, the parsed parameters -replace existing values. -Additional templates: %(album)s, %(artist)s. -Example: \-\-metadata\-from\-title "%(artist)s \- %(title)s" matches a -title like "Coldplay \- Paradise" -.RS -.RE -.TP -.B \-\-xattrs -Write metadata to the video file\[aq]s xattrs (using dublin core and xdg -standards) -.RS -.RE -.TP -.B \-\-fixup \f[I]POLICY\f[] -Automatically correct known faults of the file. -One of never (do nothing), warn (only emit a warning), detect_or_warn -(the default; fix file if we can, warn otherwise) -.RS -.RE -.TP -.B \-\-prefer\-avconv -Prefer avconv over ffmpeg for running the postprocessors (default) -.RS -.RE -.TP -.B \-\-prefer\-ffmpeg -Prefer ffmpeg over avconv for running the postprocessors -.RS -.RE -.TP -.B \-\-ffmpeg\-location \f[I]PATH\f[] -Location of the ffmpeg/avconv binary; either the path to the binary or -its containing directory. -.RS -.RE -.TP -.B \-\-exec \f[I]CMD\f[] -Execute a command on the file after downloading, similar to find\[aq]s -\-exec syntax. -Example: \-\-exec \[aq]adb push {} /sdcard/Music/ && rm {}\[aq] -.RS -.RE -.TP -.B \-\-convert\-subs \f[I]FORMAT\f[] -Convert the subtitles to other format (currently supported: srt|ass|vtt) -.RS -.RE -.SH CONFIGURATION -.PP -You can configure youtube\-dl by placing any supported command line -option to a configuration file. -On Linux, the system wide configuration file is located at -\f[C]/etc/youtube\-dl.conf\f[] and the user wide configuration file at -\f[C]~/.config/youtube\-dl/config\f[]. -On Windows, the user wide configuration file locations are -\f[C]%APPDATA%\\youtube\-dl\\config.txt\f[] or -\f[C]C:\\Users\\<user\ name>\\youtube\-dl.conf\f[]. 
-For example, with the following configuration file youtube\-dl will
-always extract the audio, not copy the mtime, and use a proxy:
-.IP
-.nf
-\f[C]
-\-\-extract\-audio
-\-\-no\-mtime
-\-\-proxy\ 127.0.0.1:3128
-\f[]
-.fi
-.PP
-You can use \f[C]\-\-ignore\-config\f[] if you want to disable the
-configuration file for a particular youtube\-dl run.
-.SS Authentication with \f[C]\&.netrc\f[] file
-.PP
-You may also want to configure automatic credentials storage for
-extractors that support authentication (by providing login and password
-with \f[C]\-\-username\f[] and \f[C]\-\-password\f[]) in order not to
-pass credentials as command line arguments on every youtube\-dl
-execution and prevent tracking plain text passwords in the shell command
-history.
-You can achieve this using a \f[C]\&.netrc\f[]
-file (http://stackoverflow.com/tags/.netrc/info) on a per\-extractor
-basis.
-For that you will need to create a \f[C]\&.netrc\f[] file in your
-\f[C]$HOME\f[] and restrict permissions to read/write by you only:
-.IP
-.nf
-\f[C]
-touch\ $HOME/.netrc
-chmod\ a\-rwx,u+rw\ $HOME/.netrc
-\f[]
-.fi
-.PP
-After that you can add credentials for an extractor in the following
-format, where \f[I]extractor\f[] is the name of the extractor in
-lowercase:
-.IP
-.nf
-\f[C]
-machine\ <extractor>\ login\ <login>\ password\ <password>
-\f[]
-.fi
-.PP
-For example:
-.IP
-.nf
-\f[C]
-machine\ youtube\ login\ myaccount\@gmail.com\ password\ my_youtube_password
-machine\ twitch\ login\ my_twitch_account_name\ password\ my_twitch_password
-\f[]
-.fi
-.PP
-To activate authentication with the \f[C]\&.netrc\f[] file you should
-pass \f[C]\-\-netrc\f[] to youtube\-dl or place it in the configuration
-file (#configuration).
-.PP
-On Windows you may also need to set up the \f[C]%HOME%\f[] environment
-variable manually.
-.SH OUTPUT TEMPLATE
-.PP
-The \f[C]\-o\f[] option allows users to indicate a template for the
-output file names.
-The basic usage is not to set any template arguments when downloading a
-single file, like in
-\f[C]youtube\-dl\ \-o\ funny_video.flv\ "http://some/video"\f[].
-However, it may contain special sequences that will be replaced when
-downloading each video.
-The special sequences have the format \f[C]%(NAME)s\f[].
-To clarify, that is a percent symbol followed by a name in parentheses,
-followed by a lowercase S.
-Allowed names are:
-.IP \[bu] 2
-\f[C]id\f[]: Video identifier
-.IP \[bu] 2
-\f[C]title\f[]: Video title
-.IP \[bu] 2
-\f[C]url\f[]: Video URL
-.IP \[bu] 2
-\f[C]ext\f[]: Video filename extension
-.IP \[bu] 2
-\f[C]alt_title\f[]: A secondary title of the video
-.IP \[bu] 2
-\f[C]display_id\f[]: An alternative identifier for the video
-.IP \[bu] 2
-\f[C]uploader\f[]: Full name of the video uploader
-.IP \[bu] 2
-\f[C]creator\f[]: The main artist who created the video
-.IP \[bu] 2
-\f[C]release_date\f[]: The date (YYYYMMDD) when the video was released
-.IP \[bu] 2
-\f[C]timestamp\f[]: UNIX timestamp of the moment the video became
-available
-.IP \[bu] 2
-\f[C]upload_date\f[]: Video upload date (YYYYMMDD)
-.IP \[bu] 2
-\f[C]uploader_id\f[]: Nickname or id of the video uploader
-.IP \[bu] 2
-\f[C]location\f[]: Physical location where the video was filmed
-.IP \[bu] 2
-\f[C]duration\f[]: Length of the video in seconds
-.IP \[bu] 2
-\f[C]view_count\f[]: How many users have watched the video on the
-platform
-.IP \[bu] 2
-\f[C]like_count\f[]: Number of positive ratings of the video
-.IP \[bu] 2
-\f[C]dislike_count\f[]: Number of negative ratings of the video
-.IP \[bu] 2
-\f[C]repost_count\f[]: Number of reposts of the video
-.IP \[bu] 2
-\f[C]average_rating\f[]: Average rating given by users; the scale used
-depends on the webpage
-.IP \[bu] 2
-\f[C]comment_count\f[]: Number of comments on the video
-.IP \[bu] 2
-\f[C]age_limit\f[]: Age restriction for the video (years)
-.IP \[bu] 2
-\f[C]format\f[]: A human\-readable description of the format
-.IP \[bu] 2
-\f[C]format_id\f[]: Format code specified by \f[C]\-\-format\f[]
-.IP \[bu] 2
-\f[C]format_note\f[]: Additional info about the format
-.IP \[bu] 2
-\f[C]width\f[]: Width of the video
-.IP \[bu] 2
-\f[C]height\f[]: Height of the video
-.IP \[bu] 2
-\f[C]resolution\f[]: Textual description of width and height
-.IP \[bu] 2
-\f[C]tbr\f[]: Average bitrate of audio and video in KBit/s
-.IP \[bu] 2
-\f[C]abr\f[]: Average audio bitrate in KBit/s
-.IP \[bu] 2
-\f[C]acodec\f[]: Name of the audio codec in use
-.IP \[bu] 2
-\f[C]asr\f[]: Audio sampling rate in Hertz
-.IP \[bu] 2
-\f[C]vbr\f[]: Average video bitrate in KBit/s
-.IP \[bu] 2
-\f[C]fps\f[]: Frame rate
-.IP \[bu] 2
-\f[C]vcodec\f[]: Name of the video codec in use
-.IP \[bu] 2
-\f[C]container\f[]: Name of the container format
-.IP \[bu] 2
-\f[C]filesize\f[]: The number of bytes, if known in advance
-.IP \[bu] 2
-\f[C]filesize_approx\f[]: An estimate for the number of bytes
-.IP \[bu] 2
-\f[C]protocol\f[]: The protocol that will be used for the actual
-download
-.IP \[bu] 2
-\f[C]extractor\f[]: Name of the extractor
-.IP \[bu] 2
-\f[C]extractor_key\f[]: Key name of the extractor
-.IP \[bu] 2
-\f[C]epoch\f[]: Unix epoch when creating the file
-.IP \[bu] 2
-\f[C]autonumber\f[]: Five\-digit number that will be increased with each
-download, starting at zero
-.IP \[bu] 2
-\f[C]playlist\f[]: Name or id of the playlist that contains the video
-.IP \[bu] 2
-\f[C]playlist_index\f[]: Index of the video in the playlist padded with
-leading zeros according to the total length of the playlist
-.PP
-Available for the video that belongs to some logical chapter or section:
-.IP \[bu] 2
-\f[C]chapter\f[]: Name or title of the chapter the video belongs to
-.IP \[bu] 2
-\f[C]chapter_number\f[]: Number of the chapter the video belongs to
-.IP \[bu] 2
-\f[C]chapter_id\f[]: Id of the chapter the video belongs to
-.PP
-Available for the video that is an episode of some series or programme:
-.IP \[bu] 2
-\f[C]series\f[]: Title of the series or programme the video episode
-belongs to
-.IP \[bu] 2
-\f[C]season\f[]: Title of the season the video episode belongs to
-.IP \[bu] 2
-\f[C]season_number\f[]: Number of the season the video episode belongs
-to
-.IP \[bu] 2
-\f[C]season_id\f[]: Id of the season the video episode belongs to
-.IP \[bu] 2
-\f[C]episode\f[]: Title of the video episode
-.IP \[bu] 2
-\f[C]episode_number\f[]: Number of the video episode within a season
-.IP \[bu] 2
-\f[C]episode_id\f[]: Id of the video episode
-.PP
-Each aforementioned sequence when referenced in the output template will
-be replaced by the actual value corresponding to the sequence name.
-Note that some of the sequences are not guaranteed to be present since
-they depend on the metadata obtained by a particular extractor; such
-sequences will be replaced with \f[C]NA\f[].
-.PP
-For example, for \f[C]\-o\ %(title)s\-%(id)s.%(ext)s\f[] and an mp4
-video with title \f[C]youtube\-dl\ test\ video\f[] and id
-\f[C]BaW_jenozKcj\f[] this will result in a
-\f[C]youtube\-dl\ test\ video\-BaW_jenozKcj.mp4\f[] file created in the
-current directory.
-.PP
-The output template can also contain an arbitrary hierarchical path,
-e.g.
-\f[C]\-o\ \[aq]%(playlist)s/%(playlist_index)s\ \-\ %(title)s.%(ext)s\[aq]\f[]
-that will result in downloading each video into a directory
-corresponding to this path template.
-Any missing directory will be automatically created for you.
-.PP
-To specify a percent literal in the output template use \f[C]%%\f[].
-To output to stdout use \f[C]\-o\ \-\f[].
-.PP
-The current default template is \f[C]%(title)s\-%(id)s.%(ext)s\f[].
-.PP
-In some cases, you don\[aq]t want special characters such as 中, spaces,
-or &, such as when transferring the downloaded file to a Windows system
-or the filename through an 8bit\-unsafe channel.
-In these cases, add the \f[C]\-\-restrict\-filenames\f[] flag to get a
-shorter title:
-.PP
-Examples (note that on Windows you may need to use double quotes instead
-of single):
-.IP
-.nf
-\f[C]
-$\ youtube\-dl\ \-\-get\-filename\ \-o\ \[aq]%(title)s.%(ext)s\[aq]\ BaW_jenozKc
-youtube\-dl\ test\ video\ \[aq]\[aq]_ä↭𝕐.mp4\ \ \ \ #\ All\ kinds\ of\ weird\ characters
-
-$\ youtube\-dl\ \-\-get\-filename\ \-o\ \[aq]%(title)s.%(ext)s\[aq]\ BaW_jenozKc\ \-\-restrict\-filenames
-youtube\-dl_test_video_.mp4\ \ \ \ \ \ \ \ \ \ #\ A\ simple\ file\ name
-
-#\ Download\ YouTube\ playlist\ videos\ in\ separate\ directory\ indexed\ by\ video\ order\ in\ a\ playlist
-$\ youtube\-dl\ \-o\ \[aq]%(playlist)s/%(playlist_index)s\ \-\ %(title)s.%(ext)s\[aq]\ https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re
-
-#\ Download\ Udemy\ course\ keeping\ each\ chapter\ in\ separate\ directory\ under\ MyVideos\ directory\ in\ your\ home
-$\ youtube\-dl\ \-u\ user\ \-p\ password\ \-o\ \[aq]~/MyVideos/%(playlist)s/%(chapter_number)s\ \-\ %(chapter)s/%(title)s.%(ext)s\[aq]\ https://www.udemy.com/java\-tutorial/
-
-#\ Download\ entire\ series\ season\ keeping\ each\ series\ and\ each\ season\ in\ separate\ directory\ under\ C:/MyVideos
-$\ youtube\-dl\ \-o\ "C:/MyVideos/%(series)s/%(season_number)s\ \-\ %(season)s/%(episode_number)s\ \-\ %(episode)s.%(ext)s"\ http://videomore.ru/kino_v_detalayah/5_sezon/367617
-
-#\ Stream\ the\ video\ being\ downloaded\ to\ stdout
-$\ youtube\-dl\ \-o\ \-\ BaW_jenozKc
-\f[]
-.fi
-.SH FORMAT SELECTION
-.PP
-By default youtube\-dl tries to download the best available quality,
-i.e.
-if you want the best quality you \f[B]don\[aq]t need\f[] to pass any
-special options; youtube\-dl will guess it for you by \f[B]default\f[].
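-.PP
-For instance, downloading a video at the best available quality takes no
-options at all (using the same test video as the examples above):
-.IP
-.nf
-\f[C]
-#\ Best\ available\ quality,\ no\ format\ options\ needed
-$\ youtube\-dl\ \[aq]https://www.youtube.com/watch?v=BaW_jenozKc\[aq]
-\f[]
-.fi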
-.PP
-But sometimes you may want to download in a different format, for
-example when you are on a slow or intermittent connection.
-The key mechanism for achieving this is so\-called \f[I]format
-selection\f[], with which you can explicitly specify the desired format,
-select formats based on some criterion or criteria, set up precedence
-and much more.
-.PP
-The general syntax for format selection is \f[C]\-\-format\ FORMAT\f[]
-or, shorter, \f[C]\-f\ FORMAT\f[] where \f[C]FORMAT\f[] is a
-\f[I]selector expression\f[], i.e.
-an expression that describes the format or formats you would like to
-download.
-.PP
-The simplest case is requesting a specific format, for example with
-\f[C]\-f\ 22\f[] you can download the format with format code equal to
-22.
-You can get the list of available format codes for a particular video
-using \f[C]\-\-list\-formats\f[] or \f[C]\-F\f[].
-Note that these format codes are extractor specific.
-.PP
-You can also use a file extension (currently \f[C]3gp\f[], \f[C]aac\f[],
-\f[C]flv\f[], \f[C]m4a\f[], \f[C]mp3\f[], \f[C]mp4\f[], \f[C]ogg\f[],
-\f[C]wav\f[], \f[C]webm\f[] are supported) to download the best quality
-format of a particular file extension served as a single file, e.g.
-\f[C]\-f\ webm\f[] will download the best quality format with the
-\f[C]webm\f[] extension served as a single file.
-.PP
-You can also use special names to select particular edge case formats:
-.IP \[bu] 2
-\f[C]best\f[]: Select the best quality format represented by a single
-file with video and audio
-.IP \[bu] 2
-\f[C]worst\f[]: Select the worst quality format represented by a single
-file with video and audio
-.IP \[bu] 2
-\f[C]bestvideo\f[]: Select the best quality video\-only format (e.g.
-DASH video), may not be available
-.IP \[bu] 2
-\f[C]worstvideo\f[]: Select the worst quality video\-only format, may
-not be available
-.IP \[bu] 2
-\f[C]bestaudio\f[]: Select the best quality audio\-only format, may not
-be available
-.IP \[bu] 2
-\f[C]worstaudio\f[]: Select the worst quality audio\-only format, may
-not be available
-.PP
-For example, to download the worst quality video\-only format you can
-use \f[C]\-f\ worstvideo\f[].
-.PP
-If you want to download multiple videos and they don\[aq]t have the same
-formats available, you can specify the order of preference using
-slashes.
-Note that the slash is left\-associative, i.e.
-formats on the left hand side are preferred, for example
-\f[C]\-f\ 22/17/18\f[] will download format 22 if it\[aq]s available,
-otherwise it will download format 17 if it\[aq]s available, otherwise it
-will download format 18 if it\[aq]s available, otherwise it will
-complain that no suitable formats are available for download.
-.PP
-If you want to download several formats of the same video use a comma as
-the separator, e.g.
-\f[C]\-f\ 22,17,18\f[] will download all these three formats, of course
-if they are available.
-Or a more sophisticated example combined with the precedence feature:
-\f[C]\-f\ 136/137/mp4/bestvideo,140/m4a/bestaudio\f[].
-.PP
-You can also filter the video formats by putting a condition in
-brackets, as in \f[C]\-f\ "best[height=720]"\f[] (or
-\f[C]\-f\ "[filesize>10M]"\f[]).
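-.PP
-A complete command line using such a filter could look like this (the
-URL is the usual test video):
-.IP
-.nf
-\f[C]
-#\ Pick\ a\ single\-file\ format\ that\ is\ exactly\ 720\ pixels\ high
-$\ youtube\-dl\ \-f\ \[aq]best[height=720]\[aq]\ \[aq]https://www.youtube.com/watch?v=BaW_jenozKc\[aq]
-\f[]
-.fi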
-.PP
-The following numeric meta fields can be used with comparisons
-\f[C]<\f[], \f[C]<=\f[], \f[C]>\f[], \f[C]>=\f[], \f[C]=\f[] (equals),
-\f[C]!=\f[] (not equals):
-.IP \[bu] 2
-\f[C]filesize\f[]: The number of bytes, if known in advance
-.IP \[bu] 2
-\f[C]width\f[]: Width of the video, if known
-.IP \[bu] 2
-\f[C]height\f[]: Height of the video, if known
-.IP \[bu] 2
-\f[C]tbr\f[]: Average bitrate of audio and video in KBit/s
-.IP \[bu] 2
-\f[C]abr\f[]: Average audio bitrate in KBit/s
-.IP \[bu] 2
-\f[C]vbr\f[]: Average video bitrate in KBit/s
-.IP \[bu] 2
-\f[C]asr\f[]: Audio sampling rate in Hertz
-.IP \[bu] 2
-\f[C]fps\f[]: Frame rate
-.PP
-Filtering also works for the comparisons \f[C]=\f[] (equals),
-\f[C]!=\f[] (not equals), \f[C]^=\f[] (begins with), \f[C]$=\f[] (ends
-with), \f[C]*=\f[] (contains) with the following string meta fields:
-.IP \[bu] 2
-\f[C]ext\f[]: File extension
-.IP \[bu] 2
-\f[C]acodec\f[]: Name of the audio codec in use
-.IP \[bu] 2
-\f[C]vcodec\f[]: Name of the video codec in use
-.IP \[bu] 2
-\f[C]container\f[]: Name of the container format
-.IP \[bu] 2
-\f[C]protocol\f[]: The protocol that will be used for the actual
-download, lower\-case: \f[C]http\f[], \f[C]https\f[], \f[C]rtsp\f[],
-\f[C]rtmp\f[], \f[C]rtmpe\f[], \f[C]m3u8\f[], or \f[C]m3u8_native\f[]
-.PP
-Note that none of the aforementioned meta fields are guaranteed to be
-present since this solely depends on the metadata obtained by a
-particular extractor, i.e.
-the metadata offered by the video hoster.
-.PP
-Formats for which the value is not known are excluded unless you put a
-question mark (\f[C]?\f[]) after the operator.
-You can combine format filters, so
-\f[C]\-f\ "[height\ <=?\ 720][tbr>500]"\f[] selects up to 720p videos
-(or videos where the height is not known) with a bitrate of at least 500
-KBit/s.
-.PP
-You can merge the video and audio of two formats into a single file
-using \f[C]\-f\ <video\-format>+<audio\-format>\f[] (requires ffmpeg or
-avconv installed), for example \f[C]\-f\ bestvideo+bestaudio\f[] will
-download the best video\-only format and the best audio\-only format and
-mux them together with ffmpeg/avconv.
-.PP
-Format selectors can also be grouped using parentheses, for example if
-you want to download the best mp4 and webm formats with a height lower
-than 480 you can use \f[C]\-f\ \[aq](mp4,webm)[height<480]\[aq]\f[].
-.PP
-Since the end of April 2015 and version 2015.04.26 youtube\-dl uses
-\f[C]\-f\ bestvideo+bestaudio/best\f[] as the default format selection
-(see #5447, #5456).
-If ffmpeg or avconv are installed, this results in downloading
-\f[C]bestvideo\f[] and \f[C]bestaudio\f[] separately and muxing them
-together into a single file, giving the best overall quality available.
-Otherwise it falls back to \f[C]best\f[] and results in downloading the
-best available quality served as a single file.
-\f[C]best\f[] is also needed for videos that don\[aq]t come from YouTube
-because they don\[aq]t provide the audio and video in two different
-files.
-If you want to only download some DASH formats (for example if you are
-not interested in getting videos with a resolution higher than 1080p),
-you can add \f[C]\-f\ bestvideo[height<=?1080]+bestaudio/best\f[] to
-your configuration file.
-Note that if you use youtube\-dl to stream to \f[C]stdout\f[] (and most
-likely to pipe it to your media player then), i.e.
-you explicitly specify the output template as \f[C]\-o\ \-\f[],
-youtube\-dl still uses \f[C]\-f\ best\f[] format selection in order to
-start content delivery immediately to your player and not to wait until
-\f[C]bestvideo\f[] and \f[C]bestaudio\f[] are downloaded and muxed.
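-.PP
-Putting the above together, a typical selector that merges height\-capped
-video with the best audio and falls back to a single file might look
-like this (a sketch; adjust the cap to taste):
-.IP
-.nf
-\f[C]
-#\ Best\ video\ up\ to\ 720p\ (or\ unknown\ height)\ plus\ best\ audio,
-#\ falling\ back\ to\ the\ best\ single\ file\ if\ muxing\ is\ unavailable
-$\ youtube\-dl\ \-f\ \[aq]bestvideo[height<=?720]+bestaudio/best\[aq]\ \[aq]https://www.youtube.com/watch?v=BaW_jenozKc\[aq]
-\f[]
-.fi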
-.PP
-If you want to preserve the old format selection behavior (prior to
-youtube\-dl 2015.04.26), i.e.
-you want to download the best available quality media served as a single
-file, you should explicitly specify your choice with \f[C]\-f\ best\f[].
-You may want to add it to the configuration file (#configuration) in
-order not to type it every time you run youtube\-dl.
-.PP
-Examples (note that on Windows you may need to use double quotes instead
-of single):
-.IP
-.nf
-\f[C]
-#\ Download\ best\ mp4\ format\ available\ or\ any\ other\ best\ if\ no\ mp4\ available
-$\ youtube\-dl\ \-f\ \[aq]bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best\[aq]
-
-#\ Download\ best\ format\ available\ but\ not\ better\ than\ 480p
-$\ youtube\-dl\ \-f\ \[aq]bestvideo[height<=480]+bestaudio/best[height<=480]\[aq]
-
-#\ Download\ best\ video\ only\ format\ but\ no\ bigger\ than\ 50\ MB
-$\ youtube\-dl\ \-f\ \[aq]best[filesize<50M]\[aq]
-
-#\ Download\ best\ format\ available\ via\ direct\ link\ over\ HTTP/HTTPS\ protocol
-$\ youtube\-dl\ \-f\ \[aq](bestvideo+bestaudio/best)[protocol^=http]\[aq]
-\f[]
-.fi
-.SH VIDEO SELECTION
-.PP
-Videos can be filtered by their upload date using the options
-\f[C]\-\-date\f[], \f[C]\-\-datebefore\f[] or \f[C]\-\-dateafter\f[].
-They accept dates in two formats:
-.IP \[bu] 2
-Absolute dates: Dates in the format \f[C]YYYYMMDD\f[].
-.IP \[bu] 2
-Relative dates: Dates in the format
-\f[C](now|today)[+\-][0\-9](day|week|month|year)(s)?\f[]
-.PP
-Examples:
-.IP
-.nf
-\f[C]
-#\ Download\ only\ the\ videos\ uploaded\ in\ the\ last\ 6\ months
-$\ youtube\-dl\ \-\-dateafter\ now\-6months
-
-#\ Download\ only\ the\ videos\ uploaded\ on\ January\ 1,\ 1970
-$\ youtube\-dl\ \-\-date\ 19700101
-
-#\ Download\ only\ the\ videos\ uploaded\ in\ the\ 200x\ decade
-$\ youtube\-dl\ \-\-dateafter\ 20000101\ \-\-datebefore\ 20091231
-\f[]
-.fi
-.SH FAQ
-.SS How do I update youtube\-dl?
-.PP
-If you\[aq]ve followed our manual installation
-instructions (http://rg3.github.io/youtube-dl/download.html), you can
-simply run \f[C]youtube\-dl\ \-U\f[] (or, on Linux,
-\f[C]sudo\ youtube\-dl\ \-U\f[]).
-.PP
-If you have used pip, a simple
-\f[C]sudo\ pip\ install\ \-U\ youtube\-dl\f[] is sufficient to update.
-.PP
-If you have installed youtube\-dl using a package manager like
-\f[I]apt\-get\f[] or \f[I]yum\f[], use the standard system update
-mechanism to update.
-Note that distribution packages are often outdated.
-As a rule of thumb, youtube\-dl releases at least once a month, and
-often weekly or even daily.
-Simply go to http://yt\-dl.org/ to find out the current version.
-Unfortunately, there is nothing we youtube\-dl developers can do if your
-distribution serves a really outdated version.
-You can (and should) complain to your distribution in their bugtracker
-or support forum.
-.PP
-As a last resort, you can also uninstall the version installed by your
-package manager and follow our manual installation instructions.
-For that, remove the distribution\[aq]s package, with a line like
-.IP
-.nf
-\f[C]
-sudo\ apt\-get\ remove\ \-y\ youtube\-dl
-\f[]
-.fi
-.PP
-Afterwards, simply follow our manual installation
-instructions (http://rg3.github.io/youtube-dl/download.html):
-.IP
-.nf
-\f[C]
-sudo\ wget\ https://yt\-dl.org/latest/youtube\-dl\ \-O\ /usr/local/bin/youtube\-dl
-sudo\ chmod\ a+x\ /usr/local/bin/youtube\-dl
-hash\ \-r
-\f[]
-.fi
-.PP
-Again, from then on you\[aq]ll be able to update with
-\f[C]sudo\ youtube\-dl\ \-U\f[].
-.SS I\[aq]m getting an error
-\f[C]Unable\ to\ extract\ OpenGraph\ title\f[] on YouTube playlists
-.PP
-YouTube changed their playlist format in March 2014 and later on, so
-you\[aq]ll need at least youtube\-dl 2014.07.25 to download all YouTube
-videos.
-.PP
-If you have installed youtube\-dl with a package manager, pip, setup.py
-or a tarball, please use that to update.
-Note that Ubuntu packages do not seem to get updated anymore.
-Since we are not affiliated with Ubuntu, there is little we can do.
-Feel free to report
-bugs (https://bugs.launchpad.net/ubuntu/+source/youtube-dl/+filebug) to
-the Ubuntu packaging
-guys (mailto:ubuntu-motu@lists.ubuntu.com?subject=outdated%20version%20of%20youtube-dl)
-\- all they have to do is update the package to a somewhat recent
-version.
-See above for a way to update.
-.SS Do I always have to pass \f[C]\-citw\f[]?
-.PP
-By default, youtube\-dl intends to have the best options (incidentally,
-if you have a convincing case that these should be different, please
-file an issue where you explain that (https://yt-dl.org/bug)).
-Therefore, it is unnecessary and sometimes harmful to copy long option
-strings from webpages.
-In particular, the only option out of \f[C]\-citw\f[] that is regularly
-useful is \f[C]\-i\f[].
-.SS Can you please put the \f[C]\-b\f[] option back?
-.PP
-Most people asking this question are not aware that youtube\-dl now
-defaults to downloading the highest available quality as reported by
-YouTube, which will be 1080p or 720p in some cases, so you no longer
-need the \f[C]\-b\f[] option.
-For some specific videos, YouTube may not report them to be available
-in a specific high quality format you\[aq]re interested in.
-In that case, simply request it with the \f[C]\-f\f[] option and
-youtube\-dl will try to download it.
-.SS I get HTTP error 402 when trying to download a video. What\[aq]s
-this?
-.PP
-Apparently YouTube requires you to pass a CAPTCHA test if you download
-too much.
-We\[aq]re considering providing a way to let you solve the
-CAPTCHA (https://github.com/rg3/youtube-dl/issues/154), but at the
-moment, your best course of action is pointing a web browser to the
-YouTube URL, solving the CAPTCHA, and restarting youtube\-dl.
-.SS Do I need any other programs?
-.PP
-youtube\-dl works fine on its own on most sites.
-However, if you want to convert video/audio, you\[aq]ll need
-avconv (https://libav.org/) or ffmpeg (https://www.ffmpeg.org/).
-On some sites \- most notably YouTube \- videos can be retrieved in a
-higher quality format without sound.
-youtube\-dl will detect whether avconv/ffmpeg is present and
-automatically pick the best option.
-.PP
-Videos or video formats streamed via the RTMP protocol can only be
-downloaded when rtmpdump (https://rtmpdump.mplayerhq.hu/) is installed.
-Downloading MMS and RTSP videos requires either
-mplayer (http://mplayerhq.hu/) or mpv (https://mpv.io/) to be installed.
-.SS I have downloaded a video but how can I play it?
-.PP
-Once the video is fully downloaded, use any video player, such as
-vlc (http://www.videolan.org) or mplayer (http://www.mplayerhq.hu/).
-.SS I extracted a video URL with \f[C]\-g\f[], but it does not play on
-another machine / in my web browser.
-.PP
-It depends a lot on the service.
-In many cases, requests for the video (to download/play it) must come
-from the same IP address and with the same cookies.
-Use the \f[C]\-\-cookies\f[] option to write the required cookies into a
-file, and advise your downloader to read cookies from that file.
-Some sites also require a common user agent to be used; use
-\f[C]\-\-dump\-user\-agent\f[] to see the one in use by youtube\-dl.
-.PP
-It may be beneficial to use IPv6; in some cases, the restrictions are
-only applied to IPv4.
-Some services (sometimes only for a subset of videos) do not restrict
-the video URL by IP address, cookie, or user\-agent, but these are the
-exception rather than the rule.
-.PP
-Please bear in mind that some URL protocols are \f[B]not\f[] supported
-by browsers out of the box, including RTMP.
-If you are using \f[C]\-g\f[], your own downloader must support these as
-well.
-.PP
-If you want to play the video on a machine that is not running
-youtube\-dl, you can relay the video content from the machine that runs
-youtube\-dl.
-You can use \f[C]\-o\ \-\f[] to let youtube\-dl stream a video to
-stdout, or simply allow the player to download the files written by
-youtube\-dl in turn.
-.SS ERROR: no fmt_url_map or conn information found in video info
-.PP
-YouTube switched to a new video info format in July 2011 which is not
-supported by old versions of youtube\-dl.
-See above (#how-do-i-update-youtube-dl) for how to update youtube\-dl.
-.SS ERROR: unable to download video
-.PP
-YouTube has required an additional signature since September 2012 which
-is not supported by old versions of youtube\-dl.
-See above (#how-do-i-update-youtube-dl) for how to update youtube\-dl.
-.SS Video URL contains an ampersand and I\[aq]m getting some strange
-output \f[C][1]\ 2839\f[] or
-\f[C]\[aq]v\[aq]\ is\ not\ recognized\ as\ an\ internal\ or\ external\ command\f[]
-.PP
-That\[aq]s actually the output from your shell.
-Since the ampersand is one of the special shell characters, it\[aq]s
-interpreted by the shell, preventing you from passing the whole URL to
-youtube\-dl.
-To keep your shell from interpreting the ampersands (or any other
-special characters) you have to either put the whole URL in quotes or
-escape them with a backslash (which approach works depends on your
-shell).
-.PP
-For example if your URL is
-https://www.youtube.com/watch?t=4&v=BaW_jenozKc you should end up with
-the following command:
-.PP
-\f[C]youtube\-dl\ \[aq]https://www.youtube.com/watch?t=4&v=BaW_jenozKc\[aq]\f[]
-.PP
-or
-.PP
-\f[C]youtube\-dl\ https://www.youtube.com/watch?t=4\\&v=BaW_jenozKc\f[]
-.PP
-On Windows you have to use double quotes:
-.PP
-\f[C]youtube\-dl\ "https://www.youtube.com/watch?t=4&v=BaW_jenozKc"\f[]
-.SS ExtractorError: Could not find JS function u\[aq]OF\[aq]
-.PP
-In February 2015, the new YouTube player contained a character sequence
-in a string that was misinterpreted by old versions of youtube\-dl.
-See above (#how-do-i-update-youtube-dl) for how to update youtube\-dl.
-.SS HTTP Error 429: Too Many Requests or 402: Payment Required
-.PP
-These two error codes indicate that the service is blocking your IP
-address because of overuse.
-Contact the service and ask them to unblock your IP address, or \- if
-you have acquired a whitelisted IP address already \- use the
-\f[C]\-\-proxy\f[] or \f[C]\-\-source\-address\f[]
-options (#network-options) to select another IP address.
-.SS SyntaxError: Non\-ASCII character
-.PP
-The error
-.IP
-.nf
-\f[C]
-File\ "youtube\-dl",\ line\ 2
-SyntaxError:\ Non\-ASCII\ character\ \[aq]\\x93\[aq]\ ...
-\f[]
-.fi
-.PP
-means you\[aq]re using an outdated version of Python.
-Please update to Python 2.6 or 2.7.
-.SS What is this binary file? Where has the code gone?
-.PP
-Since June 2012 (#342) youtube\-dl is packed as an executable zipfile;
-simply unzip it (you might need to rename it to
-\f[C]youtube\-dl.zip\f[] first on some systems) or clone the git
-repository, as laid out above.
-If you modify the code, you can run it by executing the
-\f[C]__main__.py\f[] file.
-To recompile the executable, run \f[C]make\ youtube\-dl\f[].
-.SS The exe throws a \f[I]Runtime error from Visual C++\f[]
-.PP
-To run the exe you first need to install the Microsoft Visual C++ 2008
-Redistributable
-Package (http://www.microsoft.com/en-us/download/details.aspx?id=29).
-.SS On Windows, how should I set up ffmpeg and youtube\-dl? Where should
-I put the exe files?
-.PP
-If you put youtube\-dl and ffmpeg in the same directory that you\[aq]re
-running the command from, it will work, but that\[aq]s rather
-cumbersome.
-.PP
-To make a different directory work \- either for ffmpeg, or for
-youtube\-dl, or for both \- simply create the directory (say,
-\f[C]C:\\bin\f[], or \f[C]C:\\Users\\<User\ name>\\bin\f[]), put all the
-executables directly in there, and then set your PATH environment
-variable (https://www.java.com/en/download/help/path.xml) to include
-that directory.
-.PP
-From then on, after restarting your shell, you will be able to access
-both youtube\-dl and ffmpeg (and youtube\-dl will be able to find
-ffmpeg) by simply typing \f[C]youtube\-dl\f[] or \f[C]ffmpeg\f[], no
-matter what directory you\[aq]re in.
-.SS How do I put downloads into a specific folder?
-.PP
-Use the \f[C]\-o\f[] option to specify an output
-template (#output-template), for example
-\f[C]\-o\ "/home/user/videos/%(title)s\-%(id)s.%(ext)s"\f[].
-If you want this for all of your downloads, put the option into your
-configuration file (#configuration).
-.SS How do I download a video starting with a \f[C]\-\f[]?
-.PP
-Either prepend \f[C]http://www.youtube.com/watch?v=\f[] or separate the
-ID from the options with \f[C]\-\-\f[]:
-.IP
-.nf
-\f[C]
-youtube\-dl\ \-\-\ \-wNyEUrxzFU
-youtube\-dl\ "http://www.youtube.com/watch?v=\-wNyEUrxzFU"
-\f[]
-.fi
-.SS How do I pass cookies to youtube\-dl?
-.PP
-Use the \f[C]\-\-cookies\f[] option, for example
-\f[C]\-\-cookies\ /path/to/cookies/file.txt\f[].
-Note that the cookies file must be in Mozilla/Netscape format and the
-first line of the cookies file must be either
-\f[C]#\ HTTP\ Cookie\ File\f[] or
-\f[C]#\ Netscape\ HTTP\ Cookie\ File\f[].
-Make sure you have the correct newline
-format (https://en.wikipedia.org/wiki/Newline) in the cookies file and
-convert newlines if necessary to correspond with your OS, namely
-\f[C]CRLF\f[] (\f[C]\\r\\n\f[]) for Windows, \f[C]LF\f[] (\f[C]\\n\f[])
-for Linux and \f[C]CR\f[] (\f[C]\\r\f[]) for Mac OS.
-\f[C]HTTP\ Error\ 400:\ Bad\ Request\f[] when using \f[C]\-\-cookies\f[]
-is a good sign of an invalid newline format.
-.PP
-Passing cookies to youtube\-dl is a good way to work around login when a
-particular extractor does not implement it explicitly.
-Another use case is working around
-CAPTCHAs (https://en.wikipedia.org/wiki/CAPTCHA) that some websites
-require you to solve in particular cases in order to get access (e.g.
-YouTube, CloudFlare).
-.SS Can you add support for this anime video site, or site which shows
-current movies for free?
-.PP
-As a matter of policy (as well as legality), youtube\-dl does not
-include support for services that specialize in infringing copyright.
-As a rule of thumb, if you cannot easily find a video that the service
-is quite obviously allowed to distribute (i.e.
-that has been uploaded by the creator, the creator\[aq]s distributor, or
-is published under a free license), the service is probably unfit for
-inclusion in youtube\-dl.
-.PP
-A note on the service saying that they don\[aq]t host the infringing
-content but merely link to those who do is evidence that the service
-should \f[B]not\f[] be included in youtube\-dl.
-The same goes for any DMCA note when the whole front page of the service
-is filled with videos they are not allowed to distribute.
-A "fair use" note is equally unconvincing if the service shows
-copyright\-protected videos in full without authorization.
-.PP
-Support requests for services that \f[B]do\f[] purchase the rights to
-distribute their content are perfectly fine though.
-If in doubt, you can simply include a source that mentions the
-legitimate purchase of content.
-.SS How can I speed up work on my issue?
-.PP
-(Also known as: Help, my important issue is not being solved!) The
-youtube\-dl core developer team is quite small.
-While we do our best to solve as many issues as possible, sometimes that
-can take quite a while.
-To speed up your issue, here\[aq]s what you can do:
-.PP
-First of all, please do report the issue at our issue
-tracker (https://yt-dl.org/bugs).
-That allows us to coordinate all efforts by users and developers, and
-serves as a unified point.
-Unfortunately, the youtube\-dl project has grown too large to use
-personal email as an effective communication channel.
-.PP
-Please read the bug reporting instructions (#bugs) below.
-A lot of bugs lack all the necessary information.
-If you can, offer proxy, VPN, or shell access to the youtube\-dl
-developers.
-If you are able to, test the issue from multiple computers in multiple
-countries to exclude local censorship or misconfiguration issues.
-.PP
-If nobody is interested in solving your issue, you are welcome to take
-matters into your own hands and submit a pull request (or coerce/pay
-somebody else to do so).
-.PP
-Feel free to bump the issue from time to time by writing a small comment
-("Issue is still present in youtube\-dl version ...from France, but
-fixed from Belgium"), but please not more than once a month.
-Please do not declare your issue as \f[C]important\f[] or
-\f[C]urgent\f[].
-.SS How can I detect whether a given URL is supported by youtube\-dl?
-.PP
-For one, have a look at the list of supported
-sites (docs/supportedsites.md).
-Note that it can sometimes happen that the site changes its URL scheme
-(say, from http://example.com/video/1234567 to
-http://example.com/v/1234567 ) and youtube\-dl then reports a URL of a
-service in that list as unsupported.
-In that case, simply report a bug.
-.PP
-It is \f[I]not\f[] possible to detect whether a URL is supported or not.
-That\[aq]s because youtube\-dl contains a generic extractor which
-matches \f[B]all\f[] URLs.
-You may be tempted to disable, exclude, or remove the generic extractor,
-but the generic extractor not only allows users to extract videos from
-lots of websites that embed a video from another service, but may also
-be used to extract video from a service that is hosting it itself.
-Therefore, we neither recommend nor support disabling, excluding, or
-removing the generic extractor.
-.PP
-If you want to find out whether a given URL is supported, simply call
-youtube\-dl with it.
-If you get no videos back, chances are the URL is either not referring
-to a video or unsupported.
-You can find out which by examining the output (if you run youtube\-dl -on the console) or catching an \f[C]UnsupportedError\f[] exception if -you run it from a Python program. -.SH DEVELOPER INSTRUCTIONS -.PP -Most users do not need to build youtube\-dl and can download the -builds (http://rg3.github.io/youtube-dl/download.html) or get them from -their distribution. -.PP -To run youtube\-dl as a developer, you don\[aq]t need to build anything -either. -Simply execute -.IP -.nf -\f[C] -python\ \-m\ youtube_dl -\f[] -.fi -.PP -To run the test, simply invoke your favorite test runner, or execute a -test file directly; any of the following work: -.IP -.nf -\f[C] -python\ \-m\ unittest\ discover -python\ test/test_download.py -nosetests -\f[] -.fi -.PP -If you want to create a build of youtube\-dl yourself, you\[aq]ll need -.IP \[bu] 2 -python -.IP \[bu] 2 -make -.IP \[bu] 2 -pandoc -.IP \[bu] 2 -zip -.IP \[bu] 2 -nosetests -.SS Adding support for a new site -.PP -If you want to add support for a new site, you can follow this quick -list (assuming your service is called \f[C]yourextractor\f[]): -.IP " 1." 4 -Fork this repository (https://github.com/rg3/youtube-dl/fork) -.IP " 2." 4 -Check out the source code with -\f[C]git\ clone\ git\@github.com:YOUR_GITHUB_USERNAME/youtube\-dl.git\f[] -.IP " 3." 4 -Start a new git branch with -\f[C]cd\ youtube\-dl;\ git\ checkout\ \-b\ yourextractor\f[] -.IP " 4." 4 -Start with this simple template and save it to -\f[C]youtube_dl/extractor/yourextractor.py\f[]: -.RS 4 -.IP -.nf -\f[C] -#\ coding:\ utf\-8 -from\ __future__\ import\ unicode_literals - -from\ .common\ import\ InfoExtractor - - -class\ YourExtractorIE(InfoExtractor): -\ \ \ \ _VALID_URL\ =\ r\[aq]https?://(?:www\\.)?yourextractor\\.com/watch/(?P<id>[0\-9]+)\[aq] -\ \ \ \ _TEST\ =\ { -\ \ \ \ \ \ \ \ \[aq]url\[aq]:\ \[aq]http://yourextractor.com/watch/42\[aq], -\ \ \ \ \ \ \ \ \[aq]md5\[aq]:\ \[aq]TODO:\ md5\ sum\ of\ the\ first\ 10241\ bytes\ of\ the\ video\ file\ (use\ \-\-test)\[aq], -\ \ \ \ \ \ \ \ \[aq]info_dict\[aq]:\ { -\ \ \ \ \ \ \ \ \ \ \ \ \[aq]id\[aq]:\ \[aq]42\[aq], -\ \ \ \ \ \ \ \ \ \ \ \ \[aq]ext\[aq]:\ \[aq]mp4\[aq], -\ \ \ \ \ \ \ \ \ \ \ \ \[aq]title\[aq]:\ \[aq]Video\ title\ goes\ here\[aq], -\ \ \ \ \ \ \ \ \ \ \ \ \[aq]thumbnail\[aq]:\ \[aq]re:^https?://.*\\.jpg$\[aq], -\ \ \ \ \ \ \ \ \ \ \ \ #\ TODO\ more\ properties,\ either\ as: -\ \ \ \ \ \ \ \ \ \ \ \ #\ *\ A\ value -\ \ \ \ \ \ \ \ \ \ \ \ #\ *\ MD5\ checksum;\ start\ the\ string\ with\ md5: -\ \ \ \ \ \ \ \ \ \ \ \ #\ *\ A\ regular\ expression;\ start\ the\ string\ with\ re: -\ \ \ \ \ \ \ \ \ \ \ \ #\ *\ Any\ Python\ type\ (for\ example\ int\ or\ float) -\ \ \ \ \ \ \ \ } -\ \ \ \ } - -\ \ \ \ def\ _real_extract(self,\ url): -\ \ \ \ \ \ \ \ video_id\ =\ self._match_id(url) -\ \ \ \ \ \ \ \ webpage\ =\ self._download_webpage(url,\ video_id) - -\ \ \ \ \ \ \ \ #\ TODO\ more\ code\ goes\ here,\ for\ example\ ... 
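-\ \ \ \ \ \ \ \ #\ _html_search_regex\ extracts\ the\ first\ capture\ group\ of\ the
-\ \ \ \ \ \ \ \ #\ pattern\ and\ raises\ a\ descriptive\ error\ if\ nothing\ matches: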
-\ \ \ \ \ \ \ \ title\ =\ self._html_search_regex(r\[aq]<h1>(.+?)</h1>\[aq],\ webpage,\ \[aq]title\[aq]) - -\ \ \ \ \ \ \ \ return\ { -\ \ \ \ \ \ \ \ \ \ \ \ \[aq]id\[aq]:\ video_id, -\ \ \ \ \ \ \ \ \ \ \ \ \[aq]title\[aq]:\ title, -\ \ \ \ \ \ \ \ \ \ \ \ \[aq]description\[aq]:\ self._og_search_description(webpage), -\ \ \ \ \ \ \ \ \ \ \ \ \[aq]uploader\[aq]:\ self._search_regex(r\[aq]<div[^>]+id="uploader"[^>]*>([^<]+)<\[aq],\ webpage,\ \[aq]uploader\[aq],\ fatal=False), -\ \ \ \ \ \ \ \ \ \ \ \ #\ TODO\ more\ properties\ (see\ youtube_dl/extractor/common.py) -\ \ \ \ \ \ \ \ } -\f[] -.fi -.RE -.IP " 5." 4 -Add an import in -\f[C]youtube_dl/extractor/__init__.py\f[] (https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py). -.IP " 6." 4 -Run -\f[C]python\ test/test_download.py\ TestDownload.test_YourExtractor\f[]. -This \f[I]should fail\f[] at first, but you can continually re\-run it -until you\[aq]re done. -If you decide to add more than one test, then rename \f[C]_TEST\f[] to -\f[C]_TESTS\f[] and make it into a list of dictionaries. -The tests will then be named \f[C]TestDownload.test_YourExtractor\f[], -\f[C]TestDownload.test_YourExtractor_1\f[], -\f[C]TestDownload.test_YourExtractor_2\f[], etc. -.IP " 7." 4 -Have a look at -\f[C]youtube_dl/extractor/common.py\f[] (https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) -for possible helper methods and a detailed description of what your -extractor should and may -return (https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L62-L200). -Add tests and code for as many as you want. -.IP " 8." 4 -If you can, check the code with -flake8 (https://pypi.python.org/pypi/flake8). -.IP " 9." 4 -When the tests pass, add (http://git-scm.com/docs/git-add) the new files -and commit (http://git-scm.com/docs/git-commit) them and -push (http://git-scm.com/docs/git-push) the result, like this: -.RS 4 -.IP -.nf -\f[C] -$\ git\ add\ youtube_dl/extractor/__init__.py -$\ git\ add\ youtube_dl/extractor/yourextractor.py -$\ git\ commit\ \-m\ \[aq][yourextractor]\ Add\ new\ extractor\[aq] -$\ git\ push\ origin\ yourextractor -\f[] -.fi -.RE -.IP "10." 4 -Finally, create a pull -request (https://help.github.com/articles/creating-a-pull-request). -We\[aq]ll then review and merge it. -.PP -In any case, thank you very much for your contributions! -.SH EMBEDDING YOUTUBE\-DL -.PP -youtube\-dl makes the best effort to be a good command\-line program, -and thus should be callable from any programming language. -If you encounter any problems parsing its output, feel free to create a -report (https://github.com/rg3/youtube-dl/issues/new). -.PP -From a Python program, you can embed youtube\-dl in a more powerful -fashion, like this: -.IP -.nf -\f[C] -from\ __future__\ import\ unicode_literals -import\ youtube_dl - -ydl_opts\ =\ {} -with\ youtube_dl.YoutubeDL(ydl_opts)\ as\ ydl: -\ \ \ \ ydl.download([\[aq]http://www.youtube.com/watch?v=BaW_jenozKc\[aq]]) -\f[] -.fi -.PP -Most likely, you\[aq]ll want to use various options. -For a list of what can be done, have a look at -\f[C]youtube_dl/YoutubeDL.py\f[] (https://github.com/rg3/youtube-dl/blob/master/youtube_dl/YoutubeDL.py#L121-L269). -For a start, if you want to intercept youtube\-dl\[aq]s output, set a -\f[C]logger\f[] object. 
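-.PP
-As a minimal sketch of the same interface, you can also fetch metadata
-without downloading anything: \f[C]extract_info\f[] with
-\f[C]download=False\f[] returns the information dictionary whose keys
-are described in the OUTPUT TEMPLATE section above.
-.IP
-.nf
-\f[C]
-from\ __future__\ import\ unicode_literals
-import\ youtube_dl
-
-with\ youtube_dl.YoutubeDL({\[aq]quiet\[aq]:\ True})\ as\ ydl:
-\ \ \ \ #\ Extract\ metadata\ only;\ nothing\ is\ written\ to\ disk
-\ \ \ \ info\ =\ ydl.extract_info(\[aq]http://www.youtube.com/watch?v=BaW_jenozKc\[aq],\ download=False)
-\ \ \ \ print(info[\[aq]title\[aq]])
-\f[]
-.fi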
-.PP
-Here\[aq]s a more complete example of a program that outputs only errors
-(and a short message after the download is finished), and
-downloads/converts the video to an mp3 file:
-.IP
-.nf
-\f[C]
-from\ __future__\ import\ unicode_literals
-import\ youtube_dl
-
-
-class\ MyLogger(object):
-\ \ \ \ def\ debug(self,\ msg):
-\ \ \ \ \ \ \ \ pass
-
-\ \ \ \ def\ warning(self,\ msg):
-\ \ \ \ \ \ \ \ pass
-
-\ \ \ \ def\ error(self,\ msg):
-\ \ \ \ \ \ \ \ print(msg)
-
-
-def\ my_hook(d):
-\ \ \ \ if\ d[\[aq]status\[aq]]\ ==\ \[aq]finished\[aq]:
-\ \ \ \ \ \ \ \ print(\[aq]Done\ downloading,\ now\ converting\ ...\[aq])
-
-
-ydl_opts\ =\ {
-\ \ \ \ \[aq]format\[aq]:\ \[aq]bestaudio/best\[aq],
-\ \ \ \ \[aq]postprocessors\[aq]:\ [{
-\ \ \ \ \ \ \ \ \[aq]key\[aq]:\ \[aq]FFmpegExtractAudio\[aq],
-\ \ \ \ \ \ \ \ \[aq]preferredcodec\[aq]:\ \[aq]mp3\[aq],
-\ \ \ \ \ \ \ \ \[aq]preferredquality\[aq]:\ \[aq]192\[aq],
-\ \ \ \ }],
-\ \ \ \ \[aq]logger\[aq]:\ MyLogger(),
-\ \ \ \ \[aq]progress_hooks\[aq]:\ [my_hook],
-}
-with\ youtube_dl.YoutubeDL(ydl_opts)\ as\ ydl:
-\ \ \ \ ydl.download([\[aq]http://www.youtube.com/watch?v=BaW_jenozKc\[aq]])
-\f[]
-.fi
-.SH BUGS
-.PP
-Bugs and suggestions should be reported at:
-<https://github.com/rg3/youtube-dl/issues>.
-Unless you were prompted to do so or there is another pertinent reason
-(e.g.
-GitHub fails to accept the bug report), please do not send bug reports
-via personal email.
-For discussions, join us in the IRC channel
-#youtube\-dl (irc://chat.freenode.net/#youtube-dl) on freenode
-(webchat (http://webchat.freenode.net/?randomnick=1&channels=youtube-dl)).
-.PP
-\f[B]Please include the full output of youtube\-dl when run with
-\f[C]\-v\f[]\f[], i.e.
-\f[B]add\f[] the \f[C]\-v\f[] flag to \f[B]your command line\f[], copy
-the \f[B]whole\f[] output and post it in the issue body wrapped in ```
-for better formatting.
-It should look similar to this:
-.IP
-.nf
-\f[C]
-$\ youtube\-dl\ \-v\ <your\ command\ line>
-[debug]\ System\ config:\ []
-[debug]\ User\ config:\ []
-[debug]\ Command\-line\ args:\ [u\[aq]\-v\[aq],\ u\[aq]http://www.youtube.com/watch?v=BaW_jenozKcj\[aq]]
-[debug]\ Encodings:\ locale\ cp1251,\ fs\ mbcs,\ out\ cp866,\ pref\ cp1251
-[debug]\ youtube\-dl\ version\ 2015.12.06
-[debug]\ Git\ HEAD:\ 135392e
-[debug]\ Python\ version\ 2.6.6\ \-\ Windows\-2003Server\-5.2.3790\-SP2
-[debug]\ exe\ versions:\ ffmpeg\ N\-75573\-g1d0487f,\ ffprobe\ N\-75573\-g1d0487f,\ rtmpdump\ 2.4
-[debug]\ Proxy\ map:\ {}
-\&...
-\f[]
-.fi
-.PP
-\f[B]Do not post screenshots of the verbose log; only plain text is
-acceptable.\f[]
-.PP
-The output (including the first lines) contains important debugging
-information.
-Issues without the full output are often not reproducible and therefore
-do not get solved in short order, if ever.
-.PP
-Please re\-read your issue once again to avoid a couple of common
-mistakes (you can and should use this as a checklist):
-.SS Is the description of the issue itself sufficient?
-.PP
-We often get issue reports that we cannot really decipher.
-While in most cases we eventually get the required information after
-asking back multiple times, this poses an unnecessary drain on our
-resources.
-Many contributors, including myself, are also not native speakers, so we
-may misread some parts.
-.PP
-So please elaborate on what feature you are requesting, or what bug you
-want to be fixed.
-Make sure that it\[aq]s obvious
-.IP \[bu] 2
-What the problem is
-.IP \[bu] 2
-How it could be fixed
-.IP \[bu] 2
-What your proposed solution would look like
-.PP
-If your report is shorter than two lines, it is almost certainly missing
-some of these, which makes it hard for us to respond to it.
-We\[aq]re often too polite to close the issue outright, but the missing
-info makes misinterpretation likely.
-As a committer myself, I often get frustrated by these issues, since the
-only possible way for me to move forward on them is to ask for
-clarification over and over.
-.PP
-For bug reports, this means that your report should contain the
-\f[I]complete\f[] output of youtube\-dl when called with the
-\f[C]\-v\f[] flag.
-The error message you get for (most) bugs even says so, but you would
-not believe how many of our bug reports do not contain this information.
-.PP
-If your server has multiple IPs or you suspect censorship, adding
-\f[C]\-\-call\-home\f[] may be a good idea to get more diagnostics.
-If the error is \f[C]ERROR:\ Unable\ to\ extract\ ...\f[] and you cannot
-reproduce it from multiple countries, add \f[C]\-\-dump\-pages\f[]
-(warning: this will yield a rather large output, redirect it to the file
-\f[C]log.txt\f[] by adding \f[C]>log.txt\ 2>&1\f[] to your
-command\-line) or upload the \f[C]\&.dump\f[] files you get when you add
-\f[C]\-\-write\-pages\f[] somewhere (https://gist.github.com/).
-.PP
-\f[B]Site support requests must contain an example URL\f[].
-An example URL is a URL you might want to download, like
-\f[C]http://www.youtube.com/watch?v=BaW_jenozKc\f[].
-There should be an obvious video present.
-Except under very special circumstances, the main page of a video
-service (e.g.
-\f[C]http://www.youtube.com/\f[]) is \f[I]not\f[] an example URL.
-.SS Are you using the latest version?
-.PP
-Before reporting any issue, type \f[C]youtube\-dl\ \-U\f[].
-This should report that you\[aq]re up\-to\-date.
-About 20% of the reports we receive are already fixed, but people are
-using outdated versions.
-This goes for feature requests as well.
-.SS Is the issue already documented?
-.PP
-Make sure that someone has not already opened the issue you\[aq]re
-trying to open.
-Search at the top of the window or browse the GitHub
-Issues (https://github.com/rg3/youtube-dl/search?type=Issues) of this
-repository.
-If there is an issue, feel free to write something along the lines of
-"This affects me as well, with version 2015.01.01.
-Here is some more information on the issue: ...".
-While some issues may be old, a new post into them often spurs rapid
-activity.
-.SS Why are existing options not enough?
-.PP
-Before requesting a new feature, please have a quick peek at the list of
-supported
-options (https://github.com/rg3/youtube-dl/blob/master/README.md#synopsis).
-Many feature requests are for features that actually exist already!
-Please, absolutely do show off your work in the issue report and detail
-how the existing similar options do \f[I]not\f[] solve your problem.
-.SS Is there enough context in your bug report?
-.PP
-People want to solve problems, and often think they do us a favor by
-breaking down their larger problems (e.g.
-wanting to skip already downloaded files) to a specific request (e.g.
-requesting us to look whether the file exists before downloading the
-info page).
-However, what often happens is that they break down the problem into two
-steps: One simple, and one impossible (or extremely complicated).
-.PP -We are then presented with a very complicated request when the original -problem could be solved far more easily, e.g. -by recording the downloaded video IDs in a separate file. -To avoid this, you must include the greater context where it is -non\-obvious. -In particular, every feature request that does not consist of adding -support for a new site should contain a use case scenario that explains -in what situation the missing feature would be useful. -.SS Does the issue involve one problem, and one problem only? -.PP -Some of our users seem to think there is a limit to the number of issues -they can or should open. -There is no such limit. -While it may seem appealing to be able to dump all your issues into one -ticket, that means that someone who solves one of your issues cannot -mark the issue as closed. -Typically, reporting a bunch of issues leads to the ticket lingering -since nobody wants to attack that behemoth, until someone mercifully -splits the issue into multiple ones. -.PP -In particular, every site support request issue should only pertain to -services at one site (generally under a common domain, but always using -the same backend technology). -Do not request support for vimeo user videos, Whitehouse podcasts, and -Google Plus pages in the same issue. -Also, make sure that you don\[aq]t post bug reports alongside feature -requests. -As a rule of thumb, a feature request does not include outputs of -youtube\-dl that are not immediately related to the feature at hand. -Do not post reports of a network error alongside the request for a new -video service. -.SS Is anyone going to need the feature? -.PP -Only post features that you (or an incapacitated friend you can -personally talk to) require. -Do not post features because they seem like a good idea. -If they are really useful, they will be requested by someone who -requires them. -.SS Is your question about youtube\-dl? -.PP -It may sound strange, but some bug reports we receive are completely -unrelated to youtube\-dl and relate to a different or even the -reporter\[aq]s own application. -Please make sure that you are actually using youtube\-dl. -If you are using a UI for youtube\-dl, report the bug to the maintainer -of the actual application providing the UI. -On the other hand, if your UI for youtube\-dl fails in some way you -believe is related to youtube\-dl, by all means, go ahead and report the -bug. -.SH COPYRIGHT -.PP -youtube\-dl is released into the public domain by the copyright holders. -.PP -This README file was originally written by Daniel -Bolton (https://github.com/dbbolton) and is likewise released into the -public domain.
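As a worked example of the bug-reporting instructions in the man page above, the pieces it describes can be combined into a single command line that produces the complete verbose log to attach to an issue. A minimal sketch in shell, assuming a Unix-like system; the URL is the example URL used throughout this document, and the redirect to log.txt is the one suggested above because --dump-pages makes the output much larger:

    # Capture the full -v output, stderr included, into log.txt for the report.
    youtube-dl -v 'http://www.youtube.com/watch?v=BaW_jenozKc' >log.txt 2>&1

    # For "ERROR: Unable to extract ..." failures, also dump the fetched pages.
    youtube-dl -v --dump-pages 'http://www.youtube.com/watch?v=BaW_jenozKc' >log.txt 2>&1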
diff -Nru youtube-dl-2016.02.22/youtube-dl.bash-completion youtube-dl-2019.05.20/youtube-dl.bash-completion --- youtube-dl-2016.02.22/youtube-dl.bash-completion 2016-02-22 10:57:42.000000000 +0000 +++ youtube-dl-2019.05.20/youtube-dl.bash-completion 1970-01-01 00:00:00.000000000 +0000 @@ -1,29 +0,0 @@ -__youtube_dl() -{ - local cur prev opts fileopts diropts keywords - COMPREPLY=() - cur="${COMP_WORDS[COMP_CWORD]}" - prev="${COMP_WORDS[COMP_CWORD-1]}" - opts="--help --version --update --ignore-errors --abort-on-error --dump-user-agent --list-extractors --extractor-descriptions --force-generic-extractor --default-search --ignore-config --flat-playlist --no-color --proxy --socket-timeout --source-address --force-ipv4 --force-ipv6 --cn-verification-proxy --playlist-start --playlist-end --playlist-items --match-title --reject-title --max-downloads --min-filesize --max-filesize --date --datebefore --dateafter --min-views --max-views --match-filter --no-playlist --yes-playlist --age-limit --download-archive --include-ads --rate-limit --retries --buffer-size --no-resize-buffer --test --playlist-reverse --xattr-set-filesize --hls-prefer-native --hls-use-mpegts --external-downloader --external-downloader-args --batch-file --id --output --autonumber-size --restrict-filenames --auto-number --title --literal --no-overwrites --continue --no-continue --no-part --no-mtime --write-description --write-info-json --write-annotations --load-info --cookies --cache-dir --no-cache-dir --rm-cache-dir --write-thumbnail --write-all-thumbnails --list-thumbnails --quiet --no-warnings --simulate --skip-download --get-url --get-title --get-id --get-thumbnail --get-description --get-duration --get-filename --get-format --dump-json --dump-single-json --print-json --newline --no-progress --console-title --verbose --dump-pages --write-pages --youtube-print-sig-code --print-traffic --call-home --no-call-home --encoding --no-check-certificate --prefer-insecure --user-agent --referer --add-header --bidi-workaround --sleep-interval --format --all-formats --prefer-free-formats --list-formats --youtube-include-dash-manifest --youtube-skip-dash-manifest --merge-output-format --write-sub --write-auto-sub --all-subs --list-subs --sub-format --sub-lang --username --password --twofactor --netrc --video-password --extract-audio --audio-format --audio-quality --recode-video --postprocessor-args --keep-video --no-post-overwrites --embed-subs --embed-thumbnail --add-metadata --metadata-from-title --xattrs --fixup --prefer-avconv --prefer-ffmpeg --ffmpeg-location --exec --convert-subs" - keywords=":ytfavorites :ytrecommended :ytsubscriptions :ytwatchlater :ythistory" - fileopts="-a|--batch-file|--download-archive|--cookies|--load-info" - diropts="--cache-dir" - - if [[ ${prev} =~ ${fileopts} ]]; then - COMPREPLY=( $(compgen -f -- ${cur}) ) - return 0 - elif [[ ${prev} =~ ${diropts} ]]; then - COMPREPLY=( $(compgen -d -- ${cur}) ) - return 0 - fi - - if [[ ${cur} =~ : ]]; then - COMPREPLY=( $(compgen -W "${keywords}" -- ${cur}) ) - return 0 - elif [[ ${cur} == * ]] ; then - COMPREPLY=( $(compgen -W "${opts}" -- ${cur}) ) - return 0 - fi -} - -complete -F __youtube_dl youtube-dl diff -Nru youtube-dl-2016.02.22/youtube-dl.fish youtube-dl-2019.05.20/youtube-dl.fish --- youtube-dl-2016.02.22/youtube-dl.fish 2016-02-22 10:57:44.000000000 +0000 +++ youtube-dl-2019.05.20/youtube-dl.fish 1970-01-01 00:00:00.000000000 +0000 @@ -1,146 +0,0 @@ - -complete --command youtube-dl --long-option help --short-option h --description 'Print this help text and 
exit' -complete --command youtube-dl --long-option version --description 'Print program version and exit' -complete --command youtube-dl --long-option update --short-option U --description 'Update this program to latest version. Make sure that you have sufficient permissions (run with sudo if needed)' -complete --command youtube-dl --long-option ignore-errors --short-option i --description 'Continue on download errors, for example to skip unavailable videos in a playlist' -complete --command youtube-dl --long-option abort-on-error --description 'Abort downloading of further videos (in the playlist or the command line) if an error occurs' -complete --command youtube-dl --long-option dump-user-agent --description 'Display the current browser identification' -complete --command youtube-dl --long-option list-extractors --description 'List all supported extractors' -complete --command youtube-dl --long-option extractor-descriptions --description 'Output descriptions of all supported extractors' -complete --command youtube-dl --long-option force-generic-extractor --description 'Force extraction to use the generic extractor' -complete --command youtube-dl --long-option default-search --description 'Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for youtube-dl "large apple". Use the value "auto" to let youtube-dl guess ("auto_warning" to emit a warning when guessing). "error" just throws an error. The default value "fixup_error" repairs broken URLs, but emits an error if this is not possible instead of searching.' -complete --command youtube-dl --long-option ignore-config --description 'Do not read configuration files. When given in the global configuration file /etc/youtube-dl.conf: Do not read the user configuration in ~/.config/youtube-dl/config (%APPDATA%/youtube-dl/config.txt on Windows)' -complete --command youtube-dl --long-option flat-playlist --description 'Do not extract the videos of a playlist, only list them.' -complete --command youtube-dl --long-option no-color --description 'Do not emit color codes in output' -complete --command youtube-dl --long-option proxy --description 'Use the specified HTTP/HTTPS proxy. Pass in an empty string (--proxy "") for direct connection' -complete --command youtube-dl --long-option socket-timeout --description 'Time to wait before giving up, in seconds' -complete --command youtube-dl --long-option source-address --description 'Client-side IP address to bind to (experimental)' -complete --command youtube-dl --long-option force-ipv4 --short-option 4 --description 'Make all connections via IPv4 (experimental)' -complete --command youtube-dl --long-option force-ipv6 --short-option 6 --description 'Make all connections via IPv6 (experimental)' -complete --command youtube-dl --long-option cn-verification-proxy --description 'Use this proxy to verify the IP address for some Chinese sites. The default proxy specified by --proxy (or none, if the option is not present) is used for the actual downloading. (experimental)' -complete --command youtube-dl --long-option playlist-start --description 'Playlist video to start at (default is %default)' -complete --command youtube-dl --long-option playlist-end --description 'Playlist video to end at (default is last)' -complete --command youtube-dl --long-option playlist-items --description 'Playlist video items to download.
Specify indices of the videos in the playlist separated by commas like: "--playlist-items 1,2,5,8" if you want to download videos indexed 1, 2, 5, 8 in the playlist. You can specify a range: "--playlist-items 1-3,7,10-13", it will download the videos at index 1, 2, 3, 7, 10, 11, 12 and 13.' -complete --command youtube-dl --long-option match-title --description 'Download only matching titles (regex or caseless sub-string)' -complete --command youtube-dl --long-option reject-title --description 'Skip download for matching titles (regex or caseless sub-string)' -complete --command youtube-dl --long-option max-downloads --description 'Abort after downloading NUMBER files' -complete --command youtube-dl --long-option min-filesize --description 'Do not download any videos smaller than SIZE (e.g. 50k or 44.6m)' -complete --command youtube-dl --long-option max-filesize --description 'Do not download any videos larger than SIZE (e.g. 50k or 44.6m)' -complete --command youtube-dl --long-option date --description 'Download only videos uploaded on this date' -complete --command youtube-dl --long-option datebefore --description 'Download only videos uploaded on or before this date (i.e. inclusive)' -complete --command youtube-dl --long-option dateafter --description 'Download only videos uploaded on or after this date (i.e. inclusive)' -complete --command youtube-dl --long-option min-views --description 'Do not download any videos with less than COUNT views' -complete --command youtube-dl --long-option max-views --description 'Do not download any videos with more than COUNT views' -complete --command youtube-dl --long-option match-filter --description 'Generic video filter (experimental). Specify any key (see help for -o for a list of available keys) to match if the key is present, !key to check if the key is not present, key > NUMBER (like "comment_count > 12", also works with >=, <, <=, !=, =) to compare against a number, and & to require multiple matches. Values which are not known are excluded unless you put a question mark (?) after the operator. For example, to only match videos that have been liked more than 100 times and disliked less than 50 times (or the dislike functionality is not available at the given service), but which also have a description, use --match-filter "like_count > 100 & dislike_count <? 50 & description" .' -complete --command youtube-dl --long-option no-playlist --description 'Download only the video, if the URL refers to a video and a playlist.' -complete --command youtube-dl --long-option yes-playlist --description 'Download the playlist, if the URL refers to a video and a playlist.' -complete --command youtube-dl --long-option age-limit --description 'Download only videos suitable for the given age' -complete --command youtube-dl --long-option download-archive --description 'Download only videos not listed in the archive file. Record the IDs of all downloaded videos in it.' --require-parameter -complete --command youtube-dl --long-option include-ads --description 'Download advertisements as well (experimental)' -complete --command youtube-dl --long-option rate-limit --short-option r --description 'Maximum download rate in bytes per second (e.g. 50K or 4.2M)' -complete --command youtube-dl --long-option retries --short-option R --description 'Number of retries (default is %default), or "infinite".' -complete --command youtube-dl --long-option buffer-size --description 'Size of download buffer (e.g.
1024 or 16K) (default is %default)' -complete --command youtube-dl --long-option no-resize-buffer --description 'Do not automatically adjust the buffer size. By default, the buffer size is automatically resized from an initial value of SIZE.' -complete --command youtube-dl --long-option test -complete --command youtube-dl --long-option playlist-reverse --description 'Download playlist videos in reverse order' -complete --command youtube-dl --long-option xattr-set-filesize --description 'Set file xattribute ytdl.filesize with expected filesize (experimental)' -complete --command youtube-dl --long-option hls-prefer-native --description 'Use the native HLS downloader instead of ffmpeg (experimental)' -complete --command youtube-dl --long-option hls-use-mpegts --description 'Use the mpegts container for HLS videos, allowing you to play the video while downloading (some players may not be able to play it)' -complete --command youtube-dl --long-option external-downloader --description 'Use the specified external downloader. Currently supports aria2c, axel, curl, httpie, wget' -complete --command youtube-dl --long-option external-downloader-args --description 'Give these arguments to the external downloader' -complete --command youtube-dl --long-option batch-file --short-option a --description 'File containing URLs to download ('"'"'-'"'"' for stdin)' --require-parameter -complete --command youtube-dl --long-option id --description 'Use only video ID in file name' -complete --command youtube-dl --long-option output --short-option o --description 'Output filename template. Use %(title)s to get the title, %(uploader)s for the uploader name, %(uploader_id)s for the uploader nickname if different, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(format)s for the format description (like "22 - 1280x720" or "HD"), %(format_id)s for the unique id of the format (like YouTube'"'"'s itags: "137"), %(upload_date)s for the upload date (YYYYMMDD), %(extractor)s for the provider (youtube, metacafe, etc), %(id)s for the video id, %(playlist_title)s, %(playlist_id)s, or %(playlist)s (=title if present, ID otherwise) for the playlist the video is in, %(playlist_index)s for the position in the playlist. %(height)s and %(width)s for the height and width of the video format. %(resolution)s for a textual description of the resolution of the video format. %% for a literal percent. Use - to output to stdout. Can also be used to download to a different directory, for example with -o '"'"'/my/downloads/%(uploader)s/%(title)s-%(id)s.%(ext)s'"'"' .'
-complete --command youtube-dl --long-option autonumber-size --description 'Specify the number of digits in %(autonumber)s when it is present in output filename template or --auto-number option is given' -complete --command youtube-dl --long-option restrict-filenames --description 'Restrict filenames to only ASCII characters, and avoid "&" and spaces in filenames' -complete --command youtube-dl --long-option auto-number --short-option A --description '[deprecated; use -o "%(autonumber)s-%(title)s.%(ext)s" ] Number downloaded files starting from 00000' -complete --command youtube-dl --long-option title --short-option t --description '[deprecated] Use title in file name (default)' -complete --command youtube-dl --long-option literal --short-option l --description '[deprecated] Alias of --title' -complete --command youtube-dl --long-option no-overwrites --short-option w --description 'Do not overwrite files' -complete --command youtube-dl --long-option continue --short-option c --description 'Force resume of partially downloaded files. By default, youtube-dl will resume downloads if possible.' -complete --command youtube-dl --long-option no-continue --description 'Do not resume partially downloaded files (restart from beginning)' -complete --command youtube-dl --long-option no-part --description 'Do not use .part files - write directly into output file' -complete --command youtube-dl --long-option no-mtime --description 'Do not use the Last-modified header to set the file modification time' -complete --command youtube-dl --long-option write-description --description 'Write video description to a .description file' -complete --command youtube-dl --long-option write-info-json --description 'Write video metadata to a .info.json file' -complete --command youtube-dl --long-option write-annotations --description 'Write video annotations to a .annotations.xml file' -complete --command youtube-dl --long-option load-info --description 'JSON file containing the video information (created with the "--write-info-json" option)' --require-parameter -complete --command youtube-dl --long-option cookies --description 'File to read cookies from and dump cookie jar in' --require-parameter -complete --command youtube-dl --long-option cache-dir --description 'Location in the filesystem where youtube-dl can store some downloaded information permanently. By default $XDG_CACHE_HOME/youtube-dl or ~/.cache/youtube-dl . At the moment, only YouTube player files (for videos with obfuscated signatures) are cached, but that may change.' 
-complete --command youtube-dl --long-option no-cache-dir --description 'Disable filesystem caching' -complete --command youtube-dl --long-option rm-cache-dir --description 'Delete all filesystem cache files' -complete --command youtube-dl --long-option write-thumbnail --description 'Write thumbnail image to disk' -complete --command youtube-dl --long-option write-all-thumbnails --description 'Write all thumbnail image formats to disk' -complete --command youtube-dl --long-option list-thumbnails --description 'Simulate and list all available thumbnail formats' -complete --command youtube-dl --long-option quiet --short-option q --description 'Activate quiet mode' -complete --command youtube-dl --long-option no-warnings --description 'Ignore warnings' -complete --command youtube-dl --long-option simulate --short-option s --description 'Do not download the video and do not write anything to disk' -complete --command youtube-dl --long-option skip-download --description 'Do not download the video' -complete --command youtube-dl --long-option get-url --short-option g --description 'Simulate, quiet but print URL' -complete --command youtube-dl --long-option get-title --short-option e --description 'Simulate, quiet but print title' -complete --command youtube-dl --long-option get-id --description 'Simulate, quiet but print id' -complete --command youtube-dl --long-option get-thumbnail --description 'Simulate, quiet but print thumbnail URL' -complete --command youtube-dl --long-option get-description --description 'Simulate, quiet but print video description' -complete --command youtube-dl --long-option get-duration --description 'Simulate, quiet but print video length' -complete --command youtube-dl --long-option get-filename --description 'Simulate, quiet but print output filename' -complete --command youtube-dl --long-option get-format --description 'Simulate, quiet but print output format' -complete --command youtube-dl --long-option dump-json --short-option j --description 'Simulate, quiet but print JSON information. See --output for a description of available keys.' -complete --command youtube-dl --long-option dump-single-json --short-option J --description 'Simulate, quiet but print JSON information for each command-line argument. If the URL refers to a playlist, dump the whole playlist information in a single line.' -complete --command youtube-dl --long-option print-json --description 'Be quiet and print the video information as JSON (video is still being downloaded).' 
-complete --command youtube-dl --long-option newline --description 'Output progress bar as new lines' -complete --command youtube-dl --long-option no-progress --description 'Do not print progress bar' -complete --command youtube-dl --long-option console-title --description 'Display progress in console titlebar' -complete --command youtube-dl --long-option verbose --short-option v --description 'Print various debugging information' -complete --command youtube-dl --long-option dump-pages --description 'Print downloaded pages encoded using base64 to debug problems (very verbose)' -complete --command youtube-dl --long-option write-pages --description 'Write downloaded intermediary pages to files in the current directory to debug problems' -complete --command youtube-dl --long-option youtube-print-sig-code -complete --command youtube-dl --long-option print-traffic --description 'Display sent and read HTTP traffic' -complete --command youtube-dl --long-option call-home --short-option C --description 'Contact the youtube-dl server for debugging' -complete --command youtube-dl --long-option no-call-home --description 'Do NOT contact the youtube-dl server for debugging' -complete --command youtube-dl --long-option encoding --description 'Force the specified encoding (experimental)' -complete --command youtube-dl --long-option no-check-certificate --description 'Suppress HTTPS certificate validation' -complete --command youtube-dl --long-option prefer-insecure --description 'Use an unencrypted connection to retrieve information about the video. (Currently supported only for YouTube)' -complete --command youtube-dl --long-option user-agent --description 'Specify a custom user agent' -complete --command youtube-dl --long-option referer --description 'Specify a custom referer, use if the video access is restricted to one domain' -complete --command youtube-dl --long-option add-header --description 'Specify a custom HTTP header and its value, separated by a colon '"'"':'"'"'. You can use this option multiple times' -complete --command youtube-dl --long-option bidi-workaround --description 'Work around terminals that lack bidirectional text support. Requires bidiv or fribidi executable in PATH' -complete --command youtube-dl --long-option sleep-interval --description 'Number of seconds to sleep before each download.' -complete --command youtube-dl --long-option format --short-option f --description 'Video format code, see the "FORMAT SELECTION" for all the info' -complete --command youtube-dl --long-option all-formats --description 'Download all available video formats' -complete --command youtube-dl --long-option prefer-free-formats --description 'Prefer free video formats unless a specific one is requested' -complete --command youtube-dl --long-option list-formats --short-option F --description 'List all available formats of requested videos' -complete --command youtube-dl --long-option youtube-include-dash-manifest -complete --command youtube-dl --long-option youtube-skip-dash-manifest --description 'Do not download the DASH manifests and related data on YouTube videos' -complete --command youtube-dl --long-option merge-output-format --description 'If a merge is required (e.g. bestvideo+bestaudio), output to given container format. One of mkv, mp4, ogg, webm, flv. 
Ignored if no merge is required' -complete --command youtube-dl --long-option write-sub --description 'Write subtitle file' -complete --command youtube-dl --long-option write-auto-sub --description 'Write automatically generated subtitle file (YouTube only)' -complete --command youtube-dl --long-option all-subs --description 'Download all the available subtitles of the video' -complete --command youtube-dl --long-option list-subs --description 'List all available subtitles for the video' -complete --command youtube-dl --long-option sub-format --description 'Subtitle format, accepts formats preference, for example: "srt" or "ass/srt/best"' -complete --command youtube-dl --long-option sub-lang --description 'Languages of the subtitles to download (optional) separated by commas, use --list-subs for available language tags' -complete --command youtube-dl --long-option username --short-option u --description 'Login with this account ID' -complete --command youtube-dl --long-option password --short-option p --description 'Account password. If this option is left out, youtube-dl will ask interactively.' -complete --command youtube-dl --long-option twofactor --short-option 2 --description 'Two-factor auth code' -complete --command youtube-dl --long-option netrc --short-option n --description 'Use .netrc authentication data' -complete --command youtube-dl --long-option video-password --description 'Video password (vimeo, smotri, youku)' -complete --command youtube-dl --long-option extract-audio --short-option x --description 'Convert video files to audio-only files (requires ffmpeg or avconv and ffprobe or avprobe)' -complete --command youtube-dl --long-option audio-format --description 'Specify audio format: "best", "aac", "vorbis", "mp3", "m4a", "opus", or "wav"; "%default" by default' -complete --command youtube-dl --long-option audio-quality --description 'Specify ffmpeg/avconv audio quality, insert a value between 0 (better) and 9 (worse) for VBR or a specific bitrate like 128K (default %default)' -complete --command youtube-dl --long-option recode-video --description 'Encode the video to another format if necessary (currently supported: mp4|flv|ogg|webm|mkv|avi)' --arguments 'mp4 flv ogg webm mkv' --exclusive -complete --command youtube-dl --long-option postprocessor-args --description 'Give these arguments to the postprocessor' -complete --command youtube-dl --long-option keep-video --short-option k --description 'Keep the video file on disk after the post-processing; the video is erased by default' -complete --command youtube-dl --long-option no-post-overwrites --description 'Do not overwrite post-processed files; the post-processed files are overwritten by default' -complete --command youtube-dl --long-option embed-subs --description 'Embed subtitles in the video (only for mkv and mp4 videos)' -complete --command youtube-dl --long-option embed-thumbnail --description 'Embed thumbnail in the audio as cover art' -complete --command youtube-dl --long-option add-metadata --description 'Write metadata to the video file' -complete --command youtube-dl --long-option metadata-from-title --description 'Parse additional metadata like song title / artist from the video title. The format syntax is the same as --output, the parsed parameters replace existing values. Additional templates: %(album)s, %(artist)s. 
Example: --metadata-from-title "%(artist)s - %(title)s" matches a title like "Coldplay - Paradise"' -complete --command youtube-dl --long-option xattrs --description 'Write metadata to the video file'"'"'s xattrs (using dublin core and xdg standards)' -complete --command youtube-dl --long-option fixup --description 'Automatically correct known faults of the file. One of never (do nothing), warn (only emit a warning), detect_or_warn (the default; fix file if we can, warn otherwise)' -complete --command youtube-dl --long-option prefer-avconv --description 'Prefer avconv over ffmpeg for running the postprocessors (default)' -complete --command youtube-dl --long-option prefer-ffmpeg --description 'Prefer ffmpeg over avconv for running the postprocessors' -complete --command youtube-dl --long-option ffmpeg-location --description 'Location of the ffmpeg/avconv binary; either the path to the binary or its containing directory.' -complete --command youtube-dl --long-option exec --description 'Execute a command on the file after downloading, similar to find'"'"'s -exec syntax. Example: --exec '"'"'adb push {} /sdcard/Music/ && rm {}'"'"'' -complete --command youtube-dl --long-option convert-subs --description 'Convert the subtitles to another format (currently supported: srt|ass|vtt)' - - -complete --command youtube-dl --arguments ":ytfavorites :ytrecommended :ytsubscriptions :ytwatchlater :ythistory" diff -Nru youtube-dl-2016.02.22/youtube-dl.plugin.zsh youtube-dl-2019.05.20/youtube-dl.plugin.zsh --- youtube-dl-2016.02.22/youtube-dl.plugin.zsh 1970-01-01 00:00:00.000000000 +0000 +++ youtube-dl-2019.05.20/youtube-dl.plugin.zsh 2019-06-07 18:49:07.000000000 +0000 @@ -0,0 +1,24 @@ +# This allows the youtube-dl command to be installed in ZSH using antigen. +# Antigen is a bundle manager. It allows you to enhance the functionality of +# your zsh session by installing bundles and themes easily. + +# Antigen documentation: +# http://antigen.sharats.me/ +# https://github.com/zsh-users/antigen + +# Install youtube-dl: +# antigen bundle ytdl-org/youtube-dl +# Bundles installed by antigen are available for use immediately. + +# Update youtube-dl (and all other antigen bundles): +# antigen update + +# The antigen command will download the git repository to a folder and then +# execute an enabling script (this file). The complete process for loading the +# code is documented here: +# https://github.com/zsh-users/antigen#notes-on-writing-plugins + +# This specific script just aliases youtube-dl to the python script that this +# library provides. This requires updating the PYTHONPATH to ensure that the +# full set of code can be located.
+alias youtube-dl="PYTHONPATH=$(dirname $0) $(dirname $0)/bin/youtube-dl" diff -Nru youtube-dl-2016.02.22/youtube-dl.zsh youtube-dl-2019.05.20/youtube-dl.zsh --- youtube-dl-2016.02.22/youtube-dl.zsh 2016-02-22 10:57:43.000000000 +0000 +++ youtube-dl-2019.05.20/youtube-dl.zsh 1970-01-01 00:00:00.000000000 +0000 @@ -1,28 +0,0 @@ -#compdef youtube-dl - -__youtube_dl() { - local curcontext="$curcontext" fileopts diropts cur prev - typeset -A opt_args - fileopts="--download-archive|-a|--batch-file|--load-info|--cookies" - diropts="--cache-dir" - cur=$words[CURRENT] - case $cur in - :) - _arguments '*: :(::ytfavorites ::ytrecommended ::ytsubscriptions ::ytwatchlater ::ythistory)' - ;; - *) - prev=$words[CURRENT-1] - if [[ ${prev} =~ ${fileopts} ]]; then - _path_files - elif [[ ${prev} =~ ${diropts} ]]; then - _path_files -/ - elif [[ ${prev} == "--recode-video" ]]; then - _arguments '*: :(mp4 flv ogg webm mkv)' - else - _arguments '*: :(--help --version --update --ignore-errors --abort-on-error --dump-user-agent --list-extractors --extractor-descriptions --force-generic-extractor --default-search --ignore-config --flat-playlist --no-color --proxy --socket-timeout --source-address --force-ipv4 --force-ipv6 --cn-verification-proxy --playlist-start --playlist-end --playlist-items --match-title --reject-title --max-downloads --min-filesize --max-filesize --date --datebefore --dateafter --min-views --max-views --match-filter --no-playlist --yes-playlist --age-limit --download-archive --include-ads --rate-limit --retries --buffer-size --no-resize-buffer --test --playlist-reverse --xattr-set-filesize --hls-prefer-native --hls-use-mpegts --external-downloader --external-downloader-args --batch-file --id --output --autonumber-size --restrict-filenames --auto-number --title --literal --no-overwrites --continue --no-continue --no-part --no-mtime --write-description --write-info-json --write-annotations --load-info --cookies --cache-dir --no-cache-dir --rm-cache-dir --write-thumbnail --write-all-thumbnails --list-thumbnails --quiet --no-warnings --simulate --skip-download --get-url --get-title --get-id --get-thumbnail --get-description --get-duration --get-filename --get-format --dump-json --dump-single-json --print-json --newline --no-progress --console-title --verbose --dump-pages --write-pages --youtube-print-sig-code --print-traffic --call-home --no-call-home --encoding --no-check-certificate --prefer-insecure --user-agent --referer --add-header --bidi-workaround --sleep-interval --format --all-formats --prefer-free-formats --list-formats --youtube-include-dash-manifest --youtube-skip-dash-manifest --merge-output-format --write-sub --write-auto-sub --all-subs --list-subs --sub-format --sub-lang --username --password --twofactor --netrc --video-password --extract-audio --audio-format --audio-quality --recode-video --postprocessor-args --keep-video --no-post-overwrites --embed-subs --embed-thumbnail --add-metadata --metadata-from-title --xattrs --fixup --prefer-avconv --prefer-ffmpeg --ffmpeg-location --exec --convert-subs)' - fi - ;; - esac -} - -__youtube_dl \ No newline at end of file
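The zsh plugin added above works by aliasing youtube-dl to the copy shipped inside the antigen-managed checkout, with PYTHONPATH pointed at the checkout so the youtube_dl package resolves. The same wiring can be reproduced by hand in any Bourne-style shell; a sketch, where ~/src/youtube-dl is a hypothetical clone location, not a path taken from this package:

    # Hypothetical checkout path; substitute wherever the repository was cloned.
    REPO="$HOME/src/youtube-dl"
    # Make the youtube_dl package importable and run the bundled entry script,
    # mirroring what youtube-dl.plugin.zsh does with $(dirname $0).
    alias youtube-dl="PYTHONPATH=$REPO $REPO/bin/youtube-dl"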