diff -Nru gallery-dl-1.23.2/CHANGELOG.md gallery-dl-1.24.0/CHANGELOG.md --- gallery-dl-1.23.2/CHANGELOG.md 2022-10-01 11:22:01.000000000 +0000 +++ gallery-dl-1.24.0/CHANGELOG.md 2022-11-20 14:36:04.000000000 +0000 @@ -1,5 +1,121 @@ # Changelog +## 1.24.0 - 2022-11-20 +### Additions +- [exhentai] add metadata to search results ([#3181](https://github.com/mikf/gallery-dl/issues/3181)) +- [gelbooru_v02] implement `notes` extraction +- [instagram] add `guide` extractor ([#3192](https://github.com/mikf/gallery-dl/issues/3192)) +- [lolisafe] add support for xbunkr ([#3153](https://github.com/mikf/gallery-dl/issues/3153), [#3156](https://github.com/mikf/gallery-dl/issues/3156)) +- [mastodon] add `instance_remote` metadata field ([#3119](https://github.com/mikf/gallery-dl/issues/3119)) +- [nitter] add extractors for Nitter instances ([#2415](https://github.com/mikf/gallery-dl/issues/2415), [#2696](https://github.com/mikf/gallery-dl/issues/2696)) +- [pixiv] add support for new daily AI rankings category ([#3214](https://github.com/mikf/gallery-dl/issues/3214), [#3221](https://github.com/mikf/gallery-dl/issues/3221)) +- [twitter] add `avatar` and `background` extractors ([#349](https://github.com/mikf/gallery-dl/issues/349), [#3023](https://github.com/mikf/gallery-dl/issues/3023)) +- [uploadir] add support for `uploadir.com` ([#3162](https://github.com/mikf/gallery-dl/issues/3162)) +- [wallhaven] add `user` extractor ([#3212](https://github.com/mikf/gallery-dl/issues/3212), [#3213](https://github.com/mikf/gallery-dl/issues/3213), [#3226](https://github.com/mikf/gallery-dl/issues/3226)) +- [downloader:http] add `chunk-size` option ([#3143](https://github.com/mikf/gallery-dl/issues/3143)) +- [downloader:http] add file signature check for `.mp4` files +- [downloader:http] add file signature check and MIME type for `.avif` files +- [postprocessor] implement `post-after` event ([#3117](https://github.com/mikf/gallery-dl/issues/3117)) +- [postprocessor:metadata] implement `"mode": "jsonl"` +- [postprocessor:metadata] add `open`, `encoding`, and `private` options +- add `--chunk-size` command-line option ([#3143](https://github.com/mikf/gallery-dl/issues/3143)) +- add `--user-agent` command-line option +- implement `http-metadata` option +- implement `"user-agent": "browser"` ([#2636](https://github.com/mikf/gallery-dl/issues/2636)) +### Changes +- [deviantart] restore cookies warning for mature scraps ([#3129](https://github.com/mikf/gallery-dl/issues/3129)) +- [instagram] use REST API for unauthenticated users by default +- [downloader:http] increase default `chunk-size` to 32768 bytes ([#3143](https://github.com/mikf/gallery-dl/issues/3143)) +- build Windows executables using py2exe's new `freeze()` API +- build executables on GitHub Actions with Python 3.11 +- reword error text for unsupported URLs +### Fixes +- [exhentai] fix pagination ([#3181](https://github.com/mikf/gallery-dl/issues/3181)) +- [khinsider] fix extraction ([#3215](https://github.com/mikf/gallery-dl/issues/3215), [#3219](https://github.com/mikf/gallery-dl/issues/3219)) +- [realbooru] fix download URLs ([#2530](https://github.com/mikf/gallery-dl/issues/2530)) +- [realbooru] fix `tags` extraction ([#2530](https://github.com/mikf/gallery-dl/issues/2530)) +- [tumblr] fall back to `gifv` when possible ([#3095](https://github.com/mikf/gallery-dl/issues/3095), [#3159](https://github.com/mikf/gallery-dl/issues/3159)) +- [twitter] fix login ([#3220](https://github.com/mikf/gallery-dl/issues/3220)) +- [twitter] update URL for syndication API 
([#3160](https://github.com/mikf/gallery-dl/issues/3160)) +- [weibo] send `Referer` headers ([#3188](https://github.com/mikf/gallery-dl/issues/3188)) +- [ytdl] update `parse_bytes` location ([#3256](https://github.com/mikf/gallery-dl/issues/3256)) +### Improvements +- [imxto] extract additional metadata ([#3118](https://github.com/mikf/gallery-dl/issues/3118), [#3175](https://github.com/mikf/gallery-dl/issues/3175)) +- [instagram] allow downloading avatars for private profiles ([#3255](https://github.com/mikf/gallery-dl/issues/3255)) +- [pixiv] raise error for invalid search/ranking parameters ([#3214](https://github.com/mikf/gallery-dl/issues/3214)) +- [twitter] update `bookmarks` pagination ([#3172](https://github.com/mikf/gallery-dl/issues/3172)) +- [downloader:http] refactor file signature checks +- [downloader:http] improve `-r/--limit-rate` accuracy ([#3143](https://github.com/mikf/gallery-dl/issues/3143)) +- add loaded config files to debug output +- improve `-K` output for lists +### Removals +- [instagram] remove login support ([#3139](https://github.com/mikf/gallery-dl/issues/3139), [#3141](https://github.com/mikf/gallery-dl/issues/3141), [#3191](https://github.com/mikf/gallery-dl/issues/3191)) +- [instagram] remove `channel` extractor +- [ngomik] remove module + +## 1.23.5 - 2022-10-30 +### Fixes +- [instagram] fix AttributeError on user stories extraction ([#3123](https://github.com/mikf/gallery-dl/issues/3123)) + +## 1.23.4 - 2022-10-29 +### Additions +- [aibooru] add support for aibooru.online ([#3075](https://github.com/mikf/gallery-dl/issues/3075)) +- [instagram] add 'avatar' extractor ([#929](https://github.com/mikf/gallery-dl/issues/929), [#1097](https://github.com/mikf/gallery-dl/issues/1097), [#2992](https://github.com/mikf/gallery-dl/issues/2992)) +- [instagram] support 'instagram.com/s/' highlight URLs ([#3076](https://github.com/mikf/gallery-dl/issues/3076)) +- [instagram] extract 'coauthors' metadata ([#3107](https://github.com/mikf/gallery-dl/issues/3107)) +- [mangasee] add support for 'mangalife' ([#3086](https://github.com/mikf/gallery-dl/issues/3086)) +- [mastodon] add 'bookmark' extractor ([#3109](https://github.com/mikf/gallery-dl/issues/3109)) +- [mastodon] support cross-instance user references and '/web/' URLs ([#3109](https://github.com/mikf/gallery-dl/issues/3109)) +- [moebooru] implement 'notes' extraction ([#3094](https://github.com/mikf/gallery-dl/issues/3094)) +- [pixiv] extend 'metadata' option ([#3057](https://github.com/mikf/gallery-dl/issues/3057)) +- [reactor] match 'best', 'new', 'all' URLs ([#3073](https://github.com/mikf/gallery-dl/issues/3073)) +- [smugloli] add 'smugloli' extractors ([#3060](https://github.com/mikf/gallery-dl/issues/3060)) +- [tumblr] add 'fallback-delay' and 'fallback-retries' options ([#2957](https://github.com/mikf/gallery-dl/issues/2957)) +- [vichan] add generic extractors for vichan imageboards +### Fixes +- [bcy] fix extraction ([#3103](https://github.com/mikf/gallery-dl/issues/3103)) +- [gelbooru] support alternate parameter order in post URLs ([#2821](https://github.com/mikf/gallery-dl/issues/2821)) +- [hentai2read] support minor versions in chapter URLs ([#3089](https://github.com/mikf/gallery-dl/issues/3089)) +- [hentaihere] support minor versions in chapter URLs +- [kemonoparty] fix 'dms' extraction ([#3106](https://github.com/mikf/gallery-dl/issues/3106)) +- [kemonoparty] update pagination offset +- [manganelo] update domain to 'chapmanganato.com' ([#3097](https://github.com/mikf/gallery-dl/issues/3097)) +- 
[pixiv] use 'exact_match_for_tags' as default search mode ([#3092](https://github.com/mikf/gallery-dl/issues/3092)) +- [redgifs] fix 'token' extraction ([#3080](https://github.com/mikf/gallery-dl/issues/3080), [#3081](https://github.com/mikf/gallery-dl/issues/3081)) +- [skeb] fix extraction ([#3112](https://github.com/mikf/gallery-dl/issues/3112)) +- improve compatibility of DownloadArchive ([#3078](https://github.com/mikf/gallery-dl/issues/3078)) + +## 1.23.3 - 2022-10-15 +### Additions +- [2chen] Add `2chen.moe` extractor ([#2707](https://github.com/mikf/gallery-dl/issues/2707)) +- [8chan] add `thread` and `board` extractors ([#2938](https://github.com/mikf/gallery-dl/issues/2938)) +- [deviantart] add `group` option ([#3018](https://github.com/mikf/gallery-dl/issues/3018)) +- [fanbox] add `content` metadata field ([#3020](https://github.com/mikf/gallery-dl/issues/3020)) +- [instagram] restore `cursor` functionality ([#2991](https://github.com/mikf/gallery-dl/issues/2991)) +- [instagram] restore warnings for private profiles ([#3004](https://github.com/mikf/gallery-dl/issues/3004), [#3045](https://github.com/mikf/gallery-dl/issues/3045)) +- [nana] add `nana` extractors ([#2967](https://github.com/mikf/gallery-dl/issues/2967)) +- [nijie] add `feed` and `followed` extractors ([#3048](https://github.com/mikf/gallery-dl/issues/3048)) +- [tumblr] support `https://www.tumblr.com/BLOGNAME` URLs ([#3034](https://github.com/mikf/gallery-dl/issues/3034)) +- [tumblr] add `offset` option +- [vk] add `tagged` extractor ([#2997](https://github.com/mikf/gallery-dl/issues/2997)) +- add `path-extended` option ([#3021](https://github.com/mikf/gallery-dl/issues/3021)) +- emit debug logging messages before calling time.sleep() ([#2982](https://github.com/mikf/gallery-dl/issues/2982)) +### Changes +- [postprocessor:metadata] assume `"mode": "custom"` when `format` is given +### Fixes +- [artstation] skip missing projects ([#3016](https://github.com/mikf/gallery-dl/issues/3016)) +- [danbooru] fix ugoira metadata extraction ([#3056](https://github.com/mikf/gallery-dl/issues/3056)) +- [deviantart] fix `deviation` extraction ([#2981](https://github.com/mikf/gallery-dl/issues/2981)) +- [hitomi] fall back to `webp` when selected format is not available ([#3030](https://github.com/mikf/gallery-dl/issues/3030)) +- [imagefap] fix and improve folder extraction and gallery pagination ([#3013](https://github.com/mikf/gallery-dl/issues/3013)) +- [instagram] fix login ([#3011](https://github.com/mikf/gallery-dl/issues/3011), [#3015](https://github.com/mikf/gallery-dl/issues/3015)) +- [nozomi] fix extraction ([#3051](https://github.com/mikf/gallery-dl/issues/3051)) +- [redgifs] fix extraction ([#3037](https://github.com/mikf/gallery-dl/issues/3037)) +- [tumblr] sleep between fallback retries ([#2957](https://github.com/mikf/gallery-dl/issues/2957)) +- [vk] unescape error messages +- fix duplicated metadata bug with `-j` ([#3033](https://github.com/mikf/gallery-dl/issues/3033)) +- fix bug when processing input file comments ([#2808](https://github.com/mikf/gallery-dl/issues/2808)) + ## 1.23.2 - 2022-10-01 ### Additions - [artstation] support search filters ([#2970](https://github.com/mikf/gallery-dl/issues/2970)) diff -Nru gallery-dl-1.23.2/data/completion/_gallery-dl gallery-dl-1.24.0/data/completion/_gallery-dl --- gallery-dl-1.23.2/data/completion/_gallery-dl 2022-07-10 11:30:26.000000000 +0000 +++ gallery-dl-1.24.0/data/completion/_gallery-dl 2022-11-11 19:20:08.000000000 +0000 @@ -13,6 +13,7 @@ {-f,--filename}'[Filename 
format string for downloaded files ("/O" for "original" filenames)]':'' \ --proxy'[Use the specified proxy]':'' \ --source-address'[Client-side IP address to bind to]':'' \ +--user-agent'[User-Agent request header]':'' \ --clear-cache'[Delete cached login sessions, cookies, etc. for MODULE (ALL to delete everything)]':'' \ --cookies'[File to load additional cookies from]':'':_files \ --cookies-from-browser'[Name of the browser to load cookies from, with optional keyring name prefixed with "+" and profile prefixed with ":"]':'' \ @@ -37,6 +38,7 @@ --sleep-extractor'[Number of seconds to wait before starting data extraction for an input URL]':'' \ --filesize-min'[Do not download files smaller than SIZE (e.g. 500k or 2.5M)]':'' \ --filesize-max'[Do not download files larger than SIZE (e.g. 500k or 2.5M)]':'' \ +--chunk-size'[Size of in-memory data chunks (default: 32k)]':'' \ --no-part'[Do not use .part files]' \ --no-skip'[Do not skip downloads; overwrite existing files]' \ --no-mtime'[Do not set file modification times according to Last-Modified HTTP response headers]' \ diff -Nru gallery-dl-1.23.2/data/completion/gallery-dl gallery-dl-1.24.0/data/completion/gallery-dl --- gallery-dl-1.23.2/data/completion/gallery-dl 2022-07-10 11:30:26.000000000 +0000 +++ gallery-dl-1.24.0/data/completion/gallery-dl 2022-11-11 19:20:08.000000000 +0000 @@ -10,7 +10,7 @@ elif [[ "${prev}" =~ ^()$ ]]; then COMPREPLY=( $(compgen -d -- "${cur}") ) else - COMPREPLY=( $(compgen -W "--help --version --input-file --destination --directory --filename --proxy --source-address --clear-cache --cookies --cookies-from-browser --quiet --verbose --get-urls --resolve-urls --dump-json --simulate --extractor-info --list-keywords --list-modules --list-extractors --write-log --write-unsupported --write-pages --limit-rate --retries --http-timeout --sleep --sleep-request --sleep-extractor --filesize-min --filesize-max --no-part --no-skip --no-mtime --no-download --no-postprocessors --no-check-certificate --config --config-yaml --option --ignore-config --username --password --netrc --download-archive --abort --terminate --range --chapter-range --filter --chapter-filter --zip --ugoira-conv --ugoira-conv-lossless --ugoira-conv-copy --write-metadata --write-info-json --write-infojson --write-tags --mtime-from-date --exec --exec-after --postprocessor" -- "${cur}") ) + COMPREPLY=( $(compgen -W "--help --version --input-file --destination --directory --filename --proxy --source-address --user-agent --clear-cache --cookies --cookies-from-browser --quiet --verbose --get-urls --resolve-urls --dump-json --simulate --extractor-info --list-keywords --list-modules --list-extractors --write-log --write-unsupported --write-pages --limit-rate --retries --http-timeout --sleep --sleep-request --sleep-extractor --filesize-min --filesize-max --chunk-size --no-part --no-skip --no-mtime --no-download --no-postprocessors --no-check-certificate --config --config-yaml --option --ignore-config --username --password --netrc --download-archive --abort --terminate --range --chapter-range --filter --chapter-filter --zip --ugoira-conv --ugoira-conv-lossless --ugoira-conv-copy --write-metadata --write-info-json --write-infojson --write-tags --mtime-from-date --exec --exec-after --postprocessor" -- "${cur}") ) fi } diff -Nru gallery-dl-1.23.2/data/completion/gallery-dl.fish gallery-dl-1.24.0/data/completion/gallery-dl.fish --- gallery-dl-1.23.2/data/completion/gallery-dl.fish 2022-07-10 11:30:26.000000000 +0000 +++ 
gallery-dl-1.24.0/data/completion/gallery-dl.fish 2022-11-11 19:20:08.000000000 +0000 @@ -7,6 +7,7 @@ complete -c gallery-dl -x -s 'f' -l 'filename' -d 'Filename format string for downloaded files ("/O" for "original" filenames)' complete -c gallery-dl -x -l 'proxy' -d 'Use the specified proxy' complete -c gallery-dl -x -l 'source-address' -d 'Client-side IP address to bind to' +complete -c gallery-dl -x -l 'user-agent' -d 'User-Agent request header' complete -c gallery-dl -x -l 'clear-cache' -d 'Delete cached login sessions, cookies, etc. for MODULE (ALL to delete everything)' complete -c gallery-dl -r -F -l 'cookies' -d 'File to load additional cookies from' complete -c gallery-dl -x -l 'cookies-from-browser' -d 'Name of the browser to load cookies from, with optional keyring name prefixed with "+" and profile prefixed with ":"' @@ -31,6 +32,7 @@ complete -c gallery-dl -x -l 'sleep-extractor' -d 'Number of seconds to wait before starting data extraction for an input URL' complete -c gallery-dl -x -l 'filesize-min' -d 'Do not download files smaller than SIZE (e.g. 500k or 2.5M)' complete -c gallery-dl -x -l 'filesize-max' -d 'Do not download files larger than SIZE (e.g. 500k or 2.5M)' +complete -c gallery-dl -x -l 'chunk-size' -d 'Size of in-memory data chunks (default: 32k)' complete -c gallery-dl -l 'no-part' -d 'Do not use .part files' complete -c gallery-dl -l 'no-skip' -d 'Do not skip downloads; overwrite existing files' complete -c gallery-dl -l 'no-mtime' -d 'Do not set file modification times according to Last-Modified HTTP response headers' diff -Nru gallery-dl-1.23.2/data/man/gallery-dl.1 gallery-dl-1.24.0/data/man/gallery-dl.1 --- gallery-dl-1.23.2/data/man/gallery-dl.1 2022-10-01 11:22:01.000000000 +0000 +++ gallery-dl-1.24.0/data/man/gallery-dl.1 2022-11-20 14:36:04.000000000 +0000 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL" "1" "2022-10-01" "1.23.2" "gallery-dl Manual" +.TH "GALLERY-DL" "1" "2022-11-20" "1.24.0" "gallery-dl Manual" .\" disable hyphenation .nh @@ -41,6 +41,9 @@ .B "\-\-source\-address" \f[I]IP\f[] Client-side IP address to bind to .TP +.B "\-\-user\-agent" \f[I]UA\f[] +User-Agent request header +.TP .B "\-\-clear\-cache" \f[I]MODULE\f[] Delete cached login sessions, cookies, etc. for MODULE (ALL to delete everything) .TP @@ -113,6 +116,9 @@ .B "\-\-filesize\-max" \f[I]SIZE\f[] Do not download files larger than SIZE (e.g. 500k or 2.5M) .TP +.B "\-\-chunk\-size" \f[I]SIZE\f[] +Size of in-memory data chunks (default: 32k) +.TP .B "\-\-no\-part" Do not use .part files .TP diff -Nru gallery-dl-1.23.2/data/man/gallery-dl.conf.5 gallery-dl-1.24.0/data/man/gallery-dl.conf.5 --- gallery-dl-1.23.2/data/man/gallery-dl.conf.5 2022-10-01 11:22:01.000000000 +0000 +++ gallery-dl-1.24.0/data/man/gallery-dl.conf.5 2022-11-20 14:36:04.000000000 +0000 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL.CONF" "5" "2022-10-01" "1.23.2" "gallery-dl Manual" +.TH "GALLERY-DL.CONF" "5" "2022-11-20" "1.24.0" "gallery-dl Manual" .\" disable hyphenation .nh .\" disable justification (adjust text to left margin only) @@ -310,6 +310,18 @@ * \f[I]"windows"\f[]: \f[I]". "\f[] +.SS extractor.*.path-extended +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]true\f[] + +.IP "Description:" 4 +On Windows, use \f[I]extended-length paths\f[] +prefixed with \f[I]\\\\?\\\f[] to work around the 260 characters path length limit. 
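As a point of reference, this option goes into the extractor block of a configuration file like any other path setting; a minimal sketch (the value shown is illustrative and is already the default, and the option only has an effect on Windows):

    {
        "extractor": {
            "path-extended": true
        }
    }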
+ + .SS extractor.*.extension-map .IP "Type:" 6 \f[I]object\f[] @@ -437,8 +449,6 @@ .br * \f[I]inkbunny\f[] .br -* \f[I]instagram\f[] -.br * \f[I]kemonoparty\f[] .br * \f[I]mangadex\f[] @@ -603,6 +613,9 @@ .IP "Description:" 4 User-Agent header value to be used for HTTP requests. +Setting this value to \f[I]"browser"\f[] will try to automatically detect +and use the User-Agent used by the system's default browser. + Note: This option has no effect on pixiv extractors, as these need specific values to function correctly. @@ -612,7 +625,10 @@ \f[I]string\f[] .IP "Default:" 9 -\f[I]"firefox"\f[] for \f[I]patreon\f[], \f[I]null\f[] everywhere else +.br +* \f[I]"firefox"\f[] for \f[I]patreon\f[], \f[I]mangapark\f[], and \f[I]mangasee\f[] +.br +* \f[I]null\f[] everywhere else .IP "Example:" 4 .br @@ -684,6 +700,23 @@ to access the current file's filename as \f[I]"[gdl_path.filename}"\f[]. +.SS extractor.*.http-metadata +.IP "Type:" 6 +\f[I]string\f[] + +.IP "Default:" 9 +\f[I]null\f[] + +.IP "Description:" 4 +Insert an \f[I]object\f[] containing a file's HTTP headers and +\f[I]filename\f[], \f[I]extension\f[], and \f[I]date\f[] parsed from them +into metadata dictionaries as the given name. + +For example, setting this option to \f[I]"gdl_http"\f[] would make it possible +to access the current file's \f[I]Last-Modified\f[] header as \f[I]"{gdl_http[Last-Modified]}"\f[] +and its parsed form as \f[I]"{gdl_http[date]}"\f[]. + + .SS extractor.*.category-transfer .IP "Type:" 6 \f[I]bool\f[] @@ -1211,6 +1244,18 @@ Use with caution. +.SS extractor.deviantart.group +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]true\f[] + +.IP "Description:" 4 +Check whether the profile name in a given URL +belongs to a group or a regular user. + + .SS extractor.deviantart.include .IP "Type:" 6 \f[I]string\f[] or \f[I]list\f[] of \f[I]strings\f[] @@ -1694,17 +1739,15 @@ \f[I]string\f[] .IP "Default:" 9 -\f[I]"auto"\f[] +\f[I]"rest"\f[] .IP "Description:" 4 Selects which API endpoints to use. .br -* \f[I]"rest"\f[]: REST API - higher-resolution media, only usable when logged in -.br -* \f[I]"graphql"\f[]: GraphQL API - lower-resolution media, partially accessible when not logged in +* \f[I]"rest"\f[]: REST API - higher-resolution media .br -* \f[I]"auto"\f[]: Use REST API when logged in, GraphQL API otherwise +* \f[I]"graphql"\f[]: GraphQL API - lower-resolution media .SS extractor.instagram.include @@ -1722,8 +1765,12 @@ when processing a user profile. Possible values are -\f[I]"posts"\f[], \f[I]"reels"\f[], \f[I]"channel"\f[], \f[I]"tagged"\f[], -\f[I]"stories"\f[], \f[I]"highlights"\f[]. +\f[I]"posts"\f[], +\f[I]"reels"\f[], +\f[I]"tagged"\f[], +\f[I]"stories"\f[], +\f[I]"highlights"\f[], +\f[I]"avatar"\f[]. You can use \f[I]"all"\f[] instead of listing all values separately. @@ -1974,6 +2021,18 @@ Also emit metadata for text-only posts without media content. +.SS extractor.nana.favkey +.IP "Type:" 6 +\f[I]string\f[] + +.IP "Default:" 9 +\f[I]null\f[] + +.IP "Description:" 4 +Your \f[I]Nana Favorite Key\f[], +used to access your favorite archives. + + .SS extractor.newgrounds.flash .IP "Type:" 6 \f[I]bool\f[] @@ -2214,7 +2273,7 @@ It is possible to use \f[I]"all"\f[] instead of listing all values separately. -.SS extractor.pixiv.artworks.metadata +.SS extractor.pixiv.metadata .IP "Type:" 6 \f[I]bool\f[] @@ -2610,6 +2669,19 @@ Search posts for inline images and videos. 
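To tie the new options documented above together, a configuration sketch combining them might look as follows (the "gdl_http" name and the filename format string are illustrative choices, not defaults):

    {
        "extractor": {
            "user-agent": "browser",
            "http-metadata": "gdl_http",
            "filename": "{gdl_http[date]} {filename}.{extension}"
        }
    }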
+.SS extractor.tumblr.offset
+.IP "Type:" 6
+\f[I]integer\f[]
+
+.IP "Default:" 9
+\f[I]0\f[]
+
+.IP "Description:" 4
+Custom \f[I]offset\f[] starting value when paginating over blog posts.
+
+Allows skipping over posts without having to waste API calls.
+
+
 .SS extractor.tumblr.original
 .IP "Type:" 6
 \f[I]bool\f[]
@@ -2678,6 +2750,29 @@
 You can use \f[I]"all"\f[] instead of listing all types separately.
 
 
+.SS extractor.tumblr.fallback-delay
+.IP "Type:" 6
+\f[I]float\f[]
+
+.IP "Default:" 9
+\f[I]120.0\f[]
+
+.IP "Description:" 4
+Number of seconds to wait between retries
+for fetching full-resolution images.
+
+
+.SS extractor.tumblr.fallback-retries
+.IP "Type:" 6
+\f[I]integer\f[]
+
+.IP "Default:" 9
+\f[I]2\f[]
+
+.IP "Description:" 4
+Number of retries for fetching full-resolution images.
+
+
 .SS extractor.twibooru.api-key
 .IP "Type:" 6
 \f[I]string\f[]
@@ -3028,6 +3123,32 @@
 See https://wallhaven.cc/help/api for more information.
 
 
+.SS extractor.wallhaven.include
+.IP "Type:" 6
+.br
+* \f[I]string\f[]
+.br
+* \f[I]list\f[] of \f[I]strings\f[]
+
+.IP "Default:" 9
+\f[I]"uploads"\f[]
+
+.IP "Example:" 4
+.br
+* "uploads,collections"
+.br
+* ["uploads", "collections"]
+
+.IP "Description:" 4
+A (comma-separated) list of subcategories to include
+when processing a user profile.
+
+Possible values are
+\f[I]"uploads"\f[], \f[I]"collections"\f[].
+
+It is possible to use \f[I]"all"\f[] instead of listing all values separately.
+
+
 .SS extractor.wallhaven.metadata
 .IP "Type:" 6
 \f[I]bool\f[]
@@ -3336,7 +3457,7 @@
 Any file smaller/larger than this limit will not be downloaded.
 
 Possible values are valid integer or floating-point numbers
-optionally followed by one of \f[I]k\f[], \f[I]m\f[], \f[I]g\f[], \f[I]t\f[] or \f[I]p\f[].
+optionally followed by one of \f[I]k\f[], \f[I]m\f[], \f[I]g\f[], \f[I]t\f[], or \f[I]p\f[].
 These suffixes are case-insensitive.
@@ -3414,7 +3535,7 @@
 Maximum download rate in bytes per second.
 
 Possible values are valid integer or floating-point numbers
-optionally followed by one of \f[I]k\f[], \f[I]m\f[], \f[I]g\f[], \f[I]t\f[] or \f[I]p\f[].
+optionally followed by one of \f[I]k\f[], \f[I]m\f[], \f[I]g\f[], \f[I]t\f[], or \f[I]p\f[].
 These suffixes are case-insensitive.
@@ -3474,9 +3595,31 @@
 \f[I]true\f[]
 
 .IP "Description:" 4
-Check the file headers of \f[I]jpg\f[], \f[I]png\f[], and \f[I]gif\f[] files
+Check file headers of downloaded files
 and adjust their filename extensions if they do not match.
 
+For example, this will change the filename extension (\f[I]{extension}\f[])
+of a file called \f[I]example.png\f[] from \f[I]png\f[] to \f[I]jpg\f[] when said file
+contains JPEG/JFIF data.
+
+
+.SS downloader.http.chunk-size
+.IP "Type:" 6
+\f[I]integer\f[] or \f[I]string\f[]
+
+.IP "Default:" 9
+\f[I]32768\f[]
+
+.IP "Example:" 4
+"50k", "0.8M"
+
+.IP "Description:" 4
+Number of bytes per downloaded chunk.
+
+Possible values are integer numbers
+optionally followed by one of \f[I]k\f[], \f[I]m\f[], \f[I]g\f[], \f[I]t\f[], or \f[I]p\f[].
+These suffixes are case-insensitive.
+
 
 .SS downloader.http.headers
 .IP "Type:" 6
@@ -3928,6 +4071,9 @@
 * \f[I]"json"\f[]: write metadata using \f[I]json.dump()
 \f[]
 .br
+* \f[I]"jsonl"\f[]: write metadata in \f[I]JSON Lines
+\f[] format
+.br
 * \f[I]"tags"\f[]: write \f[I]tags\f[] separated by newlines
 .br
 * \f[I]"custom"\f[]: write the result of applying \f[I]metadata.content-format\f[]
@@ -4031,6 +4177,8 @@
 \f[I]post\f[]
 When starting to download all files of a post,
 e.g. a Tweet on Twitter or a post on Patreon.
+\f[I]post-after\f[]
+After downloading all files of a post


.SS metadata.fields
@@ -4082,6 +4230,48 @@
 Note: Only applies for \f[I]"mode": "custom"\f[].
 
 
+.SS metadata.open
+.IP "Type:" 6
+\f[I]string\f[]
+
+.IP "Default:" 9
+\f[I]"w"\f[]
+
+.IP "Description:" 4
+The \f[I]mode\f[] in which metadata files get opened.
+
+For example,
+use \f[I]"a"\f[] to append to a file's content
+or \f[I]"w"\f[] to truncate it.
+
+See the \f[I]mode\f[] parameter of \f[I]open()\f[] for further details.
+
+
+.SS metadata.private
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Include private fields,
+i.e. fields whose name starts with an underscore.
+
+
+.SS metadata.encoding
+.IP "Type:" 6
+\f[I]string\f[]
+
+.IP "Default:" 9
+\f[I]"utf-8"\f[]
+
+.IP "Description:" 4
+Name of the encoding used to encode a file's content.
+
+See the \f[I]encoding\f[] parameter of \f[I]open()\f[] for further details.
+
+
 .SS metadata.archive
 .IP "Type:" 6
 \f[I]Path\f[]
diff -Nru gallery-dl-1.23.2/debian/changelog gallery-dl-1.24.0/debian/changelog
--- gallery-dl-1.23.2/debian/changelog	2022-10-06 07:35:41.000000000 +0000
+++ gallery-dl-1.24.0/debian/changelog	2022-11-23 02:56:11.000000000 +0000
@@ -1,12 +1,30 @@
-gallery-dl (1.23.2-1~bpo20.04.1) focal-backports; urgency=medium
+gallery-dl (1.24.0-1~bpo20.04.1) focal-backports; urgency=medium
 
  * No-change backport to focal.
 
- -- Unit 193  Thu, 06 Oct 2022 03:35:41 -0400
+ -- Unit 193  Tue, 22 Nov 2022 21:56:11 -0500
+
+gallery-dl (1.24.0-1) unstable; urgency=medium
+
+  * New upstream version 1.24.0.
+
+ -- Unit 193  Tue, 22 Nov 2022 04:35:11 -0500
+
+gallery-dl (1.23.5-1) unstable; urgency=medium
+
+  * New upstream version 1.23.5.
+
+ -- Unit 193  Tue, 01 Nov 2022 21:33:44 -0400
+
+gallery-dl (1.23.3-1) unstable; urgency=medium
+
+  * New upstream version 1.23.3.
+
+ -- Unit 193  Mon, 17 Oct 2022 03:46:23 -0400
 
 gallery-dl (1.23.2-1) unstable; urgency=medium
 
- * New upstream version 1.23.2
+ * New upstream version 1.23.2.
-- Unit 193 Mon, 03 Oct 2022 04:10:00 -0400 diff -Nru gallery-dl-1.23.2/docs/gallery-dl.conf gallery-dl-1.24.0/docs/gallery-dl.conf --- gallery-dl-1.23.2/docs/gallery-dl.conf 2022-09-30 17:35:45.000000000 +0000 +++ gallery-dl-1.24.0/docs/gallery-dl.conf 2022-11-17 16:15:08.000000000 +0000 @@ -24,6 +24,8 @@ "path-replace": "_", "path-remove": "\\u0000-\\u001f\\u007f", "path-strip": "auto", + "path-extended": true, + "extension-map": { "jpeg": "jpg", "jpe" : "jpg", @@ -71,10 +73,13 @@ { "client-id": null, "client-secret": null, + "auto-watch": false, + "auto-unwatch": false, "comments": false, "extra": false, "flat": true, "folders": false, + "group": true, "include": "gallery", "journals": "html", "mature": true, @@ -154,9 +159,8 @@ }, "instagram": { - "username": null, - "password": null, - "api": "auto", + "api": "rest", + "cookies": null, "include": "posts", "sleep-request": [6.0, 12.0], "videos": true @@ -189,6 +193,10 @@ "format": "original", "include": "art" }, + "nana": + { + "favkey": null + }, "nijie": { "username": null, @@ -221,6 +229,7 @@ { "refresh-token": null, "include": "artworks", + "metadata": false, "tags": "japanese", "ugoira": true }, @@ -288,6 +297,7 @@ "external": false, "inline": true, "posts": "all", + "offset": 0, "original": true, "reblogs": true }, @@ -319,7 +329,8 @@ "wallhaven": { "api-key": null, - "metadata": false + "metadata": false, + "include": "uploads" }, "weasyl": { @@ -370,6 +381,7 @@ "http": { "adjust-extensions": true, + "chunk-size": 32768, "headers": null }, diff -Nru gallery-dl-1.23.2/docs/gallery-dl-example.conf gallery-dl-1.24.0/docs/gallery-dl-example.conf --- gallery-dl-1.23.2/docs/gallery-dl-example.conf 2022-09-30 14:46:45.000000000 +0000 +++ gallery-dl-1.24.0/docs/gallery-dl-example.conf 2022-11-10 12:38:30.000000000 +0000 @@ -210,6 +210,19 @@ "text-tweets": true }, + "ytdl": + { + "#": "enable 'ytdl' extractor", + "#": "i.e. 
invoke ytdl on all otherwise unsupported input URLs", + "enabled": true, + + "#": "use yt-dlp instead of youtube-dl", + "module": "yt_dlp", + + "#": "load ytdl options from config file", + "config-file": "~/yt-dlp.conf" + }, + "mastodon": { "#": "add 'tabletop.social' as recognized mastodon instance", diff -Nru gallery-dl-1.23.2/gallery_dl/config.py gallery-dl-1.24.0/gallery_dl/config.py --- gallery-dl-1.23.2/gallery_dl/config.py 2022-05-27 13:20:33.000000000 +0000 +++ gallery-dl-1.24.0/gallery_dl/config.py 2022-11-18 12:33:19.000000000 +0000 @@ -21,6 +21,7 @@ # internals _config = {} +_files = [] if util.WINDOWS: _default_configs = [ @@ -61,8 +62,8 @@ else: parsefunc = json.load - for path in files or _default_configs: - path = util.expand_path(path) + for pathfmt in files or _default_configs: + path = util.expand_path(pathfmt) try: with open(path, encoding="utf-8") as file: confdict = parsefunc(file) @@ -79,6 +80,7 @@ _config.update(confdict) else: util.combine_dict(_config, confdict) + _files.append(pathfmt) def clear(): diff -Nru gallery-dl-1.23.2/gallery_dl/cookies.py gallery-dl-1.24.0/gallery_dl/cookies.py --- gallery-dl-1.23.2/gallery_dl/cookies.py 2022-06-11 13:30:11.000000000 +0000 +++ gallery-dl-1.24.0/gallery_dl/cookies.py 2022-11-18 12:12:14.000000000 +0000 @@ -263,7 +263,7 @@ path = _find_most_recently_used_file(search_root, "Cookies") if path is None: - raise FileNotFoundError("Unable tp find {} cookies database in " + raise FileNotFoundError("Unable to find {} cookies database in " "'{}'".format(config["browser"], search_root)) logger.debug("Extracting cookies from %s", path) diff -Nru gallery-dl-1.23.2/gallery_dl/downloader/http.py gallery-dl-1.24.0/gallery_dl/downloader/http.py --- gallery-dl-1.23.2/gallery_dl/downloader/http.py 2022-09-19 20:10:20.000000000 +0000 +++ gallery-dl-1.24.0/gallery_dl/downloader/http.py 2022-11-20 00:22:11.000000000 +0000 @@ -27,10 +27,11 @@ def __init__(self, job): DownloaderBase.__init__(self, job) extractor = job.extractor - self.chunk_size = 16384 self.downloading = False self.adjust_extension = self.config("adjust-extensions", True) + self.chunk_size = self.config("chunk-size", 32768) + self.metadata = extractor.config("http-metadata") self.progress = self.config("progress", 3.0) self.headers = self.config("headers") self.minsize = self.config("filesize-min") @@ -55,6 +56,13 @@ self.log.warning( "Invalid maximum file size (%r)", self.maxsize) self.maxsize = maxsize + if isinstance(self.chunk_size, str): + chunk_size = text.parse_bytes(self.chunk_size) + if not chunk_size: + self.log.warning( + "Invalid chunk size (%r)", self.chunk_size) + chunk_size = 32768 + self.chunk_size = chunk_size if self.rate: rate = text.parse_bytes(self.rate) if rate: @@ -83,11 +91,12 @@ tries = 0 msg = "" + metadata = self.metadata kwdict = pathfmt.kwdict adjust_extension = kwdict.get( "_http_adjust_extension", self.adjust_extension) - if self.part: + if self.part and not metadata: pathfmt.part_enable(self.partdir) while True: @@ -164,13 +173,6 @@ self.log.warning("Invalid response") return False - # set missing filename extension from MIME type - if not pathfmt.extension: - pathfmt.set_extension(self._find_extension(response)) - if pathfmt.exists(): - pathfmt.temppath = "" - return True - # check file size size = text.parse_int(size, None) if size is not None: @@ -185,11 +187,33 @@ size, self.maxsize) return False + build_path = False + + # set missing filename extension from MIME type + if not pathfmt.extension: + 
pathfmt.set_extension(self._find_extension(response)) + build_path = True + + # set metadata from HTTP headers + if metadata: + kwdict[metadata] = util.extract_headers(response) + build_path = True + + # build and check file path + if build_path: + pathfmt.build_path() + if pathfmt.exists(): + pathfmt.temppath = "" + return True + if self.part and metadata: + pathfmt.part_enable(self.partdir) + metadata = False + content = response.iter_content(self.chunk_size) # check filename extension against file header if adjust_extension and not offset and \ - pathfmt.extension in FILE_SIGNATURES: + pathfmt.extension in SIGNATURE_CHECKS: try: file_header = next( content if response.raw.chunked @@ -220,7 +244,7 @@ offset += len(file_header) elif offset: if adjust_extension and \ - pathfmt.extension in FILE_SIGNATURES: + pathfmt.extension in SIGNATURE_CHECKS: self._adjust_extension(pathfmt, fp.read(16)) fp.seek(offset) @@ -250,42 +274,38 @@ return True @staticmethod - def receive(fp, content, bytes_total, bytes_downloaded): + def receive(fp, content, bytes_total, bytes_start): write = fp.write for data in content: write(data) - def _receive_rate(self, fp, content, bytes_total, bytes_downloaded): + def _receive_rate(self, fp, content, bytes_total, bytes_start): rate = self.rate - progress = self.progress - bytes_start = bytes_downloaded write = fp.write - t1 = tstart = time.time() + progress = self.progress + + bytes_downloaded = 0 + time_start = time.time() for data in content: - write(data) + time_current = time.time() + time_elapsed = time_current - time_start + bytes_downloaded += len(data) - t2 = time.time() # current time - elapsed = t2 - t1 # elapsed time - num_bytes = len(data) + write(data) if progress is not None: - bytes_downloaded += num_bytes - tdiff = t2 - tstart - if tdiff >= progress: + if time_elapsed >= progress: self.out.progress( - bytes_total, bytes_downloaded, - int((bytes_downloaded - bytes_start) / tdiff), + bytes_total, + bytes_start + bytes_downloaded, + int(bytes_downloaded / time_elapsed), ) if rate: - expected = num_bytes / rate # expected elapsed time - if elapsed < expected: - # sleep if less time elapsed than expected - time.sleep(expected - elapsed) - t2 = time.time() - - t1 = t2 + time_expected = bytes_downloaded / rate + if time_expected > time_elapsed: + time.sleep(time_expected - time_elapsed) def _find_extension(self, response): """Get filename extension from MIME type""" @@ -308,11 +328,11 @@ @staticmethod def _adjust_extension(pathfmt, file_header): """Check filename extension against file header""" - sig = FILE_SIGNATURES[pathfmt.extension] - if not file_header.startswith(sig): - for ext, sig in FILE_SIGNATURES.items(): - if file_header.startswith(sig): + if not SIGNATURE_CHECKS[pathfmt.extension](file_header): + for ext, check in SIGNATURE_CHECKS.items(): + if check(file_header): pathfmt.set_extension(ext) + pathfmt.build_path() return True return False @@ -326,6 +346,7 @@ "image/x-bmp" : "bmp", "image/x-ms-bmp": "bmp", "image/webp" : "webp", + "image/avif" : "avif", "image/svg+xml" : "svg", "image/ico" : "ico", "image/icon" : "ico", @@ -362,27 +383,33 @@ } # https://en.wikipedia.org/wiki/List_of_file_signatures -FILE_SIGNATURES = { - "jpg" : b"\xFF\xD8\xFF", - "png" : b"\x89PNG\r\n\x1A\n", - "gif" : (b"GIF87a", b"GIF89a"), - "bmp" : b"BM", - "webp": b"RIFF", - "svg" : b"", "")[0] - title, _, boardname = title.rpartition(" - ") + title, _, boardname = text.extr( + page, "", "").rpartition(" - ") return { "server": self.server, "title": title, @@ -72,8 +72,8 @@ 
def posts(self, page):
         """Build a list of all post-objects"""
-        page = text.extract(
-            page, '<div class="thre', '<div style="clear:left;">')[0]
+        page = text.extr(
+            page, '<div class="thre', '<div style="clear:left;">')
         return [
             self.parse(post) for post in page.split('<table border=0>')
         ]
@@ -84,7 +84,7 @@
         data = self._extract_post(post)
         if data["name"]:
             data["name"] = data["name"].strip()
-        path = text.extract(post, '<a href="/', '"')[0]
+        path = text.extr(post, '<a href="/', '"')
diff -Nru gallery-dl-1.23.2/gallery_dl/extractor/2chen.py gallery-dl-1.24.0/gallery_dl/extractor/2chen.py
--- gallery-dl-1.23.2/gallery_dl/extractor/2chen.py	1970-01-01 00:00:00.000000000 +0000
+++ gallery-dl-1.24.0/gallery_dl/extractor/2chen.py	2022-11-10 12:38:30.000000000 +0000
@@ -0,0 +1,90 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://2chen.moe/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class _2chenThreadExtractor(Extractor):
+    """Extractor for 2chen threads"""
+    category = "2chen"
+    subcategory = "thread"
+    root = "https://2chen.moe"
+    directory_fmt = ("{category}", "{board}", "{thread} {title}")
+    filename_fmt = "{time} {filename}.{extension}"
+    archive_fmt = "{board}_{thread}_{hash}_{time}"
+    pattern = r"(?:https?://)?2chen\.moe/([^/?#]+)/(\d+)"
+    test = (
+        ("https://2chen.moe/tv/496715", {
+            "count": ">= 10",
+        }),
+    )
+
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        self.board, self.thread = match.groups()
+
+    def items(self):
+        url = "{}/{}/{}".format(self.root, self.board, self.thread)
+        page = self.request(url, encoding="utf-8").text
+        data = self.metadata(page)
+        yield Message.Directory, data
+        for post in self.posts(page):
+            if not post["url"]:
+                continue
+            post.update(data)
+            post["url"] = self.root + post["url"]
+            post["time"] = text.parse_int(post["date"].timestamp())
+            yield Message.Url, post["url"], text.nameext_from_url(
+                post["filename"], post)
+
+    def metadata(self, page):
+        board, pos = text.extract(page, 'class="board">/', '/<')
+        title = text.extract(page, "<h3>", "</h3>", pos)[0]
+        return {
+            "board" : board,
+            "thread": self.thread,
+            "title" : text.unescape(title),
+        }
+
+    def posts(self, page):
+        """Return iterable with relevant posts"""
+        return map(self.parse, text.extract_iter(
+            page, 'class="glass media', '</article>'))
+
+    def parse(self, post):
+        extr = text.extract_from(post)
+        return {
+            "name" : text.unescape(extr("<span>", "</span>")),
+            "date" : text.parse_datetime(
+                extr("<time", "<").partition(">")[2],
+                "%d %b %Y (%a) %H:%M:%S"
+            ),
+            "no"  : extr('href="#p', '"'),
+            "url" : extr('<a href="', '"'),
+            "filename": text.unescape(extr('download="', '"')),
+            "hash": extr('data-hash="', '"'),
+        }
+
+
+class _2chenBoardExtractor(Extractor):
+    """Extractor for 2chen boards"""
+    category = "2chen"
+    subcategory = "board"
+    root = "https://2chen.moe"
+    pattern = r"(?:https?://)?2chen\.moe/([^/?#]+)/?$"
+
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        self.board = match.group(1)
+
+    def items(self):
+        url = "{}/{}/catalog".format(self.root, self.board)
+        page = self.request(url).text
+        data = {"_extractor": _2chenThreadExtractor}
+        for thread in text.extract_iter(page, '<figure><a href="', '"'):
+            yield Message.Queue, self.root + thread, data
diff -Nru gallery-dl-1.23.2/gallery_dl/extractor/8chan.py gallery-dl-1.24.0/gallery_dl/extractor/8chan.py
--- gallery-dl-1.23.2/gallery_dl/extractor/8chan.py	1970-01-01 00:00:00.000000000 +0000
+++ gallery-dl-1.24.0/gallery_dl/extractor/8chan.py	2022-11-10 12:38:30.000000000 +0000
@@ -0,0 +1,94 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2022 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://8chan.moe/"""
+
+from .common import Extractor, Message
+from .. import text
+import itertools
+
+BASE_PATTERN = r"(?:https?://)?8chan\.(moe|se|cc)"
+
+
+class _8chanExtractor(Extractor):
+    """Base class for 8chan extractors"""
+    category = "8chan"
+    root = "https://8chan.moe"
+
+    def __init__(self, match):
+        self.root = "https://8chan." + match.group(1)
+        Extractor.__init__(self, match)
+
+
+class _8chanThreadExtractor(_8chanExtractor):
+    """Extractor for 8chan threads"""
+    subcategory = "thread"
+    directory_fmt = ("{category}", "{boardUri}",
+                     "{threadId} {subject[:50]}")
+    filename_fmt = "{postId}{num:?-//} {filename[:200]}.{extension}"
+    archive_fmt = "{boardUri}_{postId}_{num}"
+    pattern = BASE_PATTERN + r"/([^/?#]+)/res/(\d+)"
+
+    def __init__(self, match):
+        _8chanExtractor.__init__(self, match)
+        _, self.board, self.thread = match.groups()
+
+    def items(self):
+        # fetch thread data
+        url = "{}/{}/res/{}.json".format(self.root, self.board, self.thread)
+        thread = self.request(url).json()
+        thread["postId"] = thread["threadId"]
+        posts = thread.pop("posts", ())
+
+        yield Message.Directory, thread
+        for post in itertools.chain((thread,), posts):
+            files = post.pop("files", None)
+            if not files:
+                continue
+            thread.update(post)
+            for num, file in enumerate(files):
+                file.update(thread)
+                file["num"] = num
+                text.nameext_from_url(file["originalName"], file)
+                yield Message.Url, self.root + file["path"], file
+
+
+class _8chanBoardExtractor(_8chanExtractor):
+    """Extractor for 8chan boards"""
+    subcategory = "board"
+    pattern = BASE_PATTERN + r"/([^/?#]+)(?:/(\d+)\.html)?$"
+
+    def __init__(self, match):
+        _8chanExtractor.__init__(self, match)
+        _, self.board, self.page = match.groups()
+
+    def items(self):
+        page = text.parse_int(self.page, 1)
+        url = "{}/{}/{}.json".format(self.root, self.board, page)
+        board = self.request(url).json()
+        threads = board["threads"]
+
+        while True:
+            for thread in threads:
+                thread["_extractor"] = _8chanThreadExtractor
+                url = "{}/{}/res/{}.html".format(
+                    self.root, self.board, thread["threadId"])
+                yield Message.Queue, url, thread
+
+            page += 1
+            if page > 
board["pageCount"]: + return + url = "{}/{}/{}.json".format(self.root, self.board, page) + threads = self.request(url).json()["threads"] diff -Nru gallery-dl-1.23.2/gallery_dl/extractor/8kun.py gallery-dl-1.24.0/gallery_dl/extractor/8kun.py --- gallery-dl-1.23.2/gallery_dl/extractor/8kun.py 2022-07-12 13:49:22.000000000 +0000 +++ gallery-dl-1.24.0/gallery_dl/extractor/8kun.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,100 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2020-2022 Mike Fährmann -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Extractors for https://8kun.top/""" - -from .common import Extractor, Message -from .. import text - - -class _8kunThreadExtractor(Extractor): - """Extractor for 8kun threads""" - category = "8kun" - subcategory = "thread" - directory_fmt = ("{category}", "{board}", "{thread} {title}") - filename_fmt = "{time}{num:?-//} {filename}.{extension}" - archive_fmt = "{board}_{thread}_{tim}" - pattern = r"(?:https?://)?8kun\.top/([^/]+)/res/(\d+)" - test = ( - ("https://8kun.top/test/res/65248.html", { - "pattern": r"https://media\.8kun\.top/file_store/\w{64}\.\w+", - "count": ">= 8", - }), - # old-style file URLs (#1101) - # ("https://8kun.top/d/res/13258.html", { - # "pattern": r"https://media\.8kun\.top/d/src/\d+(-\d)?\.\w+", - # "range": "1-20", - # }), - ) - - def __init__(self, match): - Extractor.__init__(self, match) - self.board, self.thread = match.groups() - - def items(self): - url = "https://8kun.top/{}/res/{}.json".format(self.board, self.thread) - posts = self.request(url).json()["posts"] - title = posts[0].get("sub") or text.remove_html(posts[0]["com"]) - process = self._process - - data = { - "board" : self.board, - "thread": self.thread, - "title" : text.unescape(title)[:50], - "num" : 0, - } - - yield Message.Directory, data - for post in posts: - if "filename" in post: - yield process(post, data) - if "extra_files" in post: - for post["num"], filedata in enumerate( - post["extra_files"], 1): - yield process(post, filedata) - - @staticmethod - def _process(post, data): - post.update(data) - post["extension"] = post["ext"][1:] - tim = post["tim"] - url = ("https://media.8kun.top/" + - ("file_store/" if len(tim) > 16 else post["board"] + "/src/") + - tim + post["ext"]) - return Message.Url, url, post - - -class _8kunBoardExtractor(Extractor): - """Extractor for 8kun boards""" - category = "8kun" - subcategory = "board" - pattern = r"(?:https?://)?8kun\.top/([^/?#]+)/(?:index|\d+)\.html" - test = ( - ("https://8kun.top/v/index.html", { - "pattern": _8kunThreadExtractor.pattern, - "count": ">= 100", - }), - ("https://8kun.top/v/2.html"), - ("https://8kun.top/v/index.html?PageSpeed=noscript"), - ) - - def __init__(self, match): - Extractor.__init__(self, match) - self.board = match.group(1) - - def items(self): - url = "https://8kun.top/{}/threads.json".format(self.board) - threads = self.request(url).json() - - for page in threads: - for thread in page["threads"]: - url = "https://8kun.top/{}/res/{}.html".format( - self.board, thread["no"]) - thread["page"] = page["page"] - thread["_extractor"] = _8kunThreadExtractor - yield Message.Queue, url, thread diff -Nru gallery-dl-1.23.2/gallery_dl/extractor/8muses.py gallery-dl-1.24.0/gallery_dl/extractor/8muses.py --- gallery-dl-1.23.2/gallery_dl/extractor/8muses.py 2022-07-12 13:49:22.000000000 +0000 +++ gallery-dl-1.24.0/gallery_dl/extractor/8muses.py 
2022-11-10 12:38:30.000000000 +0000
@@ -76,9 +76,9 @@
         url = self.root + self.path + self.params
 
         while True:
-            data = self._unobfuscate(text.extract(
+            data = self._unobfuscate(text.extr(
                 self.request(url).text,
-                'id="ractive-public" type="text/plain">', '</script>')[0])
+                'id="ractive-public" type="text/plain">', '</script>'))
 
             images = data.get("pictures")
             if images:
diff -Nru gallery-dl-1.23.2/gallery_dl/extractor/artstation.py gallery-dl-1.24.0/gallery_dl/extractor/artstation.py
--- gallery-dl-1.23.2/gallery_dl/extractor/artstation.py	2022-09-30 14:46:45.000000000 +0000
+++ gallery-dl-1.24.0/gallery_dl/extractor/artstation.py	2022-11-10 12:38:30.000000000 +0000
@@ -41,8 +41,8 @@
         if adict["has_embedded_player"] and self.external:
             player = adict["player_embedded"]
-            url = text.extract(player, 'src="', '"')[0] or \
-                text.extract(player, "src='", "'")[0]
+            url = (text.extr(player, 'src="', '"') or
+                   text.extr(player, "src='", "'"))
             if url and not url.startswith(self.root):
                 asset["extension"] = None
                 yield Message.Url, "ytdl:" + url, asset
@@ -76,7 +76,12 @@
     def get_project_assets(self, project_id):
         """Return all assets associated with 'project_id'"""
         url = "{}/projects/{}.json".format(self.root, project_id)
-        data = self.request(url).json()
+
+        try:
+            data = self.request(url).json()
+        except exception.HttpError as exc:
+            self.log.warning(exc)
+            return
 
         data["title"] = text.unescape(data["title"])
         data["description"] = text.unescape(text.remove_html(
@@ -406,6 +411,10 @@
             "options": (("external", True),),
             "pattern": "ytdl:https://www.youtube.com/embed/JNFfJtwwrU0",
         }),
+        # 404 (#3016)
+        ("https://www.artstation.com/artwork/3q3mXB", {
+            "count": 0,
+        }),
         # alternate URL patterns
         ("https://sungchoi.artstation.com/projects/LQVJr"),
         ("https://artstn.co/p/LQVJr"),
@@ -419,7 +428,10 @@
     def metadata(self):
         self.assets = list(ArtstationExtractor.get_project_assets(
             self, self.project_id))
-        self.user = self.assets[0]["user"]["username"]
+        try:
+            self.user = self.assets[0]["user"]["username"]
+        except IndexError:
+            self.user = ""
         return ArtstationExtractor.metadata(self)
 
     def projects(self):
diff -Nru gallery-dl-1.23.2/gallery_dl/extractor/aryion.py gallery-dl-1.24.0/gallery_dl/extractor/aryion.py
--- gallery-dl-1.23.2/gallery_dl/extractor/aryion.py	2022-05-30 10:58:03.000000000 +0000
+++ gallery-dl-1.24.0/gallery_dl/extractor/aryion.py	2022-11-10 12:38:30.000000000 +0000
@@ -128,8 +128,7 @@
         # get filename from 'Content-Disposition' header
         cdis = headers["content-disposition"]
-        fname, _, ext = text.extract(
-            cdis, 'filename="', '"')[0].rpartition(".")
+        fname, _, ext = text.extr(cdis, 'filename="', '"').rpartition(".")
         if not fname:
             fname, ext = ext, fname
diff -Nru gallery-dl-1.23.2/gallery_dl/extractor/bbc.py gallery-dl-1.24.0/gallery_dl/extractor/bbc.py
--- gallery-dl-1.23.2/gallery_dl/extractor/bbc.py	2022-02-01 23:09:19.000000000 +0000
+++ gallery-dl-1.24.0/gallery_dl/extractor/bbc.py	2022-11-10 12:38:30.000000000 +0000
@@ -38,8 +38,8 @@
     )
 
     def metadata(self, page):
-        data = json.loads(text.extract(
-            page, '<script type="application/ld+json">', '</script>')[0])
+        data = json.loads(text.extr(
+            page, '<script type="application/ld+json">', '</script>'))
         return {
             "programme": self.gallery_url.split("/")[4],
             "path": list(util.unique_sequence(
diff -Nru gallery-dl-1.23.2/gallery_dl/extractor/bcy.py gallery-dl-1.24.0/gallery_dl/extractor/bcy.py
--- gallery-dl-1.23.2/gallery_dl/extractor/bcy.py	2022-05-03 10:22:33.000000000 +0000
+++ gallery-dl-1.24.0/gallery_dl/extractor/bcy.py	2022-11-10 12:38:30.000000000 +0000
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2020-2021 Mike Fährmann
+# Copyright 2020-2022 
Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -25,9 +25,12 @@ def __init__(self, match): Extractor.__init__(self, match) self.item_id = match.group(1) + self.session.headers["Referer"] = self.root + "/" def items(self): - sub = re.compile(r"^https?://p\d+-bcy\.byteimg\.com/img/banciyuan").sub + sub = re.compile(r"^https?://p\d+-bcy" + r"(?:-sign\.bcyimg\.com|\.byteimg\.com/img)" + r"/banciyuan").sub iroot = "https://img-bcy-qn.pstatp.com" noop = self.config("noop") @@ -64,19 +67,18 @@ url = image["path"].partition("~")[0] text.nameext_from_url(url, data) + # full-resolution image without watermark if data["extension"]: if not url.startswith(iroot): url = sub(iroot, url) data["filter"] = "" yield Message.Url, url, data + # watermarked image & low quality noop filter else: - if not multi: - if len(post["multi"]) < len(post["image_list"]): - multi = self._data_from_post(post["item_id"]) - multi = multi["post_data"]["multi"] - else: - multi = post["multi"] + if multi is None: + multi = self._data_from_post( + post["item_id"])["post_data"]["multi"] image = multi[data["num"] - 1] if image["origin"]: @@ -95,7 +97,7 @@ url = "{}/item/detail/{}".format(self.root, post_id) page = self.request(url, notfound="post").text return json.loads( - text.extract(page, 'JSON.parse("', '");')[0] + text.extr(page, 'JSON.parse("', '");') .replace('\\\\u002F', '/') .replace('\\"', '"') )["detail"] @@ -111,8 +113,8 @@ "count": ">= 20", }), ("https://bcy.net/u/109282764041", { - "pattern": r"https://p\d-bcy.byteimg.com/img/banciyuan/[0-9a-f]+" - r"~tplv-banciyuan-logo-v3:.+\.image", + "pattern": r"https://p\d-bcy-sign\.bcyimg\.com/banciyuan/[0-9a-f]+" + r"~tplv-bcyx-yuan-logo-v1:.+\.image", "range": "1-25", "count": 25, }), @@ -171,13 +173,13 @@ }), # only watermarked images available ("https://bcy.net/item/detail/6950136331708144648", { - "pattern": r"https://p\d-bcy.byteimg.com/img/banciyuan/[0-9a-f]+" - r"~tplv-banciyuan-logo-v3:.+\.image", - "count": 8, + "pattern": r"https://p\d-bcy-sign\.bcyimg\.com/banciyuan/[0-9a-f]+" + r"~tplv-bcyx-yuan-logo-v1:.+\.image", + "count": 10, "keyword": {"filter": "watermark"}, }), # deleted - ("https://bcy.net/item/detail/6780546160802143236", { + ("https://bcy.net/item/detail/6780546160802143237", { "exception": exception.NotFoundError, "count": 0, }), diff -Nru gallery-dl-1.23.2/gallery_dl/extractor/behance.py gallery-dl-1.24.0/gallery_dl/extractor/behance.py --- gallery-dl-1.23.2/gallery_dl/extractor/behance.py 2022-05-03 10:22:33.000000000 +0000 +++ gallery-dl-1.24.0/gallery_dl/extractor/behance.py 2022-11-10 12:38:30.000000000 +0000 @@ -119,8 +119,8 @@ } page = self.request(url, cookies=cookies).text - data = json.loads(text.extract( - page, 'id="beconfig-store_state">', '')[0]) + data = json.loads(text.extr( + page, 'id="beconfig-store_state">', '')) return self._update(data["project"]["project"]) def get_images(self, data): @@ -137,7 +137,7 @@ elif mtype == "video": page = self.request(module["src"]).text - url = text.extract(page, '', '<')[0]) + 'id="__NEXT_DATA__" type="application/json">', '<')) album = data["props"]["pageProps"]["album"] files = album["files"] except Exception as exc: diff -Nru gallery-dl-1.23.2/gallery_dl/extractor/common.py gallery-dl-1.24.0/gallery_dl/extractor/common.py --- gallery-dl-1.23.2/gallery_dl/extractor/common.py 2022-09-30 14:46:45.000000000 +0000 +++ gallery-dl-1.24.0/gallery_dl/extractor/common.py 2022-11-15 
11:32:45.000000000 +0000 @@ -20,7 +20,7 @@ import threading from requests.adapters import HTTPAdapter from .message import Message -from .. import config, text, util, exception +from .. import config, text, util, cache, exception class Extractor(): @@ -122,8 +122,7 @@ seconds = (self._interval() - (time.time() - Extractor.request_timestamp)) if seconds > 0.0: - self.log.debug("Sleeping for %.5s seconds", seconds) - time.sleep(seconds) + self.sleep(seconds, "request") while True: try: @@ -150,14 +149,13 @@ msg = "'{} {}' for '{}'".format(code, response.reason, url) server = response.headers.get("Server") - if server and server.startswith("cloudflare"): - if code == 503 and \ - (b"_cf_chl_opt" in response.content or - b"jschl-answer" in response.content): + if server and server.startswith("cloudflare") and \ + code in (403, 503): + content = response.content + if b"_cf_chl_opt" in content or b"jschl-answer" in content: self.log.warning("Cloudflare IUAM challenge") break - if code == 403 and \ - b'name="captcha-bypass"' in response.content: + if b'name="captcha-bypass"' in content: self.log.warning("Cloudflare CAPTCHA") break if code < 500 and code != 429 and code != 430: @@ -169,8 +167,9 @@ self.log.debug("%s (%s/%s)", msg, tries, retries+1) if tries > retries: break - time.sleep( - max(tries, self._interval()) if self._interval else tries) + self.sleep( + max(tries, self._interval()) if self._interval else tries, + "retry") tries += 1 raise exception.HttpError(msg, response) @@ -202,6 +201,11 @@ self.log.info("Waiting until %s for %s.", isotime, reason) time.sleep(seconds) + def sleep(self, seconds, reason): + self.log.debug("Sleeping %.2f seconds (%s)", + seconds, reason) + time.sleep(seconds) + def _get_auth_info(self): """Return authentication information as (username, password) tuple""" username = self.config("username") @@ -258,9 +262,13 @@ ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1) ssl_ciphers = SSL_CIPHERS[browser] else: - headers["User-Agent"] = self.config("user-agent", ( - "Mozilla/5.0 (Windows NT 10.0; Win64; x64; " - "rv:102.0) Gecko/20100101 Firefox/102.0")) + useragent = self.config("user-agent") + if useragent is None: + useragent = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64; " + "rv:102.0) Gecko/20100101 Firefox/102.0") + elif useragent == "browser": + useragent = _browser_useragent() + headers["User-Agent"] = useragent headers["Accept"] = "*/*" headers["Accept-Language"] = "en-US,en;q=0.5" @@ -720,6 +728,36 @@ return adapter +@cache.cache(maxage=86400) +def _browser_useragent(): + """Get User-Agent header from default browser""" + import webbrowser + import socket + + server = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + server.bind(("127.0.0.1", 6414)) + server.listen(1) + + webbrowser.open("http://127.0.0.1:6414/user-agent") + + client = server.accept()[0] + server.close() + + for line in client.recv(1024).split(b"\r\n"): + key, _, value = line.partition(b":") + if key.strip().lower() == b"user-agent": + useragent = value.strip() + break + else: + useragent = b"" + + client.send(b"HTTP/1.1 200 OK\r\n\r\n" + useragent) + client.close() + + return useragent.decode() + + _adapter_cache = {} _browser_cookies = {} diff -Nru gallery-dl-1.23.2/gallery_dl/extractor/danbooru.py gallery-dl-1.24.0/gallery_dl/extractor/danbooru.py --- gallery-dl-1.23.2/gallery_dl/extractor/danbooru.py 2022-08-27 18:29:11.000000000 +0000 +++ gallery-dl-1.24.0/gallery_dl/extractor/danbooru.py 2022-11-08 16:10:24.000000000 +0000 @@ -88,10 
+88,7 @@ if post["extension"] == "zip": if self.ugoira: - post["frames"] = self.request( - "{}/posts/{}.json?only=pixiv_ugoira_frame_data".format( - self.root, post["id"]) - ).json()["pixiv_ugoira_frame_data"]["data"] + post["frames"] = self._ugoira_frames(post) post["_http_adjust_extension"] = False else: url = post["large_file_url"] @@ -105,6 +102,9 @@ resp = self.request(template.format(self.root, post["id"])) post.update(resp.json()) + if url[0] == "/": + url = self.root + url + post.update(data) yield Message.Directory, post yield Message.Url, url, post @@ -139,6 +139,18 @@ else: return + def _ugoira_frames(self, post): + data = self.request("{}/posts/{}.json?only=media_metadata".format( + self.root, post["id"]) + ).json()["media_metadata"]["metadata"] + + ext = data["ZIP:ZipFileName"].rpartition(".")[2] + print(post["id"], ext) + fmt = ("{:>06}." + ext).format + delays = data["Ugoira:FrameDelays"] + return [{"file": fmt(index), "delay": delay} + for index, delay in enumerate(delays)] + INSTANCES = { "danbooru": { @@ -161,6 +173,10 @@ "pattern": r"booru\.allthefallen\.moe", "page-limit": 5000, }, + "aibooru": { + "root": None, + "pattern": r"(?:safe.)?aibooru\.online", + } } BASE_PATTERN = DanbooruExtractor.update(INSTANCES) @@ -193,10 +209,16 @@ ("https://booru.allthefallen.moe/posts?tags=yume_shokunin", { "count": 12, }), + ("https://aibooru.online/posts?tags=center_frills&z=1", { + "pattern": r"https://aibooru\.online/data/original" + r"/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{32}\.\w+", + "count": ">= 3", + }), ("https://hijiribe.donmai.us/posts?tags=bonocho"), ("https://sonohara.donmai.us/posts?tags=bonocho"), ("https://safebooru.donmai.us/posts?tags=bonocho"), ("https://e926.net/posts?tags=anry"), + ("https://safe.aibooru.online/posts?tags=center_frills"), ) def __init__(self, match): @@ -229,6 +251,7 @@ "url": "902549ffcdb00fe033c3f63e12bc3cb95c5fd8d5", "count": 6, }), + ("https://aibooru.online/pools/1"), ("https://danbooru.donmai.us/pool/show/7659"), ("https://e621.net/pool/show/73"), ) @@ -291,6 +314,9 @@ ("https://booru.allthefallen.moe/posts/22", { "content": "21dda68e1d7e0a554078e62923f537d8e895cac8", }), + ("https://aibooru.online/posts/1", { + "content": "54d548743cd67799a62c77cbae97cfa0fec1b7e9", + }), ("https://danbooru.donmai.us/post/show/294929"), ("https://e621.net/post/show/535"), ) @@ -325,6 +351,7 @@ "count": ">= 70", }), ("https://booru.allthefallen.moe/explore/posts/popular"), + ("https://aibooru.online/explore/posts/popular"), ) def __init__(self, match): diff -Nru gallery-dl-1.23.2/gallery_dl/extractor/deviantart.py gallery-dl-1.24.0/gallery_dl/extractor/deviantart.py --- gallery-dl-1.23.2/gallery_dl/extractor/deviantart.py 2022-09-30 14:46:45.000000000 +0000 +++ gallery-dl-1.24.0/gallery_dl/extractor/deviantart.py 2022-11-10 12:38:30.000000000 +0000 @@ -72,7 +72,7 @@ def items(self): self.api = DeviantartOAuthAPI(self) - if self.user: + if self.user and self.config("group", True): profile = self.api.user_profile(self.user) self.group = not profile if self.group: @@ -603,22 +603,22 @@ page = self._limited_request(url).text if stash_id[0] == "0": - uuid = text.extract(page, '//deviation/', '"')[0] + uuid = text.extr(page, '//deviation/', '"') if uuid: deviation = self.api.deviation(uuid) - deviation["index"] = text.parse_int(text.extract( - page, 'gmi-deviationid="', '"')[0]) + deviation["index"] = text.parse_int(text.extr( + page, 'gmi-deviationid="', '"')) yield deviation return for item in text.extract_iter( page, 'class="stash-thumb-container', ''): - url = 
text.extract(item, '[^/?#]+)/(?P[^?#]+\." r"(?:jpe?g|jpe|png|gif|web[mp]|mp4|mkv|og[gmv]|opus))" - r"(?:\?(?P[^/?#]*))?(?:#(?P.*))?$") + r"(?:\?(?P[^#]*))?(?:#(?P.*))?$") test = ( (("https://en.wikipedia.org/static/images/project-logos/enwiki.png"), { "url": "18c5d00077332e98e53be9fed2ee4be66154b88d", @@ -31,9 +31,9 @@ "keyword": "29dad729c40fb09349f83edafa498dba1297464a", }), # more complex example - ("https://example.org/path/to/file.webm?que=1&ry=2#fragment", { - "url": "114b8f1415cc224b0f26488ccd4c2e7ce9136622", - "keyword": "06014abd503e3b2b58aa286f9bdcefdd2ae336c0", + ("https://example.org/path/to/file.webm?que=1?&ry=2/#fragment", { + "url": "6fb1061390f8aada3db01cb24b51797c7ee42b31", + "keyword": "3d7abc31d45ba324e59bc599c3b4862452d5f29c", }), # percent-encoded characters ("https://example.org/%27%3C%23/%23%3E%27.jpg?key=%3C%26%3E", { diff -Nru gallery-dl-1.23.2/gallery_dl/extractor/dynastyscans.py gallery-dl-1.24.0/gallery_dl/extractor/dynastyscans.py --- gallery-dl-1.23.2/gallery_dl/extractor/dynastyscans.py 2022-05-03 10:22:33.000000000 +0000 +++ gallery-dl-1.24.0/gallery_dl/extractor/dynastyscans.py 2022-11-10 12:38:30.000000000 +0000 @@ -30,7 +30,7 @@ src = extr("class='btn-group'>", "") url = extr(' src="', '"') - src = text.extract(src, 'href="', '"')[0] if "Source<" in src else "" + src = text.extr(src, 'href="', '"') if "Source<" in src else "" return { "url" : self.root + url, @@ -75,7 +75,7 @@ "title" : text.unescape(match.group(4) or ""), "author" : text.remove_html(author), "group" : (text.remove_html(group) or - text.extract(group, ' alt="', '"')[0] or ""), + text.extr(group, ' alt="', '"')), "date" : text.parse_datetime(extr( '"icon-calendar"> ', '<'), "%b %d, %Y"), "lang" : "en", @@ -83,7 +83,7 @@ } def images(self, page): - data = text.extract(page, "var pages = ", ";\n")[0] + data = text.extr(page, "var pages = ", ";\n") return [ (self.root + img["image"], None) for img in json.loads(data) diff -Nru gallery-dl-1.23.2/gallery_dl/extractor/erome.py gallery-dl-1.24.0/gallery_dl/extractor/erome.py --- gallery-dl-1.23.2/gallery_dl/extractor/erome.py 2022-09-19 20:55:23.000000000 +0000 +++ gallery-dl-1.24.0/gallery_dl/extractor/erome.py 2022-11-10 12:38:30.000000000 +0000 @@ -55,8 +55,8 @@ yield Message.Directory, data groups = page.split('
", "<")[0]) + data["uploader"] = text.unescape(text.extr( + data["uploader"], ">", "<")) f = data["favorites"][0] if f == "N": @@ -400,7 +400,7 @@ } page = self.request(url, cookies=cookies).text - current = text.extract(page, "", "")[0] + current = text.extr(page, "", "") self.log.debug("Image Limits: %s/%s", current, self.limits) self._remaining = self.limits - text.parse_int(current) @@ -473,6 +473,10 @@ "pattern": ExhentaiGalleryExtractor.pattern, "range": "1-30", "count": 30, + "keyword": { + "gallery_id": int, + "gallery_token": r"re:^[0-9a-f]{10}$" + }, }), ) @@ -490,26 +494,39 @@ self.params = {"f_search": tag, "page": 0} else: self.params = text.parse_query(query) - self.params["page"] = text.parse_int(self.params.get("page")) + if "next" not in self.params: + self.params["page"] = text.parse_int(self.params.get("page")) def items(self): self.login() data = {"_extractor": ExhentaiGalleryExtractor} + search_url = self.search_url + params = self.params while True: last = None - page = self.request(self.search_url, params=self.params).text + page = self.request(search_url, params=params).text for gallery in ExhentaiGalleryExtractor.pattern.finditer(page): url = gallery.group(0) if url == last: continue last = url + data["gallery_id"] = text.parse_int(gallery.group(2)) + data["gallery_token"] = gallery.group(3) yield Message.Queue, url + "/", data - if 'class="ptdd">><' in page or ">No hits found

" in page: + next_url = text.extr(page, 'nexturl = "', '"', None) + if next_url is not None: + if not next_url: + return + search_url = next_url + params = None + + elif 'class="ptdd">><' in page or ">No hits found

" in page: return - self.params["page"] += 1 + else: + params["page"] += 1 class ExhentaiFavoriteExtractor(ExhentaiSearchExtractor): diff -Nru gallery-dl-1.23.2/gallery_dl/extractor/fallenangels.py gallery-dl-1.24.0/gallery_dl/extractor/fallenangels.py --- gallery-dl-1.23.2/gallery_dl/extractor/fallenangels.py 2021-12-11 02:38:22.000000000 +0000 +++ gallery-dl-1.24.0/gallery_dl/extractor/fallenangels.py 2022-11-10 12:38:30.000000000 +0000 @@ -57,7 +57,7 @@ return [ (img["page_image"], None) for img in json.loads( - text.extract(page, "var pages = ", ";")[0] + text.extr(page, "var pages = ", ";") ) ] diff -Nru gallery-dl-1.23.2/gallery_dl/extractor/fanbox.py gallery-dl-1.24.0/gallery_dl/extractor/fanbox.py --- gallery-dl-1.23.2/gallery_dl/extractor/fanbox.py 2022-08-27 18:29:11.000000000 +0000 +++ gallery-dl-1.24.0/gallery_dl/extractor/fanbox.py 2022-11-08 16:10:24.000000000 +0000 @@ -68,6 +68,16 @@ post["html"] = content_body["html"] if post["type"] == "article": post["articleBody"] = content_body.copy() + if "blocks" in content_body: + content = [] + append = content.append + for block in content_body["blocks"]: + if "text" in block: + append(block["text"]) + if "links" in block: + for link in block["links"]: + append(link["url"]) + post["content"] = "\n".join(content) post["date"] = text.parse_datetime(post["publishedDatetime"]) post["text"] = content_body.get("text") if content_body else None @@ -271,6 +281,19 @@ "hasAdultContent": True }, }), + # 'content' metadata (#3020) + ("https://www.fanbox.cc/@official-en/posts/4326303", { + "keyword": { + "content": r"re:(?s)^Greetings from FANBOX.\n \nAs of Monday, " + r"September 5th, 2022, we are happy to announce " + r"the start of the FANBOX hashtag event " + r"#MySetupTour ! \nAbout the event\nTo join this " + r"event .+ \nPlease check this page for further " + r"details regarding the Privacy & Terms.\n" + r"https://fanbox.pixiv.help/.+/10184952456601\n\n\n" + r"Thank you for your continued support of FANBOX.$", + }, + }), ) def __init__(self, match): diff -Nru gallery-dl-1.23.2/gallery_dl/extractor/foolfuuka.py gallery-dl-1.24.0/gallery_dl/extractor/foolfuuka.py --- gallery-dl-1.23.2/gallery_dl/extractor/foolfuuka.py 2022-08-27 18:29:11.000000000 +0000 +++ gallery-dl-1.24.0/gallery_dl/extractor/foolfuuka.py 2022-11-10 12:38:30.000000000 +0000 @@ -56,7 +56,7 @@ """Resolve a remote media link""" needle = '", "")[0].strip() + title = text.extr(page, "", "").strip() title, _, gallery_id = title.rpartition("#") return { @@ -104,7 +104,7 @@ page, 'class="pic_pad">
diff -Nru gallery-dl-1.23.2/gallery_dl/extractor/fuskator.py gallery-dl-1.24.0/gallery_dl/extractor/fuskator.py
--- gallery-dl-1.23.2/gallery_dl/extractor/fuskator.py
+++ gallery-dl-1.24.0/gallery_dl/extractor/fuskator.py	2022-11-10 12:38:30.000000000 +0000
-        title = text.extract(page, "<title>", "</title>")[0].strip()
+        title = text.extr(page, "<title>", "</title>").strip()
         title, _, gallery_id = title.rpartition("#")
 
         return {
@@ -104,7 +104,7 @@
                     page, 'class="pic_pad"><a href="', '"'):
                 yield Message.Queue, self.root + path, data
 
-            pages = text.extract(page, 'class="pages">', '>>><')[0]
+            pages = text.extr(page, 'class="pages">', '>>><')
             if not pages:
                 return
             url = self.root + text.rextract(pages, 'href="', '"')[0]
diff -Nru gallery-dl-1.23.2/gallery_dl/extractor/gelbooru.py gallery-dl-1.24.0/gallery_dl/extractor/gelbooru.py
--- gallery-dl-1.23.2/gallery_dl/extractor/gelbooru.py	2022-08-27 18:29:11.000000000 +0000
+++ gallery-dl-1.24.0/gallery_dl/extractor/gelbooru.py	2022-11-10 12:38:30.000000000 +0000
@@ -68,6 +68,22 @@
             yield "https://img2.gelbooru.com" + path
             yield "https://img1.gelbooru.com" + path
 
+    def _notes(self, post, page):
+        notes_data = text.extr(page, '<section id="notes"', '</section>')
+        if not notes_data:
+            return
+
+        post["notes"] = notes = []
+        extr = text.extract
+        for note in text.extract_iter(notes_data, '<article', '</article>'):
+            notes.append({
+                "width" : int(extr(note, 'data-width="', '"')[0]),
+                "height": int(extr(note, 'data-height="', '"')[0]),
+                "x"     : int(extr(note, 'data-x="', '"')[0]),
+                "y"     : int(extr(note, 'data-y="', '"')[0]),
+                "body"  : extr(note, 'data-body="', '"')[0],
+            })
+
 
 class GelbooruTagExtractor(GelbooruBase,
                            gelbooru_v02.GelbooruV02TagExtractor):
@@ -142,13 +158,23 @@
 class GelbooruPostExtractor(GelbooruBase,
                             gelbooru_v02.GelbooruV02PostExtractor):
     """Extractor for single images from gelbooru.com"""
-    pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
-               r"\?page=post&s=view&id=(?P<post>\d+)")
+    pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?\?"
+               r"(?=(?:[^#]+&)?page=post(?:&|#|$))"
+               r"(?=(?:[^#]+&)?s=view(?:&|#|$))"
+               r"(?:[^#]+&)?id=(\d+)")
     test = (
         ("https://gelbooru.com/index.php?page=post&s=view&id=313638", {
             "content": "5e255713cbf0a8e0801dc423563c34d896bb9229",
             "count": 1,
         }),
+
+        ("https://gelbooru.com/index.php?page=post&s=view&id=313638"),
+        ("https://gelbooru.com/index.php?s=view&page=post&id=313638"),
+        ("https://gelbooru.com/index.php?page=post&id=313638&s=view"),
+        ("https://gelbooru.com/index.php?s=view&id=313638&page=post"),
+        ("https://gelbooru.com/index.php?id=313638&page=post&s=view"),
+        ("https://gelbooru.com/index.php?id=313638&s=view&page=post"),
+
         ("https://gelbooru.com/index.php?page=post&s=view&id=6018318", {
             "options": (("tags", True),),
             "content": "977caf22f27c72a5d07ea4d4d9719acdab810991",
@@ -172,21 +198,21 @@
             "keyword": {
                 "notes": [
                     {
-                        "height": 553,
                         "body": "Look over this way when you talk~",
+                        "height": 553,
                         "width": 246,
                         "x": 35,
-                        "y": 72
+                        "y": 72,
                     },
                     {
-                        "height": 557,
                         "body": "Hey~\nAre you listening~?",
+                        "height": 557,
                         "width": 246,
                         "x": 1233,
-                        "y": 109
-                    }
-                ]
-            }
+                        "y": 109,
+                    },
+                ],
+            },
         }),
     )
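
Gelbooru's new `_notes` override above reads each note's geometry and text from `data-*` attributes on the `<article>` elements inside the notes section. A condensed sketch of that scraping using only the standard `re` module (the real code uses gallery-dl's `text.extract` helpers with the delimiter pairs shown in the hunk):

    import re

    def parse_notes(notes_section):
        """Collect geometry and body text from each note's data-* attributes."""
        notes = []
        for note in notes_section.split("<article")[1:]:
            attr = dict(re.findall(r'data-(\w+)="([^"]*)"', note))
            notes.append({
                "width" : int(attr["width"]),
                "height": int(attr["height"]),
                "x"     : int(attr["x"]),
                "y"     : int(attr["y"]),
                "body"  : attr["body"],
            })
        return notes
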
diff -Nru gallery-dl-1.23.2/gallery_dl/extractor/gelbooru_v02.py gallery-dl-1.24.0/gallery_dl/extractor/gelbooru_v02.py
--- gallery-dl-1.23.2/gallery_dl/extractor/gelbooru_v02.py	2022-10-01 10:34:23.000000000 +0000
+++ gallery-dl-1.24.0/gallery_dl/extractor/gelbooru_v02.py	2022-11-10 16:03:18.000000000 +0000
@@ -31,6 +31,7 @@
 
         if self.category == "realbooru":
             self._file_url = self._file_url_realbooru
+            self._tags = self._tags_realbooru
 
     def _api_request(self, params):
         url = self.api_root + "/index.php?page=dapi&s=post&q=index"
@@ -85,55 +86,58 @@
             post["date"] = text.parse_datetime(
                 post["created_at"], "%a %b %d %H:%M:%S %z %Y")
 
+    def _html(self, post):
+        return self.request("{}/index.php?page=post&s=view&id={}".format(
+            self.root, post["id"])).text
+
+    def _tags(self, post, page):
+        tag_container = (text.extr(page, '<ul id="tag-sidebar"', '</ul>') or
+                         text.extr(page, '<ul class="tag-list"', '</ul>'))
+        if not tag_container:
+            return
+
+        tags = collections.defaultdict(list)
+        pattern = re.compile(
+            r"tag-type-([^\"' ]+).*?[?;]tags=([^\"'&]+)", re.S)
+        for tag_type, tag_name in pattern.findall(tag_container):
+            tags[tag_type].append(text.unquote(tag_name))
+        for key, value in tags.items():
+            post["tags_" + key] = " ".join(value)
+
+    def _notes(self, post, page):
+        note_container = text.extr(page, 'id="note-container"', "<img ")
+        if not note_container:
+            return
+
+        post["notes"] = notes = []
+        for note in note_container.split('class="note-box"')[1:]:
+            extr = text.extract_from(note)
+            notes.append({
+                "width" : int(extr("width:", "p")),
+                "height": int(extr("height:", "p")),
+                "y"     : int(extr("top:", "p")),
+                "x"     : int(extr("left:", "p")),
+                "body"  : text.unescape(text.remove_html(extr(">", "</div>"))),
+            })
+
     def _file_url_realbooru(self, post):
         url = post["file_url"]
-        if url.count("/") == 5:
-            md5 = post["md5"]
+        md5 = post["md5"]
+        if md5 not in post["preview_url"] or url.count("/") == 5:
             url = "{}/images/{}/{}/{}.{}".format(
                 self.root, md5[0:2], md5[2:4], md5, url.rpartition(".")[2])
         return url
 
-    def _extended_tags(self, post, page=None):
-        if not page:
-            url = "{}/index.php?page=post&s=view&id={}".format(
-                self.root, post["id"])
-            page = self.request(url).text
-        html = text.extract(page, '