diff -Nru r-cran-stringi-1.1.5/debian/changelog r-cran-stringi-1.1.6/debian/changelog --- r-cran-stringi-1.1.5/debian/changelog 2017-11-07 09:02:36.000000000 +0000 +++ r-cran-stringi-1.1.6/debian/changelog 2018-02-23 08:50:37.000000000 +0000 @@ -1,8 +1,12 @@ -r-cran-stringi (1.1.5-1build1) bionic; urgency=medium +r-cran-stringi (1.1.6-1) unstable; urgency=medium - * No-change rebuild for icu soname change. + * New upstream version + * Standards-Version: 4.1.3 + * debhelper 11 + * Drop unused lintian-override + * Secure URI in watch file - -- Matthias Klose Tue, 07 Nov 2017 09:02:36 +0000 + -- Andreas Tille Fri, 23 Feb 2018 09:50:37 +0100 r-cran-stringi (1.1.5-1) unstable; urgency=medium diff -Nru r-cran-stringi-1.1.5/debian/compat r-cran-stringi-1.1.6/debian/compat --- r-cran-stringi-1.1.5/debian/compat 2017-10-12 20:28:55.000000000 +0000 +++ r-cran-stringi-1.1.6/debian/compat 2018-02-23 08:50:37.000000000 +0000 @@ -1 +1 @@ -10 +11 diff -Nru r-cran-stringi-1.1.5/debian/control r-cran-stringi-1.1.6/debian/control --- r-cran-stringi-1.1.5/debian/control 2017-10-12 20:28:55.000000000 +0000 +++ r-cran-stringi-1.1.6/debian/control 2018-02-23 08:50:37.000000000 +0000 @@ -2,14 +2,14 @@ Maintainer: Debian Med Packaging Team Uploaders: Andreas Tille Section: gnu-r -Priority: optional Testsuite: autopkgtest-pkg-r -Build-Depends: debhelper (>= 10), +Priority: optional +Build-Depends: debhelper (>= 11~), dh-r, libicu-dev, pkg-config, r-base-dev -Standards-Version: 4.1.1 +Standards-Version: 4.1.3 Vcs-Browser: https://anonscm.debian.org/cgit/debian-med/r-cran-stringi.git Vcs-Git: https://anonscm.debian.org/git/debian-med/r-cran-stringi.git Homepage: https://cran.r-project.org/package=stringi diff -Nru r-cran-stringi-1.1.5/debian/source/lintian-overrides r-cran-stringi-1.1.6/debian/source/lintian-overrides --- r-cran-stringi-1.1.5/debian/source/lintian-overrides 2017-10-12 20:28:55.000000000 +0000 +++ r-cran-stringi-1.1.6/debian/source/lintian-overrides 1970-01-01 
00:00:00.000000000 +0000 @@ -1,2 +0,0 @@ -# This is a non-issue since the URL belongs to the license text and should not be changed -r-cran-stringi source: obsolete-url-in-packaging debian/copyright http://code.google.com/p/lao-dictionary/ diff -Nru r-cran-stringi-1.1.5/debian/watch r-cran-stringi-1.1.6/debian/watch --- r-cran-stringi-1.1.5/debian/watch 2017-10-12 20:28:55.000000000 +0000 +++ r-cran-stringi-1.1.6/debian/watch 2018-02-23 08:50:37.000000000 +0000 @@ -1,2 +1,2 @@ -version=3 -http://cran.r-project.org/src/contrib/stringi_([\d.-]*)\.tar\.gz +version=4 +https://cran.r-project.org/src/contrib/stringi_([\d.-]*)\.tar\.gz diff -Nru r-cran-stringi-1.1.5/DESCRIPTION r-cran-stringi-1.1.6/DESCRIPTION --- r-cran-stringi-1.1.5/DESCRIPTION 2017-04-07 13:08:03.000000000 +0000 +++ r-cran-stringi-1.1.6/DESCRIPTION 2017-11-17 13:13:14.000000000 +0000 @@ -1,6 +1,6 @@ Package: stringi -Version: 1.1.5 -Date: 2017-04-07 +Version: 1.1.6 +Date: 2017-11-10 Title: Character String Processing Facilities Description: Allows for fast, correct, consistent, portable, as well as convenient character string/text processing in every locale @@ -27,7 +27,7 @@ Maintainer: Marek Gagolewski RoxygenNote: 6.0.1 NeedsCompilation: yes -Packaged: 2017-04-07 09:50:47 UTC; gagolews +Packaged: 2017-11-10 13:42:04 UTC; gagolews License_is_FOSS: yes Repository: CRAN -Date/Publication: 2017-04-07 13:08:03 UTC +Date/Publication: 2017-11-17 13:13:14 UTC diff -Nru r-cran-stringi-1.1.5/man/stri_compare.Rd r-cran-stringi-1.1.6/man/stri_compare.Rd --- r-cran-stringi-1.1.5/man/stri_compare.Rd 2017-03-31 15:59:05.000000000 +0000 +++ r-cran-stringi-1.1.6/man/stri_compare.Rd 2017-10-31 15:43:04.000000000 +0000 @@ -69,7 +69,7 @@ locale-independent operations: for natural language text processing, in which the notion of canonical equivalence is more valid, this might not be exactly what you are looking for, see Examples. 
-By the way, note that \pkg{stringi} always silently removes UTF-8 +Please note that \pkg{stringi} always silently removes UTF-8 BOMs from input strings, so e.g. \code{stri_cmp_eq} does not take BOMs into account while comparing strings. diff -Nru r-cran-stringi-1.1.5/man/stri_extract.Rd r-cran-stringi-1.1.6/man/stri_extract.Rd --- r-cran-stringi-1.1.5/man/stri_extract.Rd 2017-03-31 15:59:05.000000000 +0000 +++ r-cran-stringi-1.1.6/man/stri_extract.Rd 2017-10-31 15:43:04.000000000 +0000 @@ -135,6 +135,11 @@ stri_extract_first_charclass('AaBbCc', '\\\\p{Ll}') stri_extract_last_charclass('AaBbCc', '\\\\p{Ll}') +\dontrun{ +# emoji support available since ICU 57 +stri_extract_all_charclass(stri_enc_fromutf32(32:55200), "\\\\p{EMOJI}") +} + stri_extract_all_coll(c('AaaaaaaA', 'AAAA'), 'a') stri_extract_first_coll(c('Yy\\u00FD', 'AAA'), 'y', strength=2, locale="sk_SK") stri_extract_last_coll(c('Yy\\u00FD', 'AAA'), 'y', strength=1, locale="sk_SK") diff -Nru r-cran-stringi-1.1.5/man/stri_locate_boundaries.Rd r-cran-stringi-1.1.6/man/stri_locate_boundaries.Rd --- r-cran-stringi-1.1.5/man/stri_locate_boundaries.Rd 2017-03-31 15:59:05.000000000 +0000 +++ r-cran-stringi-1.1.6/man/stri_locate_boundaries.Rd 2017-10-31 15:43:04.000000000 +0000 @@ -84,6 +84,9 @@ stri_locate_all_boundaries(test, type="character") stri_locate_all_words(test) +stri_extract_all_boundaries("Mr. Jones and Mrs. Brown are very happy. +So am I, Prof. Smith.", type="sentence", locale="en_US@ss=standard") # ICU >= 56 only + } \seealso{ Other search_locate: \code{\link{stri_locate_all}}, diff -Nru r-cran-stringi-1.1.5/man/stri_match.Rd r-cran-stringi-1.1.6/man/stri_match.Rd --- r-cran-stringi-1.1.5/man/stri_match.Rd 2017-03-31 15:59:05.000000000 +0000 +++ r-cran-stringi-1.1.6/man/stri_match.Rd 2017-10-31 15:43:04.000000000 +0000 @@ -74,7 +74,7 @@ If no pattern match is detected and \code{omit_no_match=FALSE}, then \code{NA}s are included in the resulting matrix (matrices), see Examples. 
-By the way, \pkg{ICU} regex engine currently does not support named capture groups. +Please note: \pkg{ICU} regex engine currently does not support named capture groups. \code{stri_match}, \code{stri_match_all}, \code{stri_match_first}, and \code{stri_match_last} are convenience functions. diff -Nru r-cran-stringi-1.1.5/man/stringi-encoding.Rd r-cran-stringi-1.1.6/man/stringi-encoding.Rd --- r-cran-stringi-1.1.5/man/stringi-encoding.Rd 2017-03-31 15:59:05.000000000 +0000 +++ r-cran-stringi-1.1.6/man/stringi-encoding.Rd 2017-10-31 15:43:04.000000000 +0000 @@ -62,7 +62,7 @@ for representing Unicode character strings in \R. UTF-8 has ASCII as its subset (code points 1--127 represent the same characters in both of them). Code points larger than 127 are represented by multi-byte sequences -(from 2 to 4 bytes: by the way, not all sequences of bytes are valid UTF-8, +(from 2 to 4 bytes: Please note that not all sequences of bytes are valid UTF-8, cf. \code{\link{stri_enc_isutf8}}). Most of the computations in \pkg{stringi} are performed internally @@ -115,7 +115,8 @@ Character strings in \R (internally) can be declared to be in: \itemize{ \item \code{UTF-8}; -\item \code{latin1}, i.e. ISO-8859-1 (Western European); +\item \code{latin1}, i.e., either ISO-8859-1 (Western European on +Linux, macOS, and other Unixes) or WINDOWS-1252 (Windows); \item \code{bytes} -- for strings that should be manipulated as sequences of bytes. } @@ -126,7 +127,7 @@ \item \code{native} (a.k.a. \code{unknown} in \code{\link{Encoding}}; quite a misleading name: no explicit encoding mark) -- for strings that are assumed to be in your platform's native (default) encoding. -This can represent UTF-8 if you are an OS X user, +This can represent UTF-8 if you are an macOS user, or some 8-bit Windows code page, for example. The native encoding used by \R may be determined by examining the LC_CTYPE category, see \code{\link{Sys.getlocale}}. 
diff -Nru r-cran-stringi-1.1.5/man/stringi-search-charclass.Rd r-cran-stringi-1.1.6/man/stringi-search-charclass.Rd --- r-cran-stringi-1.1.5/man/stringi-search-charclass.Rd 2017-03-31 15:59:05.000000000 +0000 +++ r-cran-stringi-1.1.6/man/stringi-search-charclass.Rd 2017-10-31 15:43:04.000000000 +0000 @@ -233,6 +233,10 @@ \item{\code{CHANGES_WHEN_CASEFOLDED}}{} \item{\code{CHANGES_WHEN_CASEMAPPED}}{} \item{\code{CHANGES_WHEN_NFKC_CASEFOLDED}}{} + \item{\code{EMOJI}}{Since ICU 57} + \item{\code{EMOJI_PRESENTATION}}{Since ICU 57} + \item{\code{EMOJI_MODIFIER}}{Since ICU 57} + \item{\code{EMOJI_MODIFIER_BASE}}{Since ICU 57} } } diff -Nru r-cran-stringi-1.1.5/man/stringi-search-regex.Rd r-cran-stringi-1.1.6/man/stringi-search-regex.Rd --- r-cran-stringi-1.1.5/man/stringi-search-regex.Rd 2017-03-31 15:59:05.000000000 +0000 +++ r-cran-stringi-1.1.6/man/stringi-search-regex.Rd 2017-10-31 15:43:04.000000000 +0000 @@ -87,6 +87,8 @@ just before the current position. Does not alter the input position. The length of possible strings matched by the look-behind pattern must not be unbounded (no \code{*} or \code{+} operators.)} +\item{\code{(?<name>...)}}{Named capture group. The \code{<angle brackets>} + are literal - they appear in the pattern.} \item{\code{(?ismwx-ismwx:...)}}{Flag settings. Evaluate the parenthesized expression with the specified flags enabled or \code{-}disabled, see also \code{\link{stri_opts_regex}}.} @@ -121,6 +123,12 @@ \item{\code{\\f}}{Match a FORM FEED, \code{\\u000C}.} \item{\code{\\G}}{Match if the current position is at the end of the previous match.} +\item{\code{\\h}}{Match a Horizontal White Space character. + They are characters with Unicode General Category of Space_Separator plus + the ASCII tab, \code{\\u0009}. [Since ICU 55]} +\item{\code{\\H}}{Match a non-Horizontal White Space character. + [Since ICU 55]} +\item{\code{\\k<name>}}{Named Capture Back Reference.
[Since ICU 55]} \item{\code{\\n}}{Match a LINE FEED, \code{\\u000A}.} \item{\code{\\N{UNICODE CHARACTER NAME}}}{Match the named character.} \item{\code{\\p{UNICODE PROPERTY NAME}}}{Match any character with the diff -Nru r-cran-stringi-1.1.5/man/stri_opts_brkiter.Rd r-cran-stringi-1.1.6/man/stri_opts_brkiter.Rd --- r-cran-stringi-1.1.5/man/stri_opts_brkiter.Rd 2017-03-31 15:59:05.000000000 +0000 +++ r-cran-stringi-1.1.6/man/stri_opts_brkiter.Rd 2017-10-31 15:43:04.000000000 +0000 @@ -9,8 +9,9 @@ skip_line_hard, skip_sentence_term, skip_sentence_sep, ...) } \arguments{ -\item{type}{single string; break iterator type, one of \code{character}, -\code{line_break}, \code{sentence}, or \code{word}; +\item{type}{single string; either the break iterator type, one of \code{character}, +\code{line_break}, \code{sentence}, \code{word}; +or a custom set of ICU break iteration rules. see \link{stringi-search-boundaries}} \item{locale}{single string, \code{NULL} or \code{""} for default locale} @@ -61,6 +62,11 @@ any special actions on particular types of text boundaries, e.g. in case of the \code{\link{stri_locate_all_boundaries}} and \code{\link{stri_split_boundaries}} functions. + +Note that custom break iterator rules (advanced users only) +should be specified as a single string. +For a detailed description of the syntax of RBBI rules, please refer +to the ICU User Guide on Boundary Analysis. } \references{ \emph{\code{ubrk.h} File Reference} -- ICU4C API Documentation, diff -Nru r-cran-stringi-1.1.5/man/stri_replace.Rd r-cran-stringi-1.1.6/man/stri_replace.Rd --- r-cran-stringi-1.1.5/man/stri_replace.Rd 2017-03-31 15:59:05.000000000 +0000 +++ r-cran-stringi-1.1.6/man/stri_replace.Rd 2017-10-31 15:43:04.000000000 +0000 @@ -116,6 +116,7 @@ of the capture group (their numbering starts from 1). In order to treat the \code{$} character literally, escape it with a backslash. +Moreover, \code{${name}} are used for named capture groups. 
\code{stri_replace}, \code{stri_replace_all}, \code{stri_replace_first}, and \code{stri_replace_last} are convenience functions; they just call @@ -144,6 +145,12 @@ stri_replace_all_regex('123|456|789', '(\\\\p{N}).(\\\\p{N})', '$2-$1') stri_replace_all_regex(c("stringi R", "REXAMINE", "123"), '( R|R.)', ' r ') +\dontrun{ +# named capture groups available since ICU 55 +stri_replace_all_regex("words 123 and numbers 456", + "(?<numbers>[0-9]+)", "!${numbers}!") +} + # Compare the results: stri_replace_all_fixed("The quick brown fox jumped over the lazy dog.", c("quick", "brown", "fox"), c("slow", "black", "bear"), vectorize_all=TRUE) diff -Nru r-cran-stringi-1.1.5/man/stri_split_boundaries.Rd r-cran-stringi-1.1.6/man/stri_split_boundaries.Rd --- r-cran-stringi-1.1.5/man/stri_split_boundaries.Rd 2017-03-31 15:59:05.000000000 +0000 +++ r-cran-stringi-1.1.6/man/stri_split_boundaries.Rd 2017-10-31 15:43:04.000000000 +0000 @@ -70,7 +70,7 @@ # filtered break iterator with the new ICU: stri_split_boundaries("Mr. Jones and Mrs. Brown are very happy. -So am I, Prof. Smith.", type="sentence", locale="@ss=standard") # ICU >= 56 only +So am I, Prof.
Smith.", type="sentence", locale="en_US@ss=standard") # ICU >= 56 only } \seealso{ diff -Nru r-cran-stringi-1.1.5/man/stri_sub.Rd r-cran-stringi-1.1.6/man/stri_sub.Rd --- r-cran-stringi-1.1.5/man/stri_sub.Rd 2017-03-31 15:59:05.000000000 +0000 +++ r-cran-stringi-1.1.6/man/stri_sub.Rd 2017-10-31 15:43:04.000000000 +0000 @@ -18,8 +18,8 @@ \item{length}{integer vector; mutually exclusive with \code{to} and \code{from} being a matrix} -\item{omit_na}{single logical value; if \code{TRUE}, missing values in \code{from}, -\code{to}, or \code{length} will result in an unchanged input; replacement function only} +\item{omit_na}{single logical value; if \code{TRUE}, missing values in any of the arguments +provided will result in an unchanged input; replacement function only} \item{value}{character vector to be substituted with; replacement function only} } diff -Nru r-cran-stringi-1.1.5/man/stri_trans_general.Rd r-cran-stringi-1.1.6/man/stri_trans_general.Rd --- r-cran-stringi-1.1.5/man/stri_trans_general.Rd 2017-03-31 15:59:05.000000000 +0000 +++ r-cran-stringi-1.1.6/man/stri_trans_general.Rd 2017-10-31 15:43:04.000000000 +0000 @@ -59,6 +59,9 @@ stri_trans_general("stringi", "upper") # see stri_trans_toupper stri_trans_general("\\u0104", "nfd; lower") # compound id; see stri_trans_nfd stri_trans_general("tato nie wraca ranki wieczory", "pl-pl_FONIPA") +stri_trans_general("\\u2620", "any-name") # character name +stri_trans_general("\\\\N{latin small letter a}", "name-any") # decode name +stri_trans_general("\\u2620", "hex") # to hex } \references{ \emph{General Transforms} -- ICU User Guide, diff -Nru r-cran-stringi-1.1.5/MD5 r-cran-stringi-1.1.6/MD5 --- r-cran-stringi-1.1.5/MD5 2017-04-07 13:08:03.000000000 +0000 +++ r-cran-stringi-1.1.6/MD5 2017-11-17 13:13:14.000000000 +0000 @@ -1,13 +1,13 @@ -385c56c3e20a13d922d972602623aeb8 *DESCRIPTION +859f51fe06d9eb47bb07cccf5a662fd3 *DESCRIPTION ee8b16f29c99477beb6d828fe6e91595 *INSTALL 7046c04fd6bdbea3546adb75e91cb6db *LICENSE 
913e8dae7f4b5710551952537fea0916 *NAMESPACE -d4852820ea7b69f70c58a3df90ba82b6 *NEWS +15e3ca5a10873dd2e015072750ed4a93 *NEWS c81ac36a300bf461a0f29a37040defa1 *R/ICU_settings.R -9bdd87ab2a81cda87f488233507e6aa1 *R/compare.R +4621b9121f0d652b518f773d8363a1a1 *R/compare.R bf6f90295ce7e54700b045e587a0e087 *R/draft_encdetect.R 91353e5979961a59e44da8ff898d44e8 *R/draft_files.R -1a9f79d94b2ef225b5839b8e2a2ca74e *R/encoding.R +4a05a8c32ff6a74bc33ec44c943e8c72 *R/encoding.R f8eae4bb15945176cde77cb727cf51a3 *R/encoding_conversion.R a7eac5951ea21c4c30066f4e56459aec *R/encoding_detection.R 0a96a5c4303fff13da9b113e5f2bce5a *R/encoding_management.R @@ -19,29 +19,29 @@ 09b4b730a4e1a0cf21f81085aca019e2 *R/length.R dd68e339c8809f97f5558289e6fa9028 *R/locale.R 4c3258077a514647593f06109e132e5a *R/locale_management.R -71775f48ad1d507e47f9c72d7cb84b82 *R/opts.R +4a90a044e0e586e375c198071696eecc *R/opts.R 4ad424314866e784b45aa5de52e3f90d *R/pad.R bcbc6c83a2e3b3f206aba2841ed9a881 *R/random.R 91f24c91cc37ced12dbbc731c759320a *R/reverse.R -710e851377719659c1504291238a8da4 *R/search.R +e47b41485b3f6e6f4a67766c299e0409 *R/search.R c173af972d96b772f1d68fefbc9fd489 *R/search_count_4.R ca2f35225446f55879f3f3b2637d5ba7 *R/search_count_bound.R 608c4e3646888367802a3d4c3c629b12 *R/search_detect_4.R -0578e1734bc2c497ff5f7074ad287f5e *R/search_extract_4.R +212359c8b3994747afc504fcb1b21579 *R/search_extract_4.R 40693bf761a48b16585b3a5912eb3238 *R/search_extract_bound.R f3aacf1e82e7c1904b5b1480b702cfb2 *R/search_locate_4.R -19795d881dd30fae5f3e3a1047f42b0b *R/search_locate_bound.R -e5c8233ee4ed93dd1c64301536e0abe6 *R/search_match_4.R -2476f38d0531ad288c27998b48e02049 *R/search_replace_4.R +bae110a847b15adbe9d4ad1fac40055b *R/search_locate_bound.R +d415b4b9bebc0db8b8d86aad62347409 *R/search_match_4.R +cda8a3da5ae053a33a40309b22fd0435 *R/search_replace_4.R f75af2938ebc9b25d087f24338c34f46 *R/search_replace_na.R cc0d7f5509fd2999024b1021a1b9388d *R/search_split_4.R -98127b2026b5c296e0e76fb638612fe8 
*R/search_split_bound.R +1176398bb4cdc265ce1e3d71395792ae *R/search_split_bound.R c53f03926544896f382189fb70a6a8f2 *R/search_startsendswith_4.R 2d618c7d5a81cccb5fe7fbe78598015c *R/search_subset_4.R dc8cae5f0b0b5e615b285909595a6d79 *R/sort.R c7918d77003afbdc7b73617e12cd522b *R/stats.R 3e270d5c0da94d4158cb440d3c87aecf *R/stringi-package.R -79a1799eaafdb9da46df272d89b52bb1 *R/sub.R +411b04e3f9bd810443eadb1e82d94910 *R/sub.R 0c9439ca1f4948216944a8f3888b148f *R/time_calendar.R 02f3bd9e2e61b34b1ab264f40c7c571c *R/time_format.R b38a9455e88df9aaf78f640c3021e895 *R/time_symbols.R @@ -54,7 +54,7 @@ 31e14d3e1bfdcb0f502d4edc042e1478 *R/trans_casemap.R 2f4579b7add274ff04ad496251f74eae *R/trans_normalization.R c2b9cff7e4a3930d0e415f77fb440dc8 *R/trans_other.R -f37ba50803cdf6e6254a5fd35480d78e *R/trans_transliterate.R +5bf8ecd5bc168c924bdcb38b27ea087f *R/trans_transliterate.R 7f6ed76b2b142b811293a23fe109a77b *R/trim.R 9a39ed292c422093c87b321b08e04c1d *R/utils.R 726393464c54b2e68439fafc37177f1e *R/wrap.R @@ -65,7 +65,7 @@ dbff8d774fa477bd3b2025e8d73efd2c *inst/CITATION 15588b9d62e1c296d432a8e26983d4dd *man/oper_comparison.Rd dc278dae56fde988bd69a5bd60b6c375 *man/oper_plus.Rd -44594f86307e4d3ffe16c0e01789067d *man/stri_compare.Rd +f11800dd14071eea84fa531e9ef70cab *man/stri_compare.Rd 5e1b9b6176078b30c3b57217880bfc10 *man/stri_count.Rd f986ea1bfd8be8a78531c3732e426437 *man/stri_count_boundaries.Rd 02955a6d446b84214e52b22837ea1da9 *man/stri_datetime_add.Rd @@ -94,7 +94,7 @@ c324fa2e7f63b3d7a37978e02e46602f *man/stri_enc_toutf8.Rd 3b45913a491338b87b79fc14e728076b *man/stri_encode.Rd e8fa1792aad1bb9273cd156292f284d9 *man/stri_escape_unicode.Rd -c7db9e1f65d4d5d8d3b8e9c9221eef1f *man/stri_extract.Rd +b6dec4a90c4ba08103577c28699ea516 *man/stri_extract.Rd 881695763d0df0b4a87f881b9a40bc00 *man/stri_extract_boundaries.Rd 2b1241b500861ca0db508a028712e446 *man/stri_flatten.Rd 7ededaa6dfab59a9c1707ba56f3def0c *man/stri_info.Rd @@ -107,10 +107,10 @@ e78d1d8391a76c574bda1ebca0e9a328 
*man/stri_locale_list.Rd e5757a261f35d2a043bae01c1aaac82f *man/stri_locale_set.Rd e62f5b726e95f761ea8096ff8d9273ad *man/stri_locate.Rd -79aab9ce68426397df8126a999d25a88 *man/stri_locate_boundaries.Rd -e172b0704268e508df5a12405b96d756 *man/stri_match.Rd +d79eef6d3e2df3a26879b9be6560e804 *man/stri_locate_boundaries.Rd +eb0c5404ecc362b41002d5fcef9ecf09 *man/stri_match.Rd 5d875353f4d641f44767823cae7b04af *man/stri_numbytes.Rd -4ec55f96b3f67373624c09c71e6a30bc *man/stri_opts_brkiter.Rd +9c96f754fe30169e399954a06de8d858 *man/stri_opts_brkiter.Rd 2d399c2525262543bad90715db294bff *man/stri_opts_collator.Rd 5e9fba621a6b886e5591e648e97b131e *man/stri_opts_fixed.Rd b95486d98d0b565088c9bd393ee8bea1 *man/stri_opts_regex.Rd @@ -121,23 +121,23 @@ ad18ad7cc26177dabd21d6fa2031a139 *man/stri_rand_strings.Rd bf8c8c59784e055aaeded1a7c835a6ed *man/stri_read_lines.Rd 58a84bca88229750b5825c60a257ab5b *man/stri_read_raw.Rd -8c1327a34aae583175ae0c29ed1d49d9 *man/stri_replace.Rd +234a4695a1dc511feb931573d4635c08 *man/stri_replace.Rd 3347d2f732147a3e2b4750fc522c45b8 *man/stri_replace_na.Rd b7465654101c1c902c128aeac1dd6248 *man/stri_reverse.Rd 34f63546939ed8ffc8878dd511c8bfae *man/stri_split.Rd -961edd3f060aa8781af83e6d4e2376ae *man/stri_split_boundaries.Rd +1e3d76748ccc970b4d35a610a31825cc *man/stri_split_boundaries.Rd b96f311fe838a34b2a19532f7aeb7831 *man/stri_split_lines.Rd 8a522a8ea9178020ebb1930c2a5b4156 *man/stri_startsendswith.Rd 1772c43e509a6623b44f69d39d3906be *man/stri_stats_general.Rd c9c7574157005d6eb78719c1993113aa *man/stri_stats_latex.Rd -011623e4bfaaefa0fcdd5b163d0437d2 *man/stri_sub.Rd +8a3f5cd71a291a61055a936c49e119a1 *man/stri_sub.Rd f50bbb055cea299044207a6bed8b9296 *man/stri_subset.Rd ada401a07b12245bdd175d5c37e28a97 *man/stri_timezone_info.Rd 33f6fa48c85c31a88c0bc63c566fd7f3 *man/stri_timezone_list.Rd fe0ab8db348bc2287923f361702e85d8 *man/stri_timezone_set.Rd 1c7cc14e3ee5464c30320983205506e2 *man/stri_trans_casemap.Rd 1b5b7ec0b1454f1c84ea4c1c1efdef84 
*man/stri_trans_char.Rd -43a549d49f4ea1fdcb38a135b0393825 *man/stri_trans_general.Rd +90aa916c05aab1ad499d18a2333b641c *man/stri_trans_general.Rd 80a4d51ef171c4aae0b861d83167a46a *man/stri_trans_list.Rd eb7a3e54d887d0890d0ec0d1bbfb7ea6 *man/stri_trans_nf.Rd f3f5b310309644d7ff9337ba816a7b41 *man/stri_trim.Rd @@ -147,14 +147,14 @@ 97abe47392cd57a37c9d098100f41211 *man/stri_wrap.Rd 31afcd570890fc4d3ad70a522849a93b *man/stri_write_lines.Rd d15ba71c5da0f7de2789b1917502b2f5 *man/stringi-arguments.Rd -d6bb05c10abd98b24ecf3e86cc45d4da *man/stringi-encoding.Rd +9d2734bfc39ba6b42cea217be9a6275c *man/stringi-encoding.Rd 56bd0b4e9aaf3d7061648a861b9e1d80 *man/stringi-locale.Rd 7787514f84c5f89b0b01ec9a6aa5c1b2 *man/stringi-package.Rd e5b72c2a9febcbc14357b52233be3095 *man/stringi-search-boundaries.Rd -93b7360481471e1fdfe3fe783d01eb09 *man/stringi-search-charclass.Rd +af1466a8f8c3bafaf4a178c5b42311e0 *man/stringi-search-charclass.Rd 38e7e8d6dc3b294f4cb919a21b5f063a *man/stringi-search-coll.Rd 134d2f24fe61334ec6f2c1346fd9f0bb *man/stringi-search-fixed.Rd -36ca45bd29defd5a4f23879d4ecc8d28 *man/stringi-search-regex.Rd +9a27a57fbf64e3ac5e4be6a546324346 *man/stringi-search-regex.Rd 91e81faefac576b549bcb172bf88d8ab *man/stringi-search.Rd ecde906ce156481a413325ce9fcdd2ac *src/Makevars.in fc24b0aa2722078367c49a673d045999 *src/Makevars.win @@ -923,15 +923,15 @@ ba647fe26a1b79bff385787aa18e37ec *src/stri_TODO_justify.cpp 4d94406e0667bd12d230e388360d6f46 *src/stri_TODO_splitpos.cpp 477ecb853266ee5927f6501cff3b9120 *src/stri_TODO_uchar.cpp -d4fd984caed48c2df709f6f829f04eed *src/stri_brkiter.cpp -13b912adffac61cb012756e35b853293 *src/stri_brkiter.h +4691409467ac41717c073b99a6edbf87 *src/stri_brkiter.cpp +04f6b4676de9c9bf42601b3cdaafae4e *src/stri_brkiter.h 3deace6ffb49ef4b5e0f89facbf6c72f *src/stri_bytesearch_matcher.h -d118eeb04f21bd5d6cd6e07906fb7341 *src/stri_collator.cpp +adc29bc40c5508214babc88abd92528d *src/stri_collator.cpp c0c2c4cca6916764e0cd983cb2f3e98a *src/stri_common.cpp 
afb99cfbd73d7b7c9dd29d0619d106e1 *src/stri_compare.cpp 570f14b8a343e3cb18cd090e064ce96d *src/stri_container_base.cpp 63b7b957013fdc44fc180904e93adebb *src/stri_container_base.h -c3d4173d85dcddb0a522b1866a4f4a66 *src/stri_container_bytesearch.cpp +a0aa699dbb068767b40193ba096a9810 *src/stri_container_bytesearch.cpp 2ecf67bd1bcd1d3cf31841905a2368da *src/stri_container_bytesearch.h ed7b4027ac81e8854445bfaa488ad952 *src/stri_container_charclass.h a138d27d76c6d16566ab6de8de7fe497 *src/stri_container_double.h @@ -943,13 +943,13 @@ 0270c7327c6bbaa0c84c0739be677905 *src/stri_container_listutf8.cpp 40297ed82be0ca2093c72b792f99ccad *src/stri_container_listutf8.h 07e04c0688fc4a369d5aa68687b5ce94 *src/stri_container_logical.h -f3957ade52810e77bc2df5a0a397393b *src/stri_container_regex.cpp +5a6514a388b29e9b4f01757d45f517a3 *src/stri_container_regex.cpp 0925b615641dedc3c727680020f5173a *src/stri_container_regex.h 8a99105a3e8ae6cdcf30b199878dc766 *src/stri_container_usearch.cpp 4ff13df08fded2c1f6224c48855258b3 *src/stri_container_usearch.h -9216bb4e70a63401e74dbfef268542e6 *src/stri_container_utf16.cpp +8ce5acfb2a1a122ed00d5cada13df5c6 *src/stri_container_utf16.cpp f0d8889b042b30059f2b9524765e5d29 *src/stri_container_utf16.h -021b6bd4873784eff92680d0cf57bc5d *src/stri_container_utf8.cpp +9599a71c2b52e5997db888a48378c6b5 *src/stri_container_utf8.cpp 0e24cdb965811e55ee0ec360751bac54 *src/stri_container_utf8.h 4535e78a2bdf56cb324ce149634f8063 *src/stri_container_utf8_indexable.cpp c2d2d5e31de68aa8c278b7533c5f261e *src/stri_container_utf8_indexable.h @@ -957,7 +957,7 @@ 3da4ddc93586b26385a41f8a3a57b0fe *src/stri_encoding_conversion.cpp f262c6635c383fd8f04511235a9d3a2e *src/stri_encoding_detection.cpp 35b78673d352b72755cfafbea26aff13 *src/stri_encoding_management.cpp -c8cfd861ba2aff8eaef2f72723f02d3b *src/stri_escape.cpp +7c918c6eb9a300fd1e9ea04f4a213e9b *src/stri_escape.cpp 88faae74025909c16637fef77eff7583 *src/stri_exception.cpp 6bc8b3d28aa6215d74011a918b5cc82c *src/stri_exception.h 
2d5b3d10ac8010363a8822c52f5c05c7 *src/stri_exports.h @@ -969,11 +969,11 @@ 3219e9f771bc5be77adb6f3c4ef0a5f3 *src/stri_macros.h 7cf89193ccce3a2d684e61225f132074 *src/stri_messages.h d36cc491a7ecf2ef1ce1c03d4cee299a *src/stri_pad.cpp -53abfefd98aac3ddfabcd0f55fda557a *src/stri_prepare_arg.cpp +9b3dda57df8d0d6fa6b2c6ecd986b17c *src/stri_prepare_arg.cpp 9e1a522240a9e1600b3f979e062a4976 *src/stri_random.cpp 8ed9cc4f769fd6e8acab340f5b29f5ea *src/stri_reverse.cpp 67ef54f7230a594f304ef8a19648053d *src/stri_search_boundaries_count.cpp -1fa039e012943b7c2f9c9227955e4596 *src/stri_search_boundaries_extract.cpp +ab1ac2301718372bcec5035384fcdb90 *src/stri_search_boundaries_extract.cpp 725f38b39ec808719ddc8e75ffb48e34 *src/stri_search_boundaries_locate.cpp 7f20b7851b1836ed523823a54157dd27 *src/stri_search_boundaries_split.cpp 04cb1d51acf451616c1e417fe81e6dfc *src/stri_search_class_count.cpp @@ -1018,8 +1018,8 @@ 394c1c37fe396d80a021cfae56b4a8ea *src/stri_string8.h 9ec3eb328720a9da00f332dc87476b09 *src/stri_string8buf.h fcfba2b7136f32811c0242c29c916531 *src/stri_stringi.cpp -4c0124f46b8dfbf713e963f449e6b143 *src/stri_stringi.h -2e5d441db9e6c7a221137c25947c3340 *src/stri_sub.cpp +7c39270728a2e88c03eb0f405a8e4d5e *src/stri_stringi.h +7655d14c18ab03f361acfc395de4187a *src/stri_sub.cpp 1a548da707b9cc1de5537e3f554eae4a *src/stri_test.cpp bd10dcf550770f9d399ab6433784e4d2 *src/stri_time_calendar.cpp cefb59d556a129b1d1ad586a59f65499 *src/stri_time_format.cpp @@ -1030,7 +1030,7 @@ 6b6c7ba085629e1cee10dc704fde9607 *src/stri_trans_other.cpp b3a9037ac7945a82bbc6406457bf8c74 *src/stri_trans_transliterate.cpp 844cbcc567b5fb3bcb38dd9042dd7da8 *src/stri_ucnv.cpp -d9d08e5b0d7eb16bdb00c558ddfe96d6 *src/stri_ucnv.h +6bbbc7fabed08770b2701f4233e648c7 *src/stri_ucnv.h b247553b0afead8c6407f54c199abd3f *src/stri_uloc.cpp 8ae2c03485913996d36a769d74aea535 *src/stri_utils.cpp 55110e1f35df17cbccd878bcaf8e2982 *src/stri_wrap.cpp diff -Nru r-cran-stringi-1.1.5/NEWS r-cran-stringi-1.1.6/NEWS --- 
r-cran-stringi-1.1.5/NEWS 2017-04-07 09:44:26.000000000 +0000 +++ r-cran-stringi-1.1.6/NEWS 2017-11-10 11:22:30.000000000 +0000 @@ -1,6 +1,23 @@ stringi package NEWS and CHANGELOG =============================================================================== +## 1.1.6 (2017-11-10) **CRAN** + +* [WINDOWS SPECIFIC] #270: Strings marked with `latin1` encoding +are now converted internally to UTF-8 using the WINDOWS-1252 codec. +This fixes problems with - among others - displaying the Euro sign. + +* [NEW FEATURE] #263: Add support for custom rule-based break iteration, +see `?stri_opts_brkiter`. + +* [NEW FEATURE] #267: `omit_na=TRUE` in `stri_sub<-` now ignores missing values +in any of the arguments provided. + +* [BUGFIX] fixed unPROTECTed variable names and stack imbalances +as reported by rchk + +------------------------------------------------------------------------------- + ## 1.1.5 (2017-04-07) **CRAN** * [GENERAL] `stringi` now requires ICU4C >= 52. diff -Nru r-cran-stringi-1.1.5/R/compare.R r-cran-stringi-1.1.6/R/compare.R --- r-cran-stringi-1.1.5/R/compare.R 2017-03-21 12:30:29.000000000 +0000 +++ r-cran-stringi-1.1.6/R/compare.R 2017-10-31 15:43:00.000000000 +0000 @@ -49,7 +49,7 @@ #' locale-independent operations: for natural language text processing, #' in which the notion of canonical equivalence is more valid, this might #' not be exactly what you are looking for, see Examples. -#' By the way, note that \pkg{stringi} always silently removes UTF-8 +#' Please note that \pkg{stringi} always silently removes UTF-8 #' BOMs from input strings, so e.g. \code{stri_cmp_eq} does not take #' BOMs into account while comparing strings. #' diff -Nru r-cran-stringi-1.1.5/R/encoding.R r-cran-stringi-1.1.6/R/encoding.R --- r-cran-stringi-1.1.5/R/encoding.R 2017-03-21 12:31:05.000000000 +0000 +++ r-cran-stringi-1.1.6/R/encoding.R 2017-10-31 15:43:00.000000000 +0000 @@ -90,7 +90,7 @@ #' for representing Unicode character strings in \R. 
UTF-8 has ASCII as its #' subset (code points 1--127 represent the same characters in both of them). #' Code points larger than 127 are represented by multi-byte sequences -#' (from 2 to 4 bytes: by the way, not all sequences of bytes are valid UTF-8, +#' (from 2 to 4 bytes: Please note that not all sequences of bytes are valid UTF-8, #' cf. \code{\link{stri_enc_isutf8}}). #' #' Most of the computations in \pkg{stringi} are performed internally @@ -142,7 +142,8 @@ #' Character strings in \R (internally) can be declared to be in: #' \itemize{ #' \item \code{UTF-8}; -#' \item \code{latin1}, i.e. ISO-8859-1 (Western European); +#' \item \code{latin1}, i.e., either ISO-8859-1 (Western European on +#' Linux, macOS, and other Unixes) or WINDOWS-1252 (Windows); #' \item \code{bytes} -- for strings that #' should be manipulated as sequences of bytes. #' } @@ -153,7 +154,7 @@ #' \item \code{native} (a.k.a. \code{unknown} in \code{\link{Encoding}}; #' quite a misleading name: no explicit encoding mark) -- for #' strings that are assumed to be in your platform's native (default) encoding. -#' This can represent UTF-8 if you are an OS X user, +#' This can represent UTF-8 if you are an macOS user, #' or some 8-bit Windows code page, for example. #' The native encoding used by \R may be determined by examining #' the LC_CTYPE category, see \code{\link{Sys.getlocale}}. diff -Nru r-cran-stringi-1.1.5/R/opts.R r-cran-stringi-1.1.6/R/opts.R --- r-cran-stringi-1.1.5/R/opts.R 2017-03-21 12:36:50.000000000 +0000 +++ r-cran-stringi-1.1.6/R/opts.R 2017-10-31 15:43:00.000000000 +0000 @@ -106,9 +106,9 @@ #' stri_cmp("above mentioned", "above-mentioned") #' stri_cmp("above mentioned", "above-mentioned", alternate_shifted=TRUE) stri_opts_collator <- function(locale=NULL, strength=3L, - alternate_shifted=FALSE, french=FALSE, - uppercase_first=NA, case_level=FALSE, - normalization=FALSE, numeric=FALSE, ...) 
+ alternate_shifted=FALSE, french=FALSE, + uppercase_first=NA, case_level=FALSE, + normalization=FALSE, numeric=FALSE, ...) { opts <- list() if (!missing(locale)) opts["locale"] <- locale @@ -213,8 +213,14 @@ #' in case of the \code{\link{stri_locate_all_boundaries}} and #' \code{\link{stri_split_boundaries}} functions. #' -#' @param type single string; break iterator type, one of \code{character}, -#' \code{line_break}, \code{sentence}, or \code{word}; +#' Note that custom break iterator rules (advanced users only) +#' should be specified as a single string. +#' For a detailed description of the syntax of RBBI rules, please refer +#' to the ICU User Guide on Boundary Analysis. +#' +#' @param type single string; either the break iterator type, one of \code{character}, +#' \code{line_break}, \code{sentence}, \code{word}; +#' or a custom set of ICU break iteration rules. #' see \link{stringi-search-boundaries} #' @param locale single string, \code{NULL} or \code{""} for default locale #' @param skip_word_none logical; perform no action for "words" that diff -Nru r-cran-stringi-1.1.5/R/search_extract_4.R r-cran-stringi-1.1.6/R/search_extract_4.R --- r-cran-stringi-1.1.5/R/search_extract_4.R 2017-03-21 12:36:12.000000000 +0000 +++ r-cran-stringi-1.1.6/R/search_extract_4.R 2017-10-31 15:43:00.000000000 +0000 @@ -103,6 +103,11 @@ #' stri_extract_first_charclass('AaBbCc', '\\p{Ll}') #' stri_extract_last_charclass('AaBbCc', '\\p{Ll}') #' +#' \dontrun{ +#' # emoji support available since ICU 57 +#' stri_extract_all_charclass(stri_enc_fromutf32(32:55200), "\\p{EMOJI}") +#' } +#' #' stri_extract_all_coll(c('AaaaaaaA', 'AAAA'), 'a') #' stri_extract_first_coll(c('Yy\u00FD', 'AAA'), 'y', strength=2, locale="sk_SK") #' stri_extract_last_coll(c('Yy\u00FD', 'AAA'), 'y', strength=1, locale="sk_SK") diff -Nru r-cran-stringi-1.1.5/R/search_locate_bound.R r-cran-stringi-1.1.6/R/search_locate_bound.R --- r-cran-stringi-1.1.5/R/search_locate_bound.R 2017-03-21 12:36:07.000000000 +0000 
+++ r-cran-stringi-1.1.6/R/search_locate_bound.R 2017-10-31 15:43:00.000000000 +0000 @@ -90,6 +90,9 @@ #' stri_locate_all_boundaries(test, type="character") #' stri_locate_all_words(test) #' +#' stri_extract_all_boundaries("Mr. Jones and Mrs. Brown are very happy. +#' So am I, Prof. Smith.", type="sentence", locale="en_US@ss=standard") # ICU >= 56 only +#' #' @export #' @family search_locate #' @family indexing diff -Nru r-cran-stringi-1.1.5/R/search_match_4.R r-cran-stringi-1.1.6/R/search_match_4.R --- r-cran-stringi-1.1.5/R/search_match_4.R 2017-03-21 12:36:05.000000000 +0000 +++ r-cran-stringi-1.1.6/R/search_match_4.R 2017-10-31 15:43:00.000000000 +0000 @@ -44,7 +44,7 @@ #' If no pattern match is detected and \code{omit_no_match=FALSE}, #' then \code{NA}s are included in the resulting matrix (matrices), see Examples. #' -#' By the way, \pkg{ICU} regex engine currently does not support named capture groups. +#' Please note: \pkg{ICU} regex engine currently does not support named capture groups. #' #' \code{stri_match}, \code{stri_match_all}, \code{stri_match_first}, #' and \code{stri_match_last} are convenience functions. diff -Nru r-cran-stringi-1.1.5/R/search.R r-cran-stringi-1.1.6/R/search.R --- r-cran-stringi-1.1.5/R/search.R 2017-03-21 12:36:44.000000000 +0000 +++ r-cran-stringi-1.1.6/R/search.R 2017-10-31 15:43:00.000000000 +0000 @@ -185,6 +185,8 @@ #' just before the current position. Does not alter the input position. #' The length of possible strings matched by the look-behind pattern must #' not be unbounded (no \code{*} or \code{+} operators.)} +#' \item{\code{(?<name>...)}}{Named capture group. The \code{<angle brackets>} +#' are literal - they appear in the pattern.} #' \item{\code{(?ismwx-ismwx:...)}}{Flag settings. 
Evaluate the parenthesized #' expression with the specified flags enabled or \code{-}disabled, #' see also \code{\link{stri_opts_regex}}.} @@ -218,6 +220,12 @@ #' \item{\code{\\f}}{Match a FORM FEED, \code{\\u000C}.} #' \item{\code{\\G}}{Match if the current position is at the end of the #' previous match.} +#' \item{\code{\\h}}{Match a Horizontal White Space character. +#' They are characters with Unicode General Category of Space_Separator plus +#' the ASCII tab, \code{\\u0009}. [Since ICU 55]} +#' \item{\code{\\H}}{Match a non-Horizontal White Space character. +#' [Since ICU 55]} +#' \item{\code{\\k<name>}}{Named Capture Back Reference. [Since ICU 55]} #' \item{\code{\\n}}{Match a LINE FEED, \code{\\u000A}.} #' \item{\code{\\N{UNICODE CHARACTER NAME}}}{Match the named character.} #' \item{\code{\\p{UNICODE PROPERTY NAME}}}{Match any character with the @@ -613,6 +621,10 @@ #' \item{\code{CHANGES_WHEN_CASEFOLDED}}{} #' \item{\code{CHANGES_WHEN_CASEMAPPED}}{} #' \item{\code{CHANGES_WHEN_NFKC_CASEFOLDED}}{} +#' \item{\code{EMOJI}}{Since ICU 57} +#' \item{\code{EMOJI_PRESENTATION}}{Since ICU 57} +#' \item{\code{EMOJI_MODIFIER}}{Since ICU 57} +#' \item{\code{EMOJI_MODIFIER_BASE}}{Since ICU 57} #' } #' #' diff -Nru r-cran-stringi-1.1.5/R/search_replace_4.R r-cran-stringi-1.1.6/R/search_replace_4.R --- r-cran-stringi-1.1.5/R/search_replace_4.R 2017-03-21 12:36:04.000000000 +0000 +++ r-cran-stringi-1.1.6/R/search_replace_4.R 2017-10-31 15:43:00.000000000 +0000 @@ -59,6 +59,7 @@ #' of the capture group (their numbering starts from 1). #' In order to treat the \code{$} character literally, #' escape it with a backslash. +#' Moreover, \code{${name}} are used for named capture groups. 
#' #' \code{stri_replace}, \code{stri_replace_all}, \code{stri_replace_first}, #' and \code{stri_replace_last} are convenience functions; they just call @@ -111,6 +112,12 @@ #' stri_replace_all_regex('123|456|789', '(\\p{N}).(\\p{N})', '$2-$1') #' stri_replace_all_regex(c("stringi R", "REXAMINE", "123"), '( R|R.)', ' r ') #' +#' \dontrun{ +#' # named capture groups available since ICU 55 +#' stri_replace_all_regex("words 123 and numbers 456", +#' "(?<numbers>[0-9]+)", "!${numbers}!") +#' } +#' #' # Compare the results: #' stri_replace_all_fixed("The quick brown fox jumped over the lazy dog.", #' c("quick", "brown", "fox"), c("slow", "black", "bear"), vectorize_all=TRUE) diff -Nru r-cran-stringi-1.1.5/R/search_split_bound.R r-cran-stringi-1.1.6/R/search_split_bound.R --- r-cran-stringi-1.1.5/R/search_split_bound.R 2017-03-21 12:35:59.000000000 +0000 +++ r-cran-stringi-1.1.6/R/search_split_bound.R 2017-10-31 15:43:00.000000000 +0000 @@ -153,7 +153,7 @@ #' #' # filtered break iterator with the new ICU: #' stri_split_boundaries("Mr. Jones and Mrs. Brown are very happy. -#' So am I, Prof. Smith.", type="sentence", locale="@ss=standard") # ICU >= 56 only +#' So am I, Prof. 
Smith.", type="sentence", locale="en_US@ss=standard") # ICU >= 56 only #' #' @export #' @family search_split diff -Nru r-cran-stringi-1.1.5/R/sub.R r-cran-stringi-1.1.6/R/sub.R --- r-cran-stringi-1.1.5/R/sub.R 2017-03-21 12:35:50.000000000 +0000 +++ r-cran-stringi-1.1.6/R/sub.R 2017-10-31 15:43:00.000000000 +0000 @@ -74,8 +74,8 @@ #' @param from integer vector or two-column matrix #' @param to integer vector; mutually exclusive with \code{length} and \code{from} being a matrix #' @param length integer vector; mutually exclusive with \code{to} and \code{from} being a matrix -#' @param omit_na single logical value; if \code{TRUE}, missing values in \code{from}, -#' \code{to}, or \code{length} will result in an unchanged input; replacement function only +#' @param omit_na single logical value; if \code{TRUE}, missing values in any of the arguments +#' provided will result in an unchanged input; replacement function only #' @param value character vector to be substituted with; replacement function only #' #' diff -Nru r-cran-stringi-1.1.5/R/trans_transliterate.R r-cran-stringi-1.1.6/R/trans_transliterate.R --- r-cran-stringi-1.1.5/R/trans_transliterate.R 2017-03-21 12:35:18.000000000 +0000 +++ r-cran-stringi-1.1.6/R/trans_transliterate.R 2017-10-31 15:43:00.000000000 +0000 @@ -90,6 +90,9 @@ #' stri_trans_general("stringi", "upper") # see stri_trans_toupper #' stri_trans_general("\u0104", "nfd; lower") # compound id; see stri_trans_nfd #' stri_trans_general("tato nie wraca ranki wieczory", "pl-pl_FONIPA") +#' stri_trans_general("\u2620", "any-name") # character name +#' stri_trans_general("\\N{latin small letter a}", "name-any") # decode name +#' stri_trans_general("\u2620", "hex") # to hex stri_trans_general <- function(str, id) { .Call(C_stri_trans_general, str, id) } diff -Nru r-cran-stringi-1.1.5/src/stri_brkiter.cpp r-cran-stringi-1.1.6/src/stri_brkiter.cpp --- r-cran-stringi-1.1.5/src/stri_brkiter.cpp 2017-04-07 09:50:48.000000000 +0000 +++ 
r-cran-stringi-1.1.6/src/stri_brkiter.cpp 2017-11-10 13:42:05.000000000 +0000 @@ -49,6 +49,9 @@ * * @version 0.4-1 (Marek Gagolewski, 2014-12-02) * moved to the StriBrkIterOptions class + * + * @version 1.1.6 (Marek Gagolewski, 2017-04-22) + * Add support for RBBI */ void StriBrkIterOptions::setType(SEXP opts_brkiter, const char* _default) { const char* type_opts[] = {"character", "line_break", "sentence", "word", NULL}; @@ -68,14 +71,19 @@ Rf_error(MSG__INCORRECT_BRKITER_OPTION_SPEC); // error() allowed here const char* curname = CHAR(STRING_ELT(names, i)); if (!strcmp(curname, "type")) { - SEXP curval; - PROTECT(curval = stri_prepare_arg_string_1(VECTOR_ELT(opts_brkiter, i), "type")); + SEXP curval, curval2; + PROTECT(curval2 = stri_enc_toutf8(VECTOR_ELT(opts_brkiter, i), + Rf_ScalarLogical(FALSE), + Rf_ScalarLogical(FALSE))); + PROTECT(curval = stri_prepare_arg_string_1(curval2, "type")); if (STRING_ELT(curval, i) == NA_STRING) { UNPROTECT(1); Rf_error(MSG__INCORRECT_MATCH_OPTION, "type"); } - brkiter_cur = stri__match_arg(CHAR(STRING_ELT(curval, i)), type_opts); - UNPROTECT(1); + const char* curval3 = CHAR(STRING_ELT(curval, i)); + this->rules = UnicodeString::fromUTF8(curval3); + brkiter_cur = stri__match_arg(curval3, type_opts); + UNPROTECT(2); break; } } @@ -85,21 +93,25 @@ } switch (brkiter_cur) { - case 0: // character - this->type = UBRK_CHARACTER; - break; - case 1: // line_break - this->type = UBRK_LINE; - break; - case 2: // sentence - this->type = UBRK_SENTENCE; - break; - case 3: // word - this->type = UBRK_WORD; - break; - default: - Rf_error(MSG__INCORRECT_MATCH_OPTION, "type"); // error() allowed here - break; + case 0: // character + this->type = UBRK_CHARACTER; + this->rules = UnicodeString(); + break; + case 1: // line_break + this->type = UBRK_LINE; + this->rules = UnicodeString(); + break; + case 2: // sentence + this->type = UBRK_SENTENCE; + this->rules = UnicodeString(); + break; + case 3: // word + this->type = UBRK_WORD; + this->rules = 
UnicodeString(); + break; + default: + // do nothing - custom rules specified + break; } } @@ -130,7 +142,8 @@ Rf_error(MSG__INCORRECT_BRKITER_OPTION_SPEC); // error() allowed here const char* curname = CHAR(STRING_ELT(names, i)); if (!strcmp(curname, "locale")) { - this->locale = stri__prepare_arg_locale(VECTOR_ELT(opts_brkiter, i), "locale", true); /* this is R_alloc'ed */ + this->locale = stri__prepare_arg_locale(VECTOR_ELT(opts_brkiter, i), + "locale", true); // this is R_alloc'ed return; } } diff -Nru r-cran-stringi-1.1.5/src/stri_brkiter.h r-cran-stringi-1.1.6/src/stri_brkiter.h --- r-cran-stringi-1.1.5/src/stri_brkiter.h 2017-04-07 09:50:48.000000000 +0000 +++ r-cran-stringi-1.1.6/src/stri_brkiter.h 2017-11-10 13:42:05.000000000 +0000 @@ -47,14 +47,17 @@ * @version 0.4-1 (Marek Gagolewski, 2014-12-02) * * @version 1.1.3 (Marek Gagolewski, 2017-01-07) UBRK_COUNT deprecated + * + * @version 1.1.6 (Marek Gagolewski, 2017-04-22) Add support for RBBI */ class StriBrkIterOptions { protected: - const char* locale; // R_alloc'd + const char* locale; // R_alloc'd + UnicodeString rules; UBreakIteratorType type; - int32_t* skip_rules; // R_alloc'd - R_len_t skip_size; // number of elements in skip_rules + int32_t* skip_rules; // R_alloc'd + R_len_t skip_size; // number of elements in skip_rules private: @@ -95,6 +98,8 @@ * @version 0.4-1 (Marek Gagolewski, 2014-12-02) separate class * * @version 1.1.3 (Marek Gagolewski, 2017-01-07) UBRK_COUNT deprecated + * + * @version 1.1.6 (Marek Gagolewski, 2017-04-22) Add support for RBBI */ class StriUBreakIterator : public StriBrkIterOptions { private: @@ -106,7 +111,14 @@ if (uiterator) throw StriException("!NDEBUG: StriUBreakIterator::open()"); #endif UErrorCode status = U_ZERO_ERROR; - switch (type) { + if (!rules.isEmpty()) { + UParseError parseErr; + uiterator = ubrk_openRules(rules.getTerminatedBuffer(), + -1/*null-terminated*/, NULL, 0, + &parseErr, &status); + } + else { + switch (type) { case UBRK_CHARACTER: // 
character uiterator = ubrk_open(UBRK_CHARACTER, locale, NULL, 0, &status); break; @@ -121,6 +133,7 @@ break; default: throw StriException(MSG__INTERNAL_ERROR); + } } STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } @@ -179,6 +192,8 @@ * * @version 0.4-1 (Marek Gagolewski, 2014-12-02) * separate class + * + * @version 1.1.6 (Marek Gagolewski, 2017-04-22) Add support for RBBI */ class StriRuleBasedBreakIterator : public StriBrkIterOptions { private: @@ -200,7 +215,14 @@ void open() { UErrorCode status = U_ZERO_ERROR; Locale loc = Locale::createFromName(locale); - switch (type) { + if (!rules.isEmpty()) { + UParseError parseErr; + rbiterator = (BreakIterator*) new RuleBasedBreakIterator( + UnicodeString(rules), parseErr, status + ); + } + else { + switch (type) { case UBRK_CHARACTER: // character rbiterator = (BreakIterator*)BreakIterator::createCharacterInstance(loc, status); break; @@ -215,13 +237,9 @@ break; default: throw StriException(MSG__INTERNAL_ERROR); + } } STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) - -// UnicodeString s = rbiterator->getRules(); -// std::string s2; -// s.toUTF8String(s2); -// printf("%s\n", s2.c_str()); } bool ignoreBoundary(); diff -Nru r-cran-stringi-1.1.5/src/stri_collator.cpp r-cran-stringi-1.1.6/src/stri_collator.cpp --- r-cran-stringi-1.1.5/src/stri_collator.cpp 2017-04-07 09:50:48.000000000 +0000 +++ r-cran-stringi-1.1.6/src/stri_collator.cpp 2017-11-10 13:42:05.000000000 +0000 @@ -61,6 +61,9 @@ * * @version 0.4-1 (Marek Gagolewski, 2014-12-08) * #23: add `overlap` option + * + * @version 1.1.6 (Marek Gagolewski, 2017-11-10) + * PROTECT STRING_ELT(names, i) */ UCollator* stri__ucol_open(SEXP opts_collator) { @@ -96,7 +99,7 @@ if (STRING_ELT(names, i) == NA_STRING) Rf_error(MSG__INCORRECT_COLLATOR_OPTION_SPEC); // error() allowed here - const char* curname = CHAR(STRING_ELT(names, i)); + const char* curname = stri__copy_string_Ralloc(STRING_ELT(names, i), "curname"); /* this is 
R_alloc'ed */ if (!strcmp(curname, "locale")) { opt_LOCALE = stri__prepare_arg_locale(VECTOR_ELT(opts_collator, i), "locale", true); /* this is R_alloc'ed */ } else if (!strcmp(curname, "strength")) { diff -Nru r-cran-stringi-1.1.5/src/stri_container_bytesearch.cpp r-cran-stringi-1.1.6/src/stri_container_bytesearch.cpp --- r-cran-stringi-1.1.5/src/stri_container_bytesearch.cpp 2017-04-07 09:50:48.000000000 +0000 +++ r-cran-stringi-1.1.6/src/stri_container_bytesearch.cpp 2017-11-10 13:42:05.000000000 +0000 @@ -385,6 +385,9 @@ * * @version 0.4-1 (Marek Gagolewski, 2014-12-08) * add `overlap` option + * + * @version 1.1.6 (Marek Gagolewski, 2017-11-10) + * PROTECT STRING_ELT(names, i) */ uint32_t StriContainerByteSearch::getByteSearchFlags(SEXP opts_fixed, bool allow_overlap) { @@ -404,8 +407,7 @@ if (STRING_ELT(names, i) == NA_STRING) Rf_error(MSG__FIXED_CONFIG_FAILED); // error() call allowed here - const char* curname = CHAR(STRING_ELT(names, i)); - + const char* curname = stri__copy_string_Ralloc(STRING_ELT(names, i), "curname"); /* this is R_alloc'ed */ if (!strcmp(curname, "case_insensitive")) { bool val = stri__prepare_arg_logical_1_notNA(VECTOR_ELT(opts_fixed, i), "case_insensitive"); if (val) flags |= BYTESEARCH_CASE_INSENSITIVE; diff -Nru r-cran-stringi-1.1.5/src/stri_container_regex.cpp r-cran-stringi-1.1.6/src/stri_container_regex.cpp --- r-cran-stringi-1.1.5/src/stri_container_regex.cpp 2017-04-07 09:50:48.000000000 +0000 +++ r-cran-stringi-1.1.6/src/stri_container_regex.cpp 2017-11-10 13:42:05.000000000 +0000 @@ -140,6 +140,9 @@ * * @version 0.3-1 (Marek Gagolewski, 2014-11-05) * Disallow NA options + * + * @version 1.1.6 (Marek Gagolewski, 2017-11-10) + * PROTECT STRING_ELT(names, i) */ uint32_t StriContainerRegexPattern::getRegexFlags(SEXP opts_regex) { @@ -159,8 +162,7 @@ if (STRING_ELT(names, i) == NA_STRING) Rf_error(MSG__REGEXP_CONFIG_FAILED); // error() call allowed here - const char* curname = CHAR(STRING_ELT(names, i)); - + const char* curname = 
stri__copy_string_Ralloc(STRING_ELT(names, i), "curname"); /* this is R_alloc'ed */ if (!strcmp(curname, "case_insensitive")) { bool val = stri__prepare_arg_logical_1_notNA(VECTOR_ELT(opts_regex, i), "case_insensitive"); if (val) flags |= UREGEX_CASE_INSENSITIVE; diff -Nru r-cran-stringi-1.1.5/src/stri_container_utf16.cpp r-cran-stringi-1.1.6/src/stri_container_utf16.cpp --- r-cran-stringi-1.1.5/src/stri_container_utf16.cpp 2017-04-07 09:50:48.000000000 +0000 +++ r-cran-stringi-1.1.6/src/stri_container_utf16.cpp 2017-11-10 13:42:05.000000000 +0000 @@ -70,6 +70,9 @@ * @param rstr R character vector * @param nrecycle extend length [vectorization] * @param shallowrecycle will \code{this->str} be ever modified? + * + * @version 1.0.6 (Marek Gagolewski, 2017-05-25) + * #270 latin-1 is windows-1252 on Windows */ StriContainerUTF16::StriContainerUTF16(SEXP rstr, R_len_t _nrecycle, bool _shallowrecycle) { @@ -94,7 +97,12 @@ code to handle several important encodings algorithmically: US-ASCII, ISO-8859-1, UTF-7/8/16/32, SCSU, BOCU-1, CESU-8, and IMAP-mailbox-name */ StriUcnv ucnvASCII("US-ASCII"); +#if defined(_WIN32) || defined(_WIN64) + // #270: latin-1 is windows-1252 on Windows + StriUcnv ucnvLatin1("WINDOWS-1252"); +#else StriUcnv ucnvLatin1("ISO-8859-1"); +#endif StriUcnv ucnvNative(NULL); for (R_len_t i=0; istr} be ever modified? 
+ * + * @version 1.0.6 (Marek Gagolewski, 2017-05-25) + * #270 latin-1 is windows-1252 on Windows */ StriContainerUTF8::StriContainerUTF8(SEXP rstr, R_len_t _nrecycle, bool _shallowrecycle) { @@ -76,7 +79,12 @@ code to handle several important encodings algorithmically: US-ASCII, ISO-8859-1, UTF-7/8/16/32, SCSU, BOCU-1, CESU-8, and IMAP-mailbox-name */ // for conversion from non-utf8/ascii native charsets: +#if defined(_WIN32) || defined(_WIN64) + // #270: latin-1 is windows-1252 on Windows + StriUcnv ucnvLatin1("WINDOWS-1252"); +#else StriUcnv ucnvLatin1("ISO-8859-1"); +#endif StriUcnv ucnvNative(NULL); R_len_t outbufsize = -1; String8buf outbuf(0); diff -Nru r-cran-stringi-1.1.5/src/stri_escape.cpp r-cran-stringi-1.1.6/src/stri_escape.cpp --- r-cran-stringi-1.1.5/src/stri_escape.cpp 2017-04-07 09:50:48.000000000 +0000 +++ r-cran-stringi-1.1.6/src/stri_escape.cpp 2017-11-10 13:42:05.000000000 +0000 @@ -48,6 +48,9 @@ * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc + * + * @version 1.1.6 (Steve Grubb, 2017-07-20) + * if ((char)c >= 32 || (char)c <= 126) should be && */ SEXP stri_escape_unicode(SEXP str) { @@ -83,7 +86,7 @@ U8_NEXT(str_cur_s, j, str_cur_n, c); if (c < 0) throw StriException(MSG__INVALID_UTF8); - else if ((char)c >= 32 || (char)c <= 126) + else if ((char)c >= 32 && (char)c <= 126) bufsize += 1; else if (c <= 0xff) bufsize += 6; // for \a, \n this will be overestimated @@ -115,7 +118,7 @@ case 0x27: out.append("\\'"); break; case 0x5c: out.append("\\\\"); break; default: - if ((char)c >= 32 || (char)c <= 126) // printable characters + if ((char)c >= 32 && (char)c <= 126) // printable characters out.append(1, (char)c); else { sprintf(buf, "\\u%4.4x", (uint16_t)c); diff -Nru r-cran-stringi-1.1.5/src/stri_prepare_arg.cpp r-cran-stringi-1.1.6/src/stri_prepare_arg.cpp --- r-cran-stringi-1.1.5/src/stri_prepare_arg.cpp 2017-04-07 09:50:48.000000000 +0000 +++ 
r-cran-stringi-1.1.6/src/stri_prepare_arg.cpp 2017-11-10 13:42:05.000000000 +0000 @@ -216,7 +216,7 @@ { SEXP call; PROTECT(call = Rf_lang2(Rf_install("as.character"), x)); - PROTECT(x = Rf_eval(call, R_GlobalEnv)); // this will mark it's encoding manually + PROTECT(x = Rf_eval(call, R_GlobalEnv)); // this will mark its encoding manually UNPROTECT(2); return x; } @@ -224,7 +224,7 @@ { SEXP call; PROTECT(call = Rf_lang2(Rf_install("as.character"), x)); - PROTECT(x = Rf_eval(call, R_GlobalEnv)); // this will mark it's encoding manually + PROTECT(x = Rf_eval(call, R_GlobalEnv)); // this will mark its encoding manually UNPROTECT(2); return x; } @@ -276,7 +276,7 @@ { SEXP call; PROTECT(call = Rf_lang2(Rf_install("as.character"), x)); - PROTECT(x = Rf_eval(call, R_GlobalEnv)); // this will mark it's encoding manually + PROTECT(x = Rf_eval(call, R_GlobalEnv)); // this will mark its encoding manually PROTECT(x = Rf_coerceVector(x, REALSXP)); UNPROTECT(3); return x; @@ -285,7 +285,7 @@ { SEXP call; PROTECT(call = Rf_lang2(Rf_install("as.double"), x)); - PROTECT(x = Rf_eval(call, R_GlobalEnv)); // this will mark it's encoding manually + PROTECT(x = Rf_eval(call, R_GlobalEnv)); // this will mark its encoding manually UNPROTECT(2); return x; } @@ -377,7 +377,7 @@ { SEXP call; PROTECT(call = Rf_lang2(Rf_install("as.character"), x)); - PROTECT(x = Rf_eval(call, R_GlobalEnv)); // this will mark it's encoding manually + PROTECT(x = Rf_eval(call, R_GlobalEnv)); // this will mark its encoding manually PROTECT(x = Rf_coerceVector(x, INTSXP)); UNPROTECT(3); return x; @@ -386,7 +386,7 @@ { SEXP call; PROTECT(call = Rf_lang2(Rf_install("as.integer"), x)); - PROTECT(x = Rf_eval(call, R_GlobalEnv)); // this will mark it's encoding manually + PROTECT(x = Rf_eval(call, R_GlobalEnv)); // this will mark its encoding manually UNPROTECT(2); return x; } @@ -436,7 +436,7 @@ { SEXP call; PROTECT(call = Rf_lang2(Rf_install("as.character"), x)); - PROTECT(x = Rf_eval(call, R_GlobalEnv)); // this will 
mark it's encoding manually + PROTECT(x = Rf_eval(call, R_GlobalEnv)); // this will mark its encoding manually PROTECT(x = Rf_coerceVector(x, LGLSXP)); UNPROTECT(3); return x; @@ -445,7 +445,7 @@ { SEXP call; PROTECT(call = Rf_lang2(Rf_install("as.logical"), x)); - PROTECT(x = Rf_eval(call, R_GlobalEnv)); // this will mark it's encoding manually + PROTECT(x = Rf_eval(call, R_GlobalEnv)); // this will mark its encoding manually UNPROTECT(2); return x; } @@ -492,7 +492,7 @@ { SEXP call; PROTECT(call = Rf_lang2(Rf_install("as.character"), x)); - PROTECT(x = Rf_eval(call, R_GlobalEnv)); // this will mark it's encoding manually + PROTECT(x = Rf_eval(call, R_GlobalEnv)); // this will mark its encoding manually PROTECT(x = Rf_coerceVector(x, RAWSXP)); UNPROTECT(3); return x; @@ -501,7 +501,7 @@ { SEXP call; PROTECT(call = Rf_lang2(Rf_install("as.raw"), x)); - PROTECT(x = Rf_eval(call, R_GlobalEnv)); // this will mark it's encoding manually + PROTECT(x = Rf_eval(call, R_GlobalEnv)); // this will mark its encoding manually UNPROTECT(2); return x; } @@ -538,11 +538,41 @@ if ((SEXP*)argname == (SEXP*)R_NilValue) argname = ""; - PROTECT(x = stri_prepare_arg_string(x, argname)); + int nprotect = 0; + if (Rf_isFactor(x)) + { + SEXP call; + PROTECT(call = Rf_lang2(Rf_install("as.character"), x)); + PROTECT(x = Rf_eval(call, R_GlobalEnv)); // this will mark its encoding manually + nprotect = 2; + } + else if (Rf_isVectorList(x) || isObject(x)) + { + SEXP call; + PROTECT(call = Rf_lang2(Rf_install("as.character"), x)); + PROTECT(x = Rf_eval(call, R_GlobalEnv)); // this will mark its encoding manually + nprotect = 2; + } + else if ((bool)isString(x)) + nprotect = 0; + else if (Rf_isVectorAtomic(x) || isNull(x)) { + PROTECT(x = Rf_coerceVector(x, STRSXP)); + nprotect = 1; + } + else if ((bool)isSymbol(x)) { + PROTECT(x = Rf_ScalarString(PRINTNAME(x))); + nprotect = 1; + } + else { + Rf_error(MSG__ARG_EXPECTED_STRING, argname); // allowed here + return x; // avoid compiler warning + } 
+ + R_len_t nx = LENGTH(x); if (nx <= 0) { - UNPROTECT(1); + UNPROTECT(nprotect); Rf_error(MSG__ARG_EXPECTED_NOT_EMPTY, argname); // allowed here // won't come here anyway return x; // avoid compiler warning @@ -551,12 +581,13 @@ Rf_warning(MSG__ARG_EXPECTED_1_STRING, argname); SEXP xold = x; PROTECT(x = Rf_allocVector(STRSXP, 1)); + nprotect++; SET_STRING_ELT(x, 0, STRING_ELT(xold, 0)); - UNPROTECT(2); + UNPROTECT(nprotect); return x; } else { // if (nx == 1) - UNPROTECT(1); + UNPROTECT(nprotect); return x; } } @@ -585,11 +616,37 @@ if ((SEXP*)argname == (SEXP*)R_NilValue) argname = ""; - PROTECT(x = stri_prepare_arg_double(x, argname)); + int nprotect = 0; + if (Rf_isFactor(x)) + { + SEXP call; + PROTECT(call = Rf_lang2(Rf_install("as.character"), x)); + PROTECT(x = Rf_eval(call, R_GlobalEnv)); // this will mark its encoding manually + PROTECT(x = Rf_coerceVector(x, REALSXP)); + nprotect = 3; + } + else if (Rf_isVectorList(x) || isObject(x)) + { + SEXP call; + PROTECT(call = Rf_lang2(Rf_install("as.double"), x)); + PROTECT(x = Rf_eval(call, R_GlobalEnv)); // this will mark its encoding manually + nprotect = 2; + } + else if ((bool)isReal(x)) + nprotect = 0; + else if (Rf_isVectorAtomic(x) || isNull(x)) { + PROTECT(x = Rf_coerceVector(x, REALSXP)); + nprotect = 1; + } + else { + Rf_error(MSG__ARG_EXPECTED_NUMERIC, argname); // allowed here + return x; // avoid compiler warning + } + R_len_t nx = LENGTH(x); if (nx <= 0) { - UNPROTECT(1); + UNPROTECT(nprotect); Rf_error(MSG__ARG_EXPECTED_NOT_EMPTY, argname); // allowed here // won't come here anyway return x; // avoid compiler warning @@ -598,12 +655,13 @@ Rf_warning(MSG__ARG_EXPECTED_1_NUMERIC, argname); double x0 = REAL(x)[0]; PROTECT(x = Rf_allocVector(REALSXP, 1)); + nprotect++; REAL(x)[0] = x0; - UNPROTECT(2); + UNPROTECT(nprotect); return x; } else {// if (nx == 1) - UNPROTECT(1); + UNPROTECT(nprotect); return x; } } @@ -632,11 +690,37 @@ if ((SEXP*)argname == (SEXP*)R_NilValue) argname = ""; - PROTECT(x = 
stri_prepare_arg_integer(x, argname)); + int nprotect = 0; + if (Rf_isFactor(x)) // factors must be checked first (as they are currently represented as integer vectors) + { + SEXP call; + PROTECT(call = Rf_lang2(Rf_install("as.character"), x)); + PROTECT(x = Rf_eval(call, R_GlobalEnv)); // this will mark its encoding manually + PROTECT(x = Rf_coerceVector(x, INTSXP)); + nprotect = 3; + } + else if (Rf_isVectorList(x) || isObject(x)) + { + SEXP call; + PROTECT(call = Rf_lang2(Rf_install("as.integer"), x)); + PROTECT(x = Rf_eval(call, R_GlobalEnv)); // this will mark its encoding manually + nprotect = 2; + } + else if (Rf_isInteger(x)) + nprotect = 0; + else if (Rf_isVectorAtomic(x) || isNull(x)) { + PROTECT(x = Rf_coerceVector(x, INTSXP)); + nprotect = 1; + } + else { + Rf_error(MSG__ARG_EXPECTED_INTEGER, argname); //allowed here + return x; // avoid compiler warning + } + R_len_t nx = LENGTH(x); if (nx <= 0) { - UNPROTECT(1); + UNPROTECT(nprotect); Rf_error(MSG__ARG_EXPECTED_NOT_EMPTY, argname); // allowed here // won't come here anyway return x; // avoid compiler warning @@ -645,12 +729,13 @@ Rf_warning(MSG__ARG_EXPECTED_1_INTEGER, argname); int x0 = INTEGER(x)[0]; PROTECT(x = Rf_allocVector(INTSXP, 1)); + nprotect++; INTEGER(x)[0] = x0; - UNPROTECT(2); + UNPROTECT(nprotect); return x; } else { // if (nx == 1) - UNPROTECT(1); + UNPROTECT(nprotect); return x; } } @@ -676,14 +761,43 @@ */ SEXP stri_prepare_arg_logical_1(SEXP x, const char* argname) { + int nprotect = 0; + if ((SEXP*)argname == (SEXP*)R_NilValue) argname = ""; - PROTECT(x = stri_prepare_arg_logical(x, argname)); + if (Rf_isFactor(x)) + { + SEXP call; + PROTECT(call = Rf_lang2(Rf_install("as.character"), x)); + PROTECT(x = Rf_eval(call, R_GlobalEnv)); // this will mark its encoding manually + PROTECT(x = Rf_coerceVector(x, LGLSXP)); + nprotect = 3; + } + else if (Rf_isVectorList(x) || isObject(x)) + { + SEXP call; + PROTECT(call = Rf_lang2(Rf_install("as.logical"), x)); + PROTECT(x = Rf_eval(call, 
R_GlobalEnv)); // this will mark its encoding manually + nprotect = 2; + } + else if ((bool)isLogical(x)) { + nprotect = 0; + // do nothing + } + else if (Rf_isVectorAtomic(x) || isNull(x)) { + PROTECT(x = Rf_coerceVector(x, LGLSXP)); + nprotect = 1; + } + else { + Rf_error(MSG__ARG_EXPECTED_LOGICAL, argname); // allowed here + return x; // avoid compiler warning + } + R_len_t nx = LENGTH(x); if (nx <= 0) { - UNPROTECT(1); + UNPROTECT(nprotect); Rf_error(MSG__ARG_EXPECTED_NOT_EMPTY, argname); // allowed here // won't come here anyway return x; // avoid compiler warning @@ -692,12 +806,13 @@ Rf_warning(MSG__ARG_EXPECTED_1_LOGICAL, argname); int x0 = LOGICAL(x)[0]; PROTECT(x = Rf_allocVector(LGLSXP, 1)); + nprotect++; LOGICAL(x)[0] = x0; - UNPROTECT(2); + UNPROTECT(nprotect); return x; } else { // if (nx == 1) - UNPROTECT(1); + UNPROTECT(nprotect); return x; } } @@ -790,6 +905,36 @@ } +/** + * This is a helper function to avoid UNPROTECTED var names warning + * when playing with CHARSXP directly + * + * @param x an R STRING object (from STRING_ELT(charactervector, index)) + * @return an Ralloc'd character string + * + * @version 1.1.6 (Marek Gagolewski, 2017-11-10) + */ +const char* stri__copy_string_Ralloc(SEXP x, const char* argname) +{ + PROTECT(x); + if (x == NA_STRING) { + UNPROTECT(1); + Rf_error(MSG__ARG_EXPECTED_NOT_NA, argname); // allowed here + } + + const char* ret_tmp = (const char*)CHAR(x); // ret may be gc'ed + size_t ret_n = strlen(ret_tmp); + /* R_alloc == Here R will reclaim the memory at the end of the call to .Call */ + char* ret = R_alloc(ret_n+1, (int)sizeof(char)); + if (!ret) { + UNPROTECT(1); + Rf_error(MSG__MEM_ALLOC_ERROR); + } + memcpy(ret, ret_tmp, ret_n+1); + UNPROTECT(1); + return ret; +} + /** Prepare double argument - one value, not NA [no re-encoding done!!!] 
* * If there are 0 elements -> error diff -Nru r-cran-stringi-1.1.5/src/stri_search_boundaries_extract.cpp r-cran-stringi-1.1.6/src/stri_search_boundaries_extract.cpp --- r-cran-stringi-1.1.5/src/stri_search_boundaries_extract.cpp 2017-04-07 09:50:48.000000000 +0000 +++ r-cran-stringi-1.1.6/src/stri_search_boundaries_extract.cpp 2017-11-10 13:42:05.000000000 +0000 @@ -134,7 +134,7 @@ StriBrkIterOptions opts_brkiter2(opts_brkiter, "line_break"); STRI__ERROR_HANDLER_BEGIN(2) - R_len_t str_length = LENGTH(str); + R_len_t str_length = LENGTH(str); StriContainerUTF8_indexable str_cont(str, str_length); StriRuleBasedBreakIterator brkiter(opts_brkiter2); @@ -169,7 +169,7 @@ for (R_len_t j = 0; iter != occurrences.end(); ++iter, ++j) { pair curo = *iter; SET_STRING_ELT(cur_res, j, - Rf_mkCharLenCE(str_cur_s+curo.first, curo.second-curo.first, CE_UTF8)); + Rf_mkCharLenCE(str_cur_s+curo.first, curo.second-curo.first, CE_UTF8)); } SET_VECTOR_ELT(ret, i, cur_res); STRI__UNPROTECT(1); @@ -177,14 +177,14 @@ if (LOGICAL(simplify)[0] == NA_LOGICAL) { STRI__PROTECT(ret = stri_list2matrix(ret, Rf_ScalarLogical(TRUE), - stri__vector_NA_strings(1), Rf_ScalarInteger(0))) + stri__vector_NA_strings(1), Rf_ScalarInteger(0))) } else if (LOGICAL(simplify)[0]) { STRI__PROTECT(ret = stri_list2matrix(ret, Rf_ScalarLogical(TRUE), - stri__vector_empty_strings(1), Rf_ScalarInteger(0))) + stri__vector_empty_strings(1), Rf_ScalarInteger(0))) } STRI__UNPROTECT_ALL - return ret; + return ret; STRI__ERROR_HANDLER_END({/* no-op */}) } diff -Nru r-cran-stringi-1.1.5/src/stri_stringi.h r-cran-stringi-1.1.6/src/stri_stringi.h --- r-cran-stringi-1.1.5/src/stri_stringi.h 2017-04-07 09:50:48.000000000 +0000 +++ r-cran-stringi-1.1.6/src/stri_stringi.h 2017-11-10 13:42:05.000000000 +0000 @@ -66,6 +66,7 @@ int stri__width_string(const char* str_cur_s, int str_cur_n); // prepare_arg.cpp: +const char* stri__copy_string_Ralloc(SEXP, const char* argname); const char* stri__prepare_arg_string_1_notNA(SEXP x, const 
char* argname); double stri__prepare_arg_double_1_notNA(SEXP x, const char* argname); int stri__prepare_arg_integer_1_notNA(SEXP x, const char* argname); diff -Nru r-cran-stringi-1.1.5/src/stri_sub.cpp r-cran-stringi-1.1.6/src/stri_sub.cpp --- r-cran-stringi-1.1.5/src/stri_sub.cpp 2017-04-07 09:50:48.000000000 +0000 +++ r-cran-stringi-1.1.6/src/stri_sub.cpp 2017-11-10 13:42:05.000000000 +0000 @@ -51,6 +51,7 @@ from_ismatrix = false; /* it's a column vector */ else if (INTEGER(t)[1] > 2) { /* error() is allowed here */ + UNPROTECT(1); // t Rf_error(MSG__ARG_EXPECTED_MATRIX_WITH_GIVEN_COLUMNS, "from", 2); } UNPROTECT(1); // t @@ -293,13 +294,10 @@ { R_len_t cur_from = from_tab[i % from_len]; R_len_t cur_to = (to_tab)?to_tab[i % to_len]:length_tab[i % length_len]; - if (str_cont.isNA(i) || value_cont.isNA(i)) { - SET_STRING_ELT(ret, i, NA_STRING); - continue; - } - if (cur_from == NA_INTEGER || cur_to == NA_INTEGER) { + if (str_cont.isNA(i) || value_cont.isNA(i) || cur_from == NA_INTEGER || cur_to == NA_INTEGER) { if (omit_na_1) { + // if str_conf is NA, this will be NA_STRING as well. SET_STRING_ELT(ret, i, str_cont.toR(i)); } else { diff -Nru r-cran-stringi-1.1.5/src/stri_ucnv.h r-cran-stringi-1.1.6/src/stri_ucnv.h --- r-cran-stringi-1.1.5/src/stri_ucnv.h 2017-04-07 09:50:48.000000000 +0000 +++ r-cran-stringi-1.1.6/src/stri_ucnv.h 2017-11-10 13:42:05.000000000 +0000 @@ -42,6 +42,9 @@ * A class to manage an encoding converter * * @version 0.2-1 (Marek Gagolewski, 2014-03-28) + * + * @version 1.0.6 (Marek Gagolewski, 2017-05-25) + * #270 latin-1 is windows-1252 on Windows */ class StriUcnv { @@ -171,7 +174,12 @@ m_is8bit = false; return CE_UTF8; } +#if defined(_WIN32) || defined(_WIN64) + // #270: latin-1 is windows-1252 on Windows + else if (!strcmp(ucnv_name, "windows-1252") || !strcmp(ucnv_name, "ibm-5348_P100-1997")) { +#else else if (!strcmp(ucnv_name, "ISO-8859-1")) { +#endif m_is8bit = true; m_isutf8 = false; return CE_LATIN1;