diff -Nru miller-5.3.0/c/cli/mlrcli.c miller-5.4.0/c/cli/mlrcli.c --- miller-5.3.0/c/cli/mlrcli.c 2018-01-06 22:49:24.000000000 +0000 +++ miller-5.4.0/c/cli/mlrcli.c 2018-10-14 20:17:52.000000000 +0000 @@ -38,14 +38,17 @@ // ---------------------------------------------------------------- static mapper_setup_t* mapper_lookup_table[] = { + &mapper_altkv_setup, &mapper_bar_setup, &mapper_bootstrap_setup, &mapper_cat_setup, &mapper_check_setup, + &mapper_clean_whitespace_setup, &mapper_count_distinct_setup, &mapper_count_similar_setup, &mapper_cut_setup, &mapper_decimate_setup, + &mapper_fill_down_setup, &mapper_filter_setup, &mapper_fraction_setup, &mapper_grep_setup, diff -Nru miller-5.3.0/c/cli/mlrcli.h miller-5.4.0/c/cli/mlrcli.h --- miller-5.3.0/c/cli/mlrcli.h 2018-01-06 22:49:24.000000000 +0000 +++ miller-5.4.0/c/cli/mlrcli.h 2018-10-14 20:17:52.000000000 +0000 @@ -13,6 +13,7 @@ #include "cli/json_array_ingest.h" #include "containers/lhmsll.h" #include "containers/lhmss.h" +#include // ---------------------------------------------------------------- typedef struct _genereator_opts_t { diff -Nru miller-5.3.0/c/containers/lhmsll.h miller-5.4.0/c/containers/lhmsll.h --- miller-5.3.0/c/containers/lhmsll.h 2018-01-06 22:49:24.000000000 +0000 +++ miller-5.4.0/c/containers/lhmsll.h 2018-10-14 20:17:52.000000000 +0000 @@ -29,9 +29,9 @@ typedef unsigned char lhmslle_state_t; typedef struct _lhmsll_t { - int num_occupied; - int num_freed; - int array_length; + int num_occupied; + int num_freed; + int array_length; lhmslle_t* entries; lhmslle_state_t* states; lhmslle_t* phead; @@ -40,7 +40,9 @@ // ---------------------------------------------------------------- lhmsll_t* lhmsll_alloc(); + lhmsll_t* lhmsll_copy(lhmsll_t* pmap); + void lhmsll_free(lhmsll_t* pmap); void lhmsll_put(lhmsll_t* pmap, char* key, int value, char free_flags); long long lhmsll_get(lhmsll_t* pmap, char* key); // caller must do lhmsll_has_key to check validity diff -Nru 
miller-5.3.0/c/containers/lhmss.c miller-5.4.0/c/containers/lhmss.c --- miller-5.3.0/c/containers/lhmss.c 2018-01-06 22:49:24.000000000 +0000 +++ miller-5.4.0/c/containers/lhmss.c 2018-10-14 20:17:52.000000000 +0000 @@ -149,6 +149,8 @@ if (pmap->states[index] == OCCUPIED) { // Existing key found in chain; put value. + if (pe->free_flags & FREE_ENTRY_KEY) + free(key); if (pe->free_flags & FREE_ENTRY_VALUE) free(pe->value); pe->value = value; diff -Nru miller-5.3.0/c/draft-release-notes.md miller-5.4.0/c/draft-release-notes.md --- miller-5.3.0/c/draft-release-notes.md 2018-01-06 22:49:24.000000000 +0000 +++ miller-5.4.0/c/draft-release-notes.md 2018-10-14 20:17:52.000000000 +0000 @@ -1,54 +1,90 @@ -## Features: +# New data-cleaning features, limited localtime support, and bugfixes -* [**Comment strings in data files:**](http://johnkerl.org/miller-releases/miller-5.3.0/doc/file-formats.html#Comments_in_data) `mlr --skip-comments` allows you to filter out input lines starting with `#`, for all file formats. Likewise, `mlr --skip-comments-with X` lets you specify the comment-string `X`. Comments are only supported at start of data line. `mlr --pass-comments` and `mlr --pass-comments-with X` allow you to forward comments to program output as they are read. +## Features: -* The [**count-similar**](http://johnkerl.org/miller-releases/miller-5.3.0/doc/reference-verbs.html#count-similar) -verb lets you compute cluster sizes by cluster labels. +* The new [**clean-whitespace**](http://johnkerl.org/miller/doc/reference-verbs.html#clean-whitespace) verb resolves +https://github.com/johnkerl/miller/issues/190 from @aborruso. 
+Along with the new functions +[**strip**](http://johnkerl.org/miller/doc/reference-dsl.html#strip), +[**lstrip**](http://johnkerl.org/miller/doc/reference-dsl.html#lstrip), +[**rstrip**](http://johnkerl.org/miller/doc/reference-dsl.html#rstrip), +[**collapse_whitespace**](http://johnkerl.org/miller/doc/reference-dsl.html#collapse_whitespace), and +[**clean_whitespace**](http://johnkerl.org/miller/doc/reference-dsl.html#clean_whitespace), there is +coarser-grained and finer-grained control over whitespace within field names and/or values. +See the linked-to documentation for examples. + +* The new [**altkv**](http://johnkerl.org/miller/doc/reference-verbs.html#altkv) verb resolves +https://github.com/johnkerl/miller/issues/184 which was originally opened via an email request. This supports mapping +value-lists such as `a,b,c,d` to alternating key-value pairs such as `a=b,c=d`. + +* The new [**fill-down**](http://johnkerl.org/miller/doc/reference-verbs.html#fill-down) verb resolves +https://github.com/johnkerl/miller/issues/189 +by +@aborruso. +See the linked-to documentation for examples. + +* The [**uniq**](http://johnkerl.org/miller/doc/reference-verbs.html#uniq) verb now has a **uniq -a** +which resolves https://github.com/johnkerl/miller/issues/168 from @sjackman. + +* The new +[**regextract**](http://johnkerl.org/miller/doc/reference-dsl.html#regextract) and +[**regextract_or_else**](http://johnkerl.org/miller/doc/reference-dsl.html#regextract_or_else) +functions resolve +https://github.com/johnkerl/miller/issues/183 +by @aborruso. +xxx. + +* The new [**ssub**](http://johnkerl.org/miller/doc/reference-dsl.html#ssub) function arises from +https://github.com/johnkerl/miller/issues/171 +by @dohse, as a simplified way to avoid escaping characters which are special to regular-expression parsers. + +* There are [**localtime**] functions in response to +https://github.com/johnkerl/miller/issues/170 by @sitaramc, as follows. 
However note that +as discussed on https://github.com/johnkerl/miller/issues/170 these do not undo one another in all +circumstances. +This is a non-issue for timezones which do not do DST. Otherwise, please use with disclaimers. + * [**localdate**](http://johnkerl.org/miller/doc/reference-dsl.html#localdate) + * [**localtime2sec**](http://johnkerl.org/miller/doc/reference-dsl.html#localtime2sec) + * [**sec2localdate**](http://johnkerl.org/miller/doc/reference-dsl.html#sec2localdate) + * [**sec2localtime**](http://johnkerl.org/miller/doc/reference-dsl.html#sec2localtime) + * [**strftime_local**](http://johnkerl.org/miller/doc/reference-dsl.html#strftime_local) + * [**strptime_local**](http://johnkerl.org/miller/doc/reference-dsl.html#strptime_local) + +## Builds: + +* Windows build-artifacts are now available in Appveyor at +https://ci.appveyor.com/project/johnkerl/miller/build/artifacts, and will be attached to this and future releases. This +resolves https://github.com/johnkerl/miller/issues/167, https://github.com/johnkerl/miller/issues/148, and +https://github.com/johnkerl/miller/issues/109. -* While Miller DSL arithmetic gracefully overflows from 64-integer to -double-precision float (see also -[**here**](http://johnkerl.org/miller/doc/reference.html#Arithmetic)), there -are now the **integer-preserving arithmetic operators** -[**`.+`**](http://johnkerl.org/miller-releases/miller-5.3.0/doc/reference-dsl.html#.+) -[**`.-`**](http://johnkerl.org/miller-releases/miller-5.3.0/doc/reference-dsl.html#.-) -[**`.*`**](http://johnkerl.org/miller-releases/miller-5.3.0/doc/reference-dsl.html#.*) -[**`./`**](http://johnkerl.org/miller-releases/miller-5.3.0/doc/reference-dsl.html#./) -[**`.//`**](http://johnkerl.org/miller-releases/miller-5.3.0/doc/reference-dsl.html#.//) -for those times when you want integer overflow. 
- -* There is a new [**bitcount**](http://johnkerl.org/miller-releases/miller-5.3.0/doc/reference-dsl.html#bitcount) function: for example, `echo x=0xf0000206 | mlr put '$y=bitcount($x)'` produces `x=0xf0000206,y=7`. - -* [**Issue 158**](https://github.com/johnkerl/miller/issues/158): `mlr -T` is -an alias for `--nidx --fs tab`, and `mlr -t` is an alias for `mlr ---tsvlite`. +* Travis builds at https://travis-ci.org/johnkerl/miller/builds now run on OSX as well as Linux. -* The mathematical constants **π and e have been renamed from `PI` and `E` to `M_PI` and `M_E`, respectively**. (It's annoying to get a syntax error when you try to define a variable named `E` in the DSL, when `A` through `D` work just fine.) This is a backward incompatibility, but not enough of us to justify calling this release Miller 6.0.0. +* An Ubuntu 17 build issue was fixed by @singalen on https://github.com/johnkerl/miller/issues/164. ## Documentation: -* As noted -[**here**](http://johnkerl.org/miller-releases/miller-5.3.0/doc/reference-dsl.html#A_note_on_the_complexity_of_Miller’s_expression_language), while Miller has its own DSL there will always be things better expressible in a general-purpose language. The new page -[**Sharing data with other languages**](http://johnkerl.org/miller-releases/miller-5.3.0/doc/data-sharing.html) shows how to seamlessly share data back and forth between **Miller, Ruby, and Python**. [**SQL-input examples**](http://johnkerl.org/miller-releases/miller-5.3.0/doc/10-min.html#SQL-input_examples) and [**SQL-output examples**](http://johnkerl.org/miller-releases/miller-5.3.0/doc/10-min.html#SQL-output_examples) contain detailed information the interplay between **Miller and SQL**. - -* [**Issue 150**](https://github.com/johnkerl/miller/issues/150) raised a -question about suppressing numeric conversion. 
This resulted in a new FAQ entry -[**How do I suppress numeric conversion?**](http://johnkerl.org/miller/doc/faq.html#How_do_I_suppress_numeric_conversion?), as well as the -longer-term follow-on [**issue 151**](https://github.com/johnkerl/miller/issues/151) which will make numeric conversion happen on a just-in-time basis. - -* To my surprise, **csvlite format options** weren’t listed in `mlr --help` or the manpage. This has been fixed. - -* Documentation for [**auxiliary commands**](http://johnkerl.org/miller-releases/miller-5.3.0/doc/reference.html#Auxiliary_commands) has been expanded, including within the [**manpage**](http://johnkerl.org/miller-releases/miller-5.3.0/doc/manpage.html). +* put/filter documentation was confusing as reported by @NikosAlexandris on +https://github.com/johnkerl/miller/issues/169. -## Bugfixes: +* The new FAQ entry +http://johnkerl.org/miller-releases/miller-head/doc/faq.html#How_to_rectangularize_after_joins_with_unpaired? +resolves +https://github.com/johnkerl/miller/issues/193 +by @aborruso. -* [**Issue 159**](https://github.com/johnkerl/miller/issues/159) fixes regex-match of literal dot. +* The new cookbook entry +http://johnkerl.org/miller/doc/cookbook.html#Options_for_dealing_with_duplicate_rows arises from +https://github.com/johnkerl/miller/issues/168 from @sjackman. -* [**Issue 160**](https://github.com/johnkerl/miller/issues/160) fixes out-of-memory cases for huge files. This is an old bug, as old as Miller, and is due to inadequate testing of huge-file cases. The problem is simple: Miller prefers memory-mapped I/O (using `mmap`) over `stdio` since `mmap` is fractionally faster. Yet as any processing (even `mlr cat`) steps through an input file, more and more pages are faulted in -- and, unfortunately, previous pages are not paged out once memory pressure increases. (This despite gallant attempts with `madvise`.) Once all processing is done, the memory is released; there is no leak per se. 
But the Miller process can crash before the entire file is read. The solution is equally simple: to prefer `stdio` over `mmap` for files over 4GB in size. (This 4GB threshold is tunable via the `--mmap-below` flag as described in the [manpage](http://johnkerl.org/miller-releases/miller-5.3.0/doc/manpage.html).) +* The unsparsify documentation had some words missing as reported by +@tst2005 on https://github.com/johnkerl/miller/issues/194. -* [**Issue 161**](https://github.com/johnkerl/miller/issues/161) fixes a CSV-parse error (with error message "unwrapped double quote at line 0") when a CSV file starts with the UTF-8 byte-order-mark ("BOM") sequence `0xef` `0xbb` `0xbf` and the header line has double-quoted fields. ([Release 5.2.0](https://github.com/johnkerl/miller/releases/tag/v5.2.0) introduced handling for UTF-8 BOMs, but missed the case of double-quoted header line.) +* There was a typo in the cookbook page http://johnkerl.org/miller/doc/cookbook.html#Full_field_renames_and_reassigns +as fixed by @tst2005 in https://github.com/johnkerl/miller/pull/192. -* [**Issue 162**](https://github.com/johnkerl/miller/issues/162) fixes a corner case doing multi-emit of aggregate variables when the first variable name is a typo. +## Bugfixes: -* The Miller JSON parser used to error with `Unable to parse JSON data: Line 1 column 0: Unexpected 0x00 when seeking value` on empty input, or input with trailing whitespace; this has been fixed. +* There was a memory leak for TSV-format files only as reported by @treynr on https://github.com/johnkerl/miller/issues/181. -There is no prebuilt Windows executable for this release; my apologies. +* Dollar signs in regular expressions were not being escaped properly as reported by @dohse on +https://github.com/johnkerl/miller/issues/171. 
diff -Nru miller-5.3.0/c/dsl/function_manager.c miller-5.4.0/c/dsl/function_manager.c --- miller-5.3.0/c/dsl/function_manager.c 2018-01-06 22:49:24.000000000 +0000 +++ miller-5.4.0/c/dsl/function_manager.c 2018-10-14 20:17:52.000000000 +0000 @@ -231,13 +231,21 @@ {FUNC_CLASS_STRING, ".", 2,0, "String concatenation."}, {FUNC_CLASS_STRING, "gsub", 3,0, "Example: '$name=gsub($name, \"old\", \"new\")'\n(replace all)."}, + {FUNC_CLASS_STRING, "regextract", 2,0, "Example: '$name=regextract($name, \"[A-Z]{3}[0-9]{2}\")'\n."}, + {FUNC_CLASS_STRING, "regextract_or_else", 3,0, "Example: '$name=regextract_or_else($name, \"[A-Z]{3}[0-9]{2}\", \"default\")'\n."}, {FUNC_CLASS_STRING, "strlen", 1,0, "String length."}, {FUNC_CLASS_STRING, "sub", 3,0, "Example: '$name=sub($name, \"old\", \"new\")'\n(replace once)."}, + {FUNC_CLASS_STRING, "ssub", 3,0, "Like sub but does no regexing. No characters are special."}, {FUNC_CLASS_STRING, "substr", 3,0, "substr(s,m,n) gives substring of s from 0-up position m to n \n" "inclusive. Negative indices -len .. -1 alias to 0 .. 
len-1."}, {FUNC_CLASS_STRING, "tolower", 1,0, "Convert string to lowercase."}, {FUNC_CLASS_STRING, "toupper", 1,0, "Convert string to uppercase."}, + {FUNC_CLASS_STRING, "lstrip", 1,0, "Strip leading whitespace from string."}, + {FUNC_CLASS_STRING, "rstrip", 1,0, "Strip trailing whitespace from string."}, + {FUNC_CLASS_STRING, "strip", 1,0, "Strip leading and trailing whitespace from string."}, + {FUNC_CLASS_STRING, "collapse_whitespace", 1,0, "Strip repeated whitespace from string."}, + {FUNC_CLASS_STRING, "clean_whitespace", 1,0, "Same as collapse_whitespace and strip."}, {FUNC_CLASS_MATH, "abs", 1,0, "Absolute value."}, {FUNC_CLASS_MATH, "acos", 1,0, "Inverse trigonometric cosine."}, @@ -295,11 +303,16 @@ "Formats floating-point seconds as in\nfsec2dhms(500000.25) = \"5d18h53m20.250000s\""}, {FUNC_CLASS_TIME, "fsec2hms", 1,0, "Formats floating-point seconds as in\nfsec2hms(5000.25) = \"01:23:20.250000\""}, + {FUNC_CLASS_TIME, "gmt2sec", 1,0, "Parses GMT timestamp as integer seconds since\nthe epoch."}, + {FUNC_CLASS_TIME, "localtime2sec", 1,0, "Parses local timestamp as integer seconds since\n" + "the epoch. Consults $TZ environment variable."}, + {FUNC_CLASS_TIME, "hms2fsec", 1,0, "Recovers floating-point seconds as in\nhms2fsec(\"01:23:20.250000\") = 5000.250000"}, {FUNC_CLASS_TIME, "hms2sec", 1,0, "Recovers integer seconds as in\nhms2sec(\"01:23:20\") = 5000"}, {FUNC_CLASS_TIME, "sec2dhms", 1,0, "Formats integer seconds as in sec2dhms(500000)\n= \"5d18h53m20s\""}, + {FUNC_CLASS_TIME, "sec2gmt", 1,0, "Formats seconds since epoch (integer part)\n" "as GMT timestamp, e.g. sec2gmt(1440768801.7) = \"2015-08-28T13:33:21Z\".\n" @@ -312,6 +325,19 @@ "Formats seconds since epoch (integer part)\n" "as GMT timestamp with year-month-date, e.g. sec2gmtdate(1440768801.7) = \"2015-08-28\".\n" "Leaves non-numbers as-is."}, + + {FUNC_CLASS_TIME, "sec2localtime", 1,0, "Formats seconds since epoch (integer part)\n" + "as local timestamp, e.g. 
sec2localtime(1440768801.7) = \"2015-08-28T13:33:21Z\".\n" + "Consults $TZ environment variable. Leaves non-numbers as-is."}, + {FUNC_CLASS_TIME, "sec2localtime", 2,0, + "Formats seconds since epoch as local timestamp with n\n" + "decimal places for seconds, e.g. sec2localtime(1440768801.7,1) = \"2015-08-28T13:33:21.7Z\".\n" + "Consults $TZ environment variable. Leaves non-numbers as-is."}, + {FUNC_CLASS_TIME, "sec2localdate", 1,0, + "Formats seconds since epoch (integer part)\n" + "as local timestamp with year-month-date, e.g. sec2localdate(1440768801.7) = \"2015-08-28\".\n" + "Consults $TZ environment variable. Leaves non-numbers as-is."}, + {FUNC_CLASS_TIME, "sec2hms", 1,0, "Formats integer seconds as in\n" "sec2hms(5000) = \"01:23:20\""}, @@ -320,12 +346,18 @@ "strftime(1440768801.7,\"%Y-%m-%dT%H:%M:%SZ\") = \"2015-08-28T13:33:21Z\", and\n" "strftime(1440768801.7,\"%Y-%m-%dT%H:%M:%3SZ\") = \"2015-08-28T13:33:21.700Z\".\n" "Format strings are as in the C library (please see \"man strftime\" on your system),\n" - "with the Miller-specific addition of \"%1S\" through \"%9S\" which format the seocnds\n" - "with 1 through 9 decimal places, respectively. (\"%S\" uses no decimal places.)"}, + "with the Miller-specific addition of \"%1S\" through \"%9S\" which format the seconds\n" + "with 1 through 9 decimal places, respectively. (\"%S\" uses no decimal places.)\n" + "See also strftime_local."}, + {FUNC_CLASS_TIME, "strftime_local", 2,0, + "Like strftime but consults the $TZ environment variable to get local time zone."}, {FUNC_CLASS_TIME, "strptime", 2,0, "Parses timestamp as floating-point seconds since the epoch,\n" "e.g. 
strptime(\"2015-08-28T13:33:21Z\",\"%Y-%m-%dT%H:%M:%SZ\") = 1440768801.000000,\n" - "and strptime(\"2015-08-28T13:33:21.345Z\",\"%Y-%m-%dT%H:%M:%SZ\") = 1440768801.345000."}, + "and strptime(\"2015-08-28T13:33:21.345Z\",\"%Y-%m-%dT%H:%M:%SZ\") = 1440768801.345000.\n" + "See also strptime_local."}, + {FUNC_CLASS_TIME, "strptime_local", 2,0, + "Like strptime, but consults $TZ environment variable to find and use local timezone."}, {FUNC_CLASS_TIME, "systime", 0,0, "Floating-point seconds since the epoch,\n" "e.g. 1440768801.748936." }, @@ -468,7 +500,7 @@ // More flexibly, I'd have a list of arities supported by each // function. But this is overkill: there are unary and binary minus and sec2gmt, // and everything else has a single arity. - if (streq(function_name, "-") || streq(function_name, "sec2gmt")) { + if (streq(function_name, "-") || streq(function_name, "sec2gmt") || streq(function_name, "sec2localtime")) { fprintf(stderr, "%s: Function named \"%s\" takes one argument or two; got %d.\n", MLR_GLOBALS.bargv0, function_name, user_provided_arity); } else if (*pvariadic) { @@ -933,11 +965,16 @@ mlr_dsl_ast_node_t* parg2_node = pnode->pchildren->phead->pnext->pvvalue; int type2 = parg2_node->type; - if ((streq(function_name, "=~") || streq(function_name, "!=~")) && type2 == MD_AST_NODE_TYPE_STRING_LITERAL) { + int is_regexy = + streq(function_name, "=~") || + streq(function_name, "!=~") || + streq(function_name, "regextract"); + + if (is_regexy && type2 == MD_AST_NODE_TYPE_STRING_LITERAL) { rval_evaluator_t* parg1 = rval_evaluator_alloc_from_ast(parg1_node, pfmgr, type_inferencing, context_flags); pevaluator = fmgr_alloc_evaluator_from_binary_regex_arg2_func_name(function_name, parg1, parg2_node->text, FALSE); - } else if ((streq(function_name, "=~") || streq(function_name, "!=~")) && type2 == MD_AST_NODE_TYPE_REGEXI) { + } else if (is_regexy && type2 == MD_AST_NODE_TYPE_REGEXI) { rval_evaluator_t* parg1 = rval_evaluator_alloc_from_ast(parg1_node, pfmgr, 
type_inferencing, context_flags); pevaluator = fmgr_alloc_evaluator_from_binary_regex_arg2_func_name(function_name, parg1, parg2_node->text, TYPE_INFER_STRING_FLOAT_INT); @@ -956,14 +993,19 @@ mlr_dsl_ast_node_t* parg3_node = pnode->pchildren->phead->pnext->pnext->pvvalue; int type2 = parg2_node->type; - if ((streq(function_name, "sub") || streq(function_name, "gsub")) && type2 == MD_AST_NODE_TYPE_STRING_LITERAL) { + int is_regexy = + streq(function_name, "sub") || + streq(function_name, "gsub") || + streq(function_name, "regextract_or_else"); + + if (is_regexy && type2 == MD_AST_NODE_TYPE_STRING_LITERAL) { // sub/gsub-regex special case: rval_evaluator_t* parg1 = rval_evaluator_alloc_from_ast(parg1_node, pfmgr, type_inferencing, context_flags); rval_evaluator_t* parg3 = rval_evaluator_alloc_from_ast(parg3_node, pfmgr, type_inferencing, context_flags); pevaluator = fmgr_alloc_evaluator_from_ternary_regex_arg2_func_name(function_name, parg1, parg2_node->text, FALSE, parg3); - } else if ((streq(function_name, "sub") || streq(function_name, "gsub")) && type2 == MD_AST_NODE_TYPE_REGEXI) { + } else if (is_regexy && type2 == MD_AST_NODE_TYPE_REGEXI) { // sub/gsub-regex special case: rval_evaluator_t* parg1 = rval_evaluator_alloc_from_ast(parg1_node, pfmgr, type_inferencing, context_flags); rval_evaluator_t* parg3 = rval_evaluator_alloc_from_ast(parg3_node, pfmgr, type_inferencing, context_flags); @@ -1133,6 +1175,7 @@ } else if (streq(fnnm, "fsec2dhms")) { return rval_evaluator_alloc_from_s_f_func(s_f_fsec2dhms_func, parg1); } else if (streq(fnnm, "fsec2hms")) { return rval_evaluator_alloc_from_s_f_func(s_f_fsec2hms_func, parg1); } else if (streq(fnnm, "gmt2sec")) { return rval_evaluator_alloc_from_i_s_func(i_s_gmt2sec_func, parg1); + } else if (streq(fnnm, "localtime2sec")) { return rval_evaluator_alloc_from_i_s_func(i_s_localtime2sec_func, parg1); } else if (streq(fnnm, "hexfmt")) { return rval_evaluator_alloc_from_x_x_func(s_x_hexfmt_func, parg1); } else if 
(streq(fnnm, "hms2fsec")) { return rval_evaluator_alloc_from_f_s_func(f_s_hms2fsec_func, parg1); } else if (streq(fnnm, "hms2sec")) { return rval_evaluator_alloc_from_f_s_func(i_s_hms2sec_func, parg1); @@ -1146,6 +1189,8 @@ } else if (streq(fnnm, "sec2dhms")) { return rval_evaluator_alloc_from_s_i_func(s_i_sec2dhms_func, parg1); } else if (streq(fnnm, "sec2gmt")) { return rval_evaluator_alloc_from_x_x_func(s_x_sec2gmt_func, parg1); } else if (streq(fnnm, "sec2gmtdate")) { return rval_evaluator_alloc_from_x_x_func(s_x_sec2gmtdate_func, parg1); + } else if (streq(fnnm, "sec2localtime")) { return rval_evaluator_alloc_from_x_x_func(s_x_sec2localtime_func, parg1); + } else if (streq(fnnm, "sec2localdate")) { return rval_evaluator_alloc_from_x_x_func(s_x_sec2localdate_func, parg1); } else if (streq(fnnm, "sec2hms")) { return rval_evaluator_alloc_from_s_i_func(s_i_sec2hms_func, parg1); } else if (streq(fnnm, "sgn")) { return rval_evaluator_alloc_from_x_x_func(x_x_sgn_func, parg1); } else if (streq(fnnm, "sin")) { return rval_evaluator_alloc_from_f_f_func(f_f_sin_func, parg1); @@ -1157,6 +1202,11 @@ } else if (streq(fnnm, "tanh")) { return rval_evaluator_alloc_from_f_f_func(f_f_tanh_func, parg1); } else if (streq(fnnm, "tolower")) { return rval_evaluator_alloc_from_s_s_func(s_s_tolower_func, parg1); } else if (streq(fnnm, "toupper")) { return rval_evaluator_alloc_from_s_s_func(s_s_toupper_func, parg1); + } else if (streq(fnnm, "lstrip")) { return rval_evaluator_alloc_from_s_s_func(s_s_lstrip_func, parg1); + } else if (streq(fnnm, "rstrip")) { return rval_evaluator_alloc_from_s_s_func(s_s_rstrip_func, parg1); + } else if (streq(fnnm, "strip")) { return rval_evaluator_alloc_from_s_s_func(s_s_strip_func, parg1); + } else if (streq(fnnm, "collapse_whitespace")) { return rval_evaluator_alloc_from_s_s_func(s_s_collapse_whitespace_func, parg1); + } else if (streq(fnnm, "clean_whitespace")) { return rval_evaluator_alloc_from_s_s_func(s_s_clean_whitespace_func, parg1); } else if 
(streq(fnnm, "~")) { return rval_evaluator_alloc_from_i_i_func(i_i_bitwise_not_func, parg1); } else return NULL; @@ -1171,6 +1221,8 @@ } else if (streq(fnnm, "^^")) { return rval_evaluator_alloc_from_b_bb_xor_func(parg1, parg2); } else if (streq(fnnm, "=~")) { return rval_evaluator_alloc_from_x_ssc_func( matches_no_precomp_func, parg1, parg2); + } else if (streq(fnnm, "regextract")) { return rval_evaluator_alloc_from_x_ss_func( + regextract_no_precomp_func, parg1, parg2); } else if (streq(fnnm, "!=~")) { return rval_evaluator_alloc_from_x_ssc_func(does_not_match_no_precomp_func, parg1, parg2); } else if (streq(fnnm, "==")) { return rval_evaluator_alloc_from_x_xx_func(eq_op_func, parg1, parg2); } else if (streq(fnnm, "!=")) { return rval_evaluator_alloc_from_x_xx_func(ne_op_func, parg1, parg2); @@ -1200,13 +1252,16 @@ } else if (streq(fnnm, "fmtnum")) { return rval_evaluator_alloc_from_s_xs_func(s_xs_fmtnum_func, parg1, parg2); } else if (streq(fnnm, "urandint")) { return rval_evaluator_alloc_from_i_ii_func(i_ii_urandint_func, parg1, parg2); } else if (streq(fnnm, "sec2gmt")) { return rval_evaluator_alloc_from_x_xi_func(s_xi_sec2gmt_func, parg1, parg2); + } else if (streq(fnnm, "sec2localtime")) { return rval_evaluator_alloc_from_x_xi_func(s_xi_sec2localtime_func, parg1, parg2); } else if (streq(fnnm, "&")) { return rval_evaluator_alloc_from_x_xx_func(x_xx_band_func, parg1, parg2); } else if (streq(fnnm, "|")) { return rval_evaluator_alloc_from_x_xx_func(x_xx_bor_func, parg1, parg2); } else if (streq(fnnm, "^")) { return rval_evaluator_alloc_from_x_xx_func(x_xx_bxor_func, parg1, parg2); } else if (streq(fnnm, "<<")) { return rval_evaluator_alloc_from_i_ii_func(i_ii_bitwise_lsh_func, parg1, parg2); } else if (streq(fnnm, ">>")) { return rval_evaluator_alloc_from_i_ii_func(i_ii_bitwise_rsh_func, parg1, parg2); } else if (streq(fnnm, "strftime")) { return rval_evaluator_alloc_from_x_ns_func(s_ns_strftime_func, parg1, parg2); + } else if (streq(fnnm, "strftime_local")) 
{ return rval_evaluator_alloc_from_x_ns_func(s_ns_strftime_local_func, parg1, parg2); } else if (streq(fnnm, "strptime")) { return rval_evaluator_alloc_from_x_ss_func(i_ss_strptime_func, parg1, parg2); + } else if (streq(fnnm, "strptime_local")) { return rval_evaluator_alloc_from_x_ss_func(i_ss_strptime_local_func, parg1, parg2); } else { return NULL; } } @@ -1217,6 +1272,8 @@ return rval_evaluator_alloc_from_x_sr_func(matches_precomp_func, parg1, regex_string, ignore_case); } else if (streq(fnnm, "!=~")) { return rval_evaluator_alloc_from_x_sr_func(does_not_match_precomp_func, parg1, regex_string, ignore_case); + } else if (streq(fnnm, "regextract")) { + return rval_evaluator_alloc_from_x_se_func(regextract_precomp_func, parg1, regex_string, ignore_case); } else { return NULL; } } @@ -1228,6 +1285,10 @@ return rval_evaluator_alloc_from_s_sss_func(sub_no_precomp_func, parg1, parg2, parg3); } else if (streq(fnnm, "gsub")) { return rval_evaluator_alloc_from_s_sss_func(gsub_no_precomp_func, parg1, parg2, parg3); + } else if (streq(fnnm, "ssub")) { + return rval_evaluator_alloc_from_s_sss_func(s_sss_ssub_func, parg1, parg2, parg3); + } else if (streq(fnnm, "regextract_or_else")) { + return rval_evaluator_alloc_from_s_sss_func(regextract_or_else_no_precomp_func, parg1, parg2, parg3); } else if (streq(fnnm, "logifit")) { return rval_evaluator_alloc_from_f_fff_func(f_fff_logifit_func, parg1, parg2, parg3); } else if (streq(fnnm, "madd")) { @@ -1252,6 +1313,8 @@ return rval_evaluator_alloc_from_x_srs_func(sub_precomp_func, parg1, regex_string, ignore_case, parg3); } else if (streq(fnnm, "gsub")) { return rval_evaluator_alloc_from_x_srs_func(gsub_precomp_func, parg1, regex_string, ignore_case, parg3); + } else if (streq(fnnm, "regextract_or_else")) { + return rval_evaluator_alloc_from_x_ses_func(regextract_or_else_precomp_func, parg1, regex_string, ignore_case, parg3); } else { return NULL; } } diff -Nru miller-5.3.0/c/dsl/rval_evaluators.h 
miller-5.4.0/c/dsl/rval_evaluators.h --- miller-5.3.0/c/dsl/rval_evaluators.h 2018-01-06 22:49:24.000000000 +0000 +++ miller-5.4.0/c/dsl/rval_evaluators.h 2018-10-14 20:17:52.000000000 +0000 @@ -160,6 +160,12 @@ char* regex_string, int ignore_case); +rval_evaluator_t* rval_evaluator_alloc_from_x_se_func( + mv_binary_arg2_regextract_func_t* pfunc, + rval_evaluator_t* parg1, + char* regex_string, + int ignore_case); + rval_evaluator_t* rval_evaluator_alloc_from_s_xs_func( mv_binary_func_t* pfunc, rval_evaluator_t* parg1, @@ -171,6 +177,9 @@ rval_evaluator_t* rval_evaluator_alloc_from_x_srs_func(mv_ternary_arg2_regex_func_t* pfunc, rval_evaluator_t* parg1, char* regex_string, int ignore_case, rval_evaluator_t* parg3); +rval_evaluator_t* rval_evaluator_alloc_from_x_ses_func(mv_ternary_arg2_regextract_func_t* pfunc, + rval_evaluator_t* parg1, char* regex_string, int ignore_case, rval_evaluator_t* parg3); + // ================================================================ // rval_list_evaluators.c // ================================================================ diff -Nru miller-5.3.0/c/dsl/rval_func_evaluators.c miller-5.4.0/c/dsl/rval_func_evaluators.c --- miller-5.3.0/c/dsl/rval_func_evaluators.c 2018-01-06 22:49:24.000000000 +0000 +++ miller-5.4.0/c/dsl/rval_func_evaluators.c 2018-10-14 20:17:52.000000000 +0000 @@ -1160,6 +1160,50 @@ } // ---------------------------------------------------------------- +typedef struct _rval_evaluator_x_se_state_t { + mv_binary_arg2_regextract_func_t* pfunc; + rval_evaluator_t* parg1; + regex_t regex; +} rval_evaluator_x_se_state_t; + +static mv_t rval_evaluator_x_se_func(void* pvstate, variables_t* pvars) { + rval_evaluator_x_se_state_t* pstate = pvstate; + mv_t val1 = pstate->parg1->pprocess_func(pstate->parg1->pvstate, pvars); + + NULL_OR_ERROR_OUT_FOR_STRINGS(val1); + if (!mv_is_string_or_empty(&val1)) + return mv_error(); + + return pstate->pfunc(&val1, &pstate->regex); +} + +static void 
rval_evaluator_x_se_free(rval_evaluator_t* pevaluator) { + rval_evaluator_x_se_state_t* pstate = pevaluator->pvstate; + pstate->parg1->pfree_func(pstate->parg1); + regfree(&pstate->regex); + free(pstate); + free(pevaluator); +} + +rval_evaluator_t* rval_evaluator_alloc_from_x_se_func(mv_binary_arg2_regextract_func_t* pfunc, + rval_evaluator_t* parg1, char* regex_string, int ignore_case) +{ + rval_evaluator_x_se_state_t* pstate = mlr_malloc_or_die(sizeof(rval_evaluator_x_se_state_t)); + pstate->pfunc = pfunc; + pstate->parg1 = parg1; + + int cflags = ignore_case ? REG_ICASE : 0; + regcomp_or_die(&pstate->regex, regex_string, cflags); + + rval_evaluator_t* pevaluator = mlr_malloc_or_die(sizeof(rval_evaluator_t)); + pevaluator->pvstate = pstate; + pevaluator->pprocess_func = rval_evaluator_x_se_func; + pevaluator->pfree_func = rval_evaluator_x_se_free; + + return pevaluator; +} + +// ---------------------------------------------------------------- typedef struct _rval_evaluator_s_xs_state_t { mv_binary_func_t* pfunc; rval_evaluator_t* parg1; @@ -1315,3 +1359,59 @@ return pevaluator; } + +// ---------------------------------------------------------------- +typedef struct _rval_evaluator_x_ses_state_t { + mv_ternary_arg2_regextract_func_t* pfunc; + rval_evaluator_t* parg1; + regex_t regex; + rval_evaluator_t* parg3; +} rval_evaluator_x_ses_state_t; + +static mv_t rval_evaluator_x_ses_func(void* pvstate, variables_t* pvars) { + rval_evaluator_x_ses_state_t* pstate = pvstate; + + mv_t val1 = pstate->parg1->pprocess_func(pstate->parg1->pvstate, pvars); + NULL_OR_ERROR_OUT_FOR_STRINGS(val1); + if (!mv_is_string_or_empty(&val1)) + return mv_error(); + + mv_t val3 = pstate->parg3->pprocess_func(pstate->parg3->pvstate, pvars); + NULL_OR_ERROR_OUT_FOR_STRINGS(val3); + if (!mv_is_string_or_empty(&val3)) { + mv_free(&val3); + return mv_error(); + } + + return pstate->pfunc(&val1, &pstate->regex, &val3); +} + +static void rval_evaluator_x_ses_free(rval_evaluator_t* pevaluator) { + 
rval_evaluator_x_ses_state_t* pstate = pevaluator->pvstate; + pstate->parg1->pfree_func(pstate->parg1); + regfree(&pstate->regex); + pstate->parg3->pfree_func(pstate->parg3); + free(pstate); + free(pevaluator); +} + +rval_evaluator_t* rval_evaluator_alloc_from_x_ses_func(mv_ternary_arg2_regextract_func_t* pfunc, + rval_evaluator_t* parg1, char* regex_string, int ignore_case, rval_evaluator_t* parg3) +{ + rval_evaluator_x_ses_state_t* pstate = mlr_malloc_or_die(sizeof(rval_evaluator_x_ses_state_t)); + pstate->pfunc = pfunc; + + pstate->parg1 = parg1; + + int cflags = ignore_case ? REG_ICASE : 0; + regcomp_or_die(&pstate->regex, regex_string, cflags); + + pstate->parg3 = parg3; + + rval_evaluator_t* pevaluator = mlr_malloc_or_die(sizeof(rval_evaluator_t)); + pevaluator->pvstate = pstate; + pevaluator->pprocess_func = rval_evaluator_x_ses_func; + pevaluator->pfree_func = rval_evaluator_x_ses_free; + + return pevaluator; +} diff -Nru miller-5.3.0/c/experimental/strx.c miller-5.4.0/c/experimental/strx.c --- miller-5.3.0/c/experimental/strx.c 2018-01-06 22:49:24.000000000 +0000 +++ miller-5.4.0/c/experimental/strx.c 2018-10-14 20:17:52.000000000 +0000 @@ -139,7 +139,7 @@ MLR_GLOBALS.bargv0, time_string, format_string, MLR_GLOBALS.bargv0); exit(1); } - time_t iseconds = mlr_arch_timegm(&tm); + time_t iseconds = mlr_arch_timegmlocal(&tm, TIMEZONE_HANDLING_GMT); printf("%s, %s -> %u, \"%s\"\n", time_string, format_string, (unsigned)iseconds, strptime_retval); return 0; } @@ -206,7 +206,7 @@ printf("STRPTIME ELIDE RETVAL \"%s\"\n", strptime_retval); // 6. Convert the tm to a time_t (seconds since the epoch) and then add the fractional seceonds. 
- time_t iseconds = mlr_arch_timegm(&tm); + time_t iseconds = mlr_arch_timegmlocal(&tm); printf("ISECONDS %u\n", (unsigned)iseconds); double fseconds = iseconds + fractional_seconds; printf("FSECONDS %.6lf\n", fseconds); diff -Nru miller-5.3.0/c/input/lrec_reader_mmap_csv.c miller-5.4.0/c/input/lrec_reader_mmap_csv.c --- miller-5.3.0/c/input/lrec_reader_mmap_csv.c 2018-01-06 22:49:24.000000000 +0000 +++ miller-5.4.0/c/input/lrec_reader_mmap_csv.c 2018-10-14 20:17:52.000000000 +0000 @@ -462,10 +462,11 @@ lrec_t* prec = lrec_unbacked_alloc(); for (rsllse_t* pd = pdata_fields->phead; idx < pdata_fields->length && pd != NULL; pd = pd->pnext) { idx++; - char free_flags = pd->free_flag; - char* key = low_int_to_string(idx, &free_flags); + char key_free_flags = 0; + char* key = low_int_to_string(idx, &key_free_flags); + char value_free_flags = pd->free_flag; // Transfer pointer-free responsibility from the rslls to the lrec object - lrec_put_ext(prec, key, pd->value, free_flags, pd->quote_flag); + lrec_put_ext(prec, key, pd->value, key_free_flags | value_free_flags, pd->quote_flag); pd->free_flag = 0; } return prec; diff -Nru miller-5.3.0/c/input/lrec_reader_stdio_csv.c miller-5.4.0/c/input/lrec_reader_stdio_csv.c --- miller-5.3.0/c/input/lrec_reader_stdio_csv.c 2018-01-06 22:49:24.000000000 +0000 +++ miller-5.4.0/c/input/lrec_reader_stdio_csv.c 2018-10-14 20:17:52.000000000 +0000 @@ -511,10 +511,11 @@ int idx = 0; for (rsllse_t* pd = pdata_fields->phead; pd != NULL; pd = pd->pnext) { idx++; - char free_flags = pd->free_flag; - char* key = low_int_to_string(idx, &free_flags); + char key_free_flags = 0; + char* key = low_int_to_string(idx, &key_free_flags); + char value_free_flags = pd->free_flag; // Transfer pointer-free responsibility from the rslls to the lrec object - lrec_put_ext(prec, key, pd->value, free_flags, pd->quote_flag); + lrec_put_ext(prec, key, pd->value, key_free_flags | value_free_flags, pd->quote_flag); pd->free_flag = 0; } return prec; diff -Nru 
miller-5.3.0/c/lib/Makefile.am miller-5.4.0/c/lib/Makefile.am --- miller-5.3.0/c/lib/Makefile.am 2018-01-06 22:49:24.000000000 +0000 +++ miller-5.4.0/c/lib/Makefile.am 2018-10-14 20:17:52.000000000 +0000 @@ -15,6 +15,7 @@ mlrstat.h \ mlrregex.c \ mlrregex.h \ + mlrtimezone.h \ mlrutil.c \ mlrutil.h \ mlrval.c \ diff -Nru miller-5.3.0/c/lib/mlr_arch.c miller-5.4.0/c/lib/mlr_arch.c --- miller-5.3.0/c/lib/mlr_arch.c 2018-01-06 22:49:24.000000000 +0000 +++ miller-5.4.0/c/lib/mlr_arch.c 2018-10-14 20:17:52.000000000 +0000 @@ -41,21 +41,33 @@ // ---------------------------------------------------------------- // See the GNU timegm manpage -- this is what it does. -time_t mlr_arch_timegm(struct tm* ptm) { +time_t mlr_arch_timegmlocal(struct tm* ptm, timezone_handling_t timezone_handling) { #ifdef MLR_ON_MSYS2 + // Crap, we're offering limited Windows support :( + if (timezone_handling != TIMEZONE_HANDLING_GMT) { + fprintf(stderr, "%s: Local timezone is not handled for output.\n", + MLR_GLOBALS.bargv0); + exit(1); + } return nlnet_timegm(ptm); #else time_t ret; - char* tz = getenv("TZ"); - mlr_arch_setenv("TZ", "GMT0"); - tzset(); - ret = mktime(ptm); - if (tz) { - mlr_arch_setenv("TZ", tz); + + if (timezone_handling == TIMEZONE_HANDLING_GMT) { + char* tz = getenv("TZ"); + mlr_arch_setenv("TZ", "GMT0"); + tzset(); + ret = mktime(ptm); + if (tz) { + mlr_arch_setenv("TZ", tz); + } else { + mlr_arch_unsetenv("TZ"); + } + tzset(); } else { - mlr_arch_unsetenv("TZ"); + ret = mktime(ptm); } - tzset(); + return ret; #endif } diff -Nru miller-5.3.0/c/lib/mlr_arch.h miller-5.4.0/c/lib/mlr_arch.h --- miller-5.3.0/c/lib/mlr_arch.h 2018-01-06 22:49:24.000000000 +0000 +++ miller-5.4.0/c/lib/mlr_arch.h 2018-10-14 20:17:52.000000000 +0000 @@ -3,6 +3,7 @@ #include #include +#include "mlrtimezone.h" // ================================================================ // Miller compiles without ifdefs on Linux, BSDs, and MacOSX -- but @@ -35,6 +36,6 @@ int mlr_arch_unsetenv(const char 
*name); char *mlr_arch_strptime(const char *s, const char *format, struct tm *ptm); -time_t mlr_arch_timegm(struct tm* ptm); +time_t mlr_arch_timegmlocal(struct tm* ptm, timezone_handling_t timezone_handling); #endif // MLR_ARCH_H diff -Nru miller-5.3.0/c/lib/mlrdatetime.c miller-5.4.0/c/lib/mlrdatetime.c --- miller-5.3.0/c/lib/mlrdatetime.c 2018-01-06 22:49:24.000000000 +0000 +++ miller-5.4.0/c/lib/mlrdatetime.c 2018-10-14 20:17:52.000000000 +0000 @@ -26,14 +26,29 @@ // to produce a formatted string. The only complication is that we support "%1S" through "%9S" for // formatting the seconds with a desired number of decimal places. -char* mlr_alloc_time_string_from_seconds(double seconds_since_the_epoch, char* format_string) { +char* mlr_alloc_time_string_from_seconds(double seconds_since_the_epoch, char* format_string, + timezone_handling_t timezone_handling) +{ // 1. Split out the integer seconds since the epoch, which the stdlib can handle, and // the fractional part, which it cannot. time_t iseconds = (time_t) seconds_since_the_epoch; double fracsec = seconds_since_the_epoch - iseconds; - struct tm tm = *gmtime(&iseconds); // No gmtime_r on Windows so just use gmtime. + struct tm tm; + switch(timezone_handling) { + case TIMEZONE_HANDLING_GMT: + tm = *gmtime(&iseconds); // No gmtime_r on Windows so just use gmtime. + break; + case TIMEZONE_HANDLING_LOCAL: + tm = *localtime(&iseconds); // No gmtime_r on Windows so just use gmtime. + break; + default: + fprintf(stderr, "%s: internal coding error detected in file %s at line %d.\n", + MLR_GLOBALS.bargv0, __FILE__, __LINE__); + exit(1); + break; + } // 2. See if "%nS" (for n in 1..9) is a substring of the format string. char* middle_nS_format = NULL; @@ -155,8 +170,9 @@ // to play some tricks, inspired in part by some ideas on StackOverflow. Special shout-out // to @tinkerware on Github for the push in the right direction! 
:) -double mlr_seconds_from_time_string(char* time_string, char* format_string) { - +double mlr_seconds_from_time_string(char* time_string, char* format_string, + timezone_handling_t timezone_handling) +{ struct tm tm; // 1. Just try strptime on the input as-is and return quickly if it's OK. @@ -168,7 +184,20 @@ MLR_GLOBALS.bargv0, time_string, format_string, MLR_GLOBALS.bargv0); exit(1); } - return (double)mlr_arch_timegm(&tm); + + // printf("TIME_STRING %s\n", time_string); + // printf("FORMAT_STRING %s\n", time_string); + // printf("tm_year =%d\n", tm.tm_year); + // printf("tm_mon =%d\n", tm.tm_mon); + // printf("tm_mday =%d\n", tm.tm_mday); + // printf("tm_wday =%d\n", tm.tm_wday); + // printf("tm_yday =%d\n", tm.tm_yday); + // printf("tm_hour =%d\n", tm.tm_hour); + // printf("tm_min =%d\n", tm.tm_min); + // printf("tm_sec =%d\n", tm.tm_sec); + // printf("tm_isdst =%d\n", tm.tm_isdst); + + return (double)mlr_arch_timegmlocal(&tm, timezone_handling); } // 2. Now either there's floating-point seconds in the input, or something else is wrong. @@ -254,5 +283,5 @@ free(elided_fraction_input); // 8. Convert the tm to a time_t (seconds since the epoch) and then add the fractional seconds. - return mlr_arch_timegm(&tm) + fractional_seconds; + return mlr_arch_timegmlocal(&tm, timezone_handling) + fractional_seconds; } diff -Nru miller-5.3.0/c/lib/mlrdatetime.h miller-5.4.0/c/lib/mlrdatetime.h --- miller-5.3.0/c/lib/mlrdatetime.h 2018-01-06 22:49:24.000000000 +0000 +++ miller-5.4.0/c/lib/mlrdatetime.h 2018-10-14 20:17:52.000000000 +0000 @@ -5,13 +5,16 @@ #include #include #include +#include "mlrtimezone.h" // Seconds since the epoch. double get_systime(); // These use the gmtime/timegm and strftime/strptime standard-library functions, with the addition // of support for floating-point seconds since the epoch. 
-char* mlr_alloc_time_string_from_seconds(double seconds_since_the_epoch, char* format); -double mlr_seconds_from_time_string(char* string, char* format); +char* mlr_alloc_time_string_from_seconds(double seconds_since_the_epoch, char* format, + timezone_handling_t timezone_handling); +double mlr_seconds_from_time_string(char* string, char* format, + timezone_handling_t timezone_handling); #endif // MLRDATETIME_H diff -Nru miller-5.3.0/c/lib/mlrregex.c miller-5.4.0/c/lib/mlrregex.c --- miller-5.3.0/c/lib/mlrregex.c 2018-01-06 22:49:24.000000000 +0000 +++ miller-5.4.0/c/lib/mlrregex.c 2018-10-14 20:17:52.000000000 +0000 @@ -202,6 +202,34 @@ } // ---------------------------------------------------------------- +char* regextract(char* input, regex_t* pregex) { + const size_t nmatchmax = 1; + regmatch_t matches[nmatchmax]; + + int matched = regmatch_or_die(pregex, input, nmatchmax, matches); + if (!matched) { + return NULL; + } + regmatch_t* pmatch = &matches[0]; + int len = pmatch->rm_eo - pmatch->rm_so; + return mlr_alloc_string_from_char_range(&input[pmatch->rm_so], len); +} + +// ---------------------------------------------------------------- +char* regextract_or_else(char* input, regex_t* pregex, char* default_value) { + const size_t nmatchmax = 1; + regmatch_t matches[nmatchmax]; + + int matched = regmatch_or_die(pregex, input, nmatchmax, matches); + if (!matched) { + return mlr_strdup_or_die(default_value); + } + regmatch_t* pmatch = &matches[0]; + int len = pmatch->rm_eo - pmatch->rm_so; + return mlr_alloc_string_from_char_range(&input[pmatch->rm_so], len); +} + +// ---------------------------------------------------------------- // Slot 0 is the entire matched input string. // Slots 1 and up are substring matches for parenthesized capture expressions (if any). // Example regex "a(.*)e" with input string "abcde": slot 1 points to "bcd" and match_count = 2. 
diff -Nru miller-5.3.0/c/lib/mlrregex.h miller-5.4.0/c/lib/mlrregex.h --- miller-5.3.0/c/lib/mlrregex.h 2018-01-06 22:49:24.000000000 +0000 +++ miller-5.4.0/c/lib/mlrregex.h 2018-10-14 20:17:52.000000000 +0000 @@ -30,8 +30,12 @@ char* regex_sub(char* input, regex_t* pregex, string_builder_t* psb, char* replacement, int* pmatched, int* pall_captured); -char* regex_gsub(char* input, regex_t* pregex, string_builder_t* psb, char* replacement, int* pmatched, int* pall_captured, - char *pfree_flags); +char* regex_gsub(char* input, regex_t* pregex, string_builder_t* psb, char* replacement, + int* pmatched, int* pall_captured, char *pfree_flags); + +// The return value is dynamically allocated if there is a match, else it returns null. +char* regextract(char* input, regex_t* pregex); +char* regextract_or_else(char* input, regex_t* pregex, char* default_value); // The regex library gives us an array of match pointers into the input string. This function strdups them // out into separate storage, to implement "\0", "\1", "\2", etc. regex-captures for the =~ and !=~ operators. 
diff -Nru miller-5.3.0/c/lib/mlrtimezone.h miller-5.4.0/c/lib/mlrtimezone.h --- miller-5.3.0/c/lib/mlrtimezone.h 1970-01-01 00:00:00.000000000 +0000 +++ miller-5.4.0/c/lib/mlrtimezone.h 2018-10-14 20:17:52.000000000 +0000 @@ -0,0 +1,9 @@ +#ifndef MLRTIMEZONE_H +#define MLRTIMEZONE_H + +typedef enum _timezone_handling_t { + TIMEZONE_HANDLING_GMT, + TIMEZONE_HANDLING_LOCAL +} timezone_handling_t; + +#endif // MLRTIMEZONE_H diff -Nru miller-5.3.0/c/lib/mvfuncs.c miller-5.4.0/c/lib/mvfuncs.c --- miller-5.3.0/c/lib/mvfuncs.c 2018-01-06 22:49:24.000000000 +0000 +++ miller-5.4.0/c/lib/mvfuncs.c 2018-10-14 20:17:52.000000000 +0000 @@ -193,6 +193,100 @@ } // ---------------------------------------------------------------- +mv_t regextract_no_precomp_func(mv_t* pval1, mv_t* pval2) { + regex_t regex; + mv_t rv = regextract_precomp_func(pval1, regcomp_or_die(&regex, pval2->u.strv, 0)); + regfree(&regex); + mv_free(pval2); + return rv; +} + +// ---------------------------------------------------------------- +mv_t regextract_precomp_func(mv_t* pval1, regex_t* pregex) { + char* input = pval1->u.strv; + char* output = regextract(input, pregex); + + mv_free(pval1); + if (output == NULL) { + return mv_absent(); + } else { + return mv_from_string_with_free(output); + } +} + +// ---------------------------------------------------------------- +mv_t regextract_or_else_no_precomp_func(mv_t* pval1, mv_t* pval2, mv_t* pval3) { + regex_t regex; + mv_t rv = regextract_or_else_precomp_func(pval1, regcomp_or_die(&regex, pval2->u.strv, 0), pval3); + regfree(&regex); + mv_free(pval2); + return rv; +} + +// ---------------------------------------------------------------- +mv_t regextract_or_else_precomp_func(mv_t* pval1, regex_t* pregex, mv_t* pval3) { + char* input = pval1->u.strv; + char* default_value = pval3->u.strv; + char* output = regextract_or_else(input, pregex, default_value); + + mv_free(pval1); + mv_free(pval3); + return mv_from_string_with_free(output); +} + +// 
---------------------------------------------------------------- +// String-substitution with no regexes or special characters. +// It is assumed that all inputs have already been checked to be strings. +mv_t s_sss_ssub_func(mv_t* pmvinput, mv_t* pmvold, mv_t* pmvnew) { + char* pinput = pmvinput->u.strv; + char* pold = pmvold->u.strv; + char* pnew = pmvnew->u.strv; + + char* pmatch = strstr(pinput, pold); + + if (pmatch == NULL) { + mv_free(pmvold); + mv_free(pmvnew); + return *pmvinput; + } else { + // Example: + // input: aaaaOOObbbbb + // old: OOO + // new: NNNNN + // Output length: strlen(aaaa) + strlen(NNNNN) + strlen(bbbbb) + + // Compute lengths + int input_length = strlen(pinput); + int old_length = strlen(pold); + int new_length = strlen(pnew); + int output_length = input_length - old_length + new_length + 1; + int pre_length = pmatch - pinput; // the "aaaa" part + int post_length = input_length - pre_length - old_length; // the "bbbbb" part + + // Allocate output + char* poutput = mlr_malloc_or_die(output_length); + char* p = poutput; + + // Populate output + strncpy(p, pinput, pre_length); + p += pre_length; + + strcpy(p, pnew); + p += new_length; + + strcpy(p, &pinput[pre_length + old_length]); + p += post_length; + + *p = 0; + + mv_free(pmvinput); + mv_free(pmvold); + mv_free(pmvnew); + return mv_from_string(poutput, FREE_ENTRY_VALUE); + } +} + +// ---------------------------------------------------------------- // https://en.wikipedia.org/wiki/Hamming_weight static const unsigned long long _m1 = 0x5555555555555555; @@ -294,6 +388,19 @@ } // ---------------------------------------------------------------- +mv_t i_s_strlen_func(mv_t* pval1) { + mv_t rv = mv_from_int(strlen_for_utf8_display(pval1->u.strv)); + mv_free(pval1); + return rv; +} + +mv_t s_x_typeof_func(mv_t* pval1) { + mv_t rv = mv_from_string(mt_describe_type(pval1->type), NO_FREE); + mv_free(pval1); + return rv; +} + +// ---------------------------------------------------------------- mv_t 
s_s_tolower_func(mv_t* pval1) { char* string = mlr_strdup_or_die(pval1->u.strv); for (char* c = string; *c; c++) @@ -314,21 +421,72 @@ return mv_from_string_with_free(string); } -mv_t i_s_strlen_func(mv_t* pval1) { - mv_t rv = mv_from_int(strlen_for_utf8_display(pval1->u.strv)); - mv_free(pval1); - return rv; +// ---------------------------------------------------------------- +mv_t s_s_lstrip_func(mv_t* pval1) { + if (!isspace(pval1->u.strv[0])) { + return *pval1; + } else { + char* p = pval1->u.strv; + while (isspace(*p)) { + p++; + } + return mv_from_string(mlr_strdup_or_die(p), FREE_ENTRY_VALUE); + } } -mv_t s_x_typeof_func(mv_t* pval1) { - mv_t rv = mv_from_string(mt_describe_type(pval1->type), NO_FREE); +mv_t s_s_rstrip_func(mv_t* pval1) { + char* start = pval1->u.strv; + int oldlen = strlen(start); + char* last_non_space = &start[oldlen-1]; + while ((start <= last_non_space) && isspace(*last_non_space)) + last_non_space--; + if (last_non_space < start) { + mv_free(pval1); + return mv_empty(); + } else { + int newlen = (last_non_space - start) + 1; + char* retval = mlr_malloc_or_die(newlen + 1); + memcpy(retval, start, newlen); + retval[newlen] = 0; + mv_free(pval1); + return mv_from_string(retval, FREE_ENTRY_VALUE); + } +} + +mv_t s_s_strip_func(mv_t* pval1) { + mv_t temp = s_s_rstrip_func(pval1); + return s_s_lstrip_func(&temp); +} + +mv_t s_s_collapse_whitespace_func(mv_t* pval1) { + int len = strlen(pval1->u.strv); + char* retval = mlr_malloc_or_die(len+1); + char* pdst = retval; + int last_was_space = FALSE; + for (char* psrc = pval1->u.strv; *psrc; psrc++) { + int current_is_space = isspace(*psrc); + if (last_was_space && current_is_space) { + } else { + *pdst = *psrc; + pdst++; + } + last_was_space = current_is_space; + } + *pdst = 0; mv_free(pval1); - return rv; + return mv_from_string(retval, FREE_ENTRY_VALUE); +} + +mv_t s_s_clean_whitespace_func(mv_t* pval1) { + mv_t temp = s_s_collapse_whitespace_func(pval1); + return s_s_strip_func(&temp); } // 
---------------------------------------------------------------- // Precondition: psec is either int or float. -mv_t time_string_from_seconds(mv_t* psec, char* format) { +mv_t time_string_from_seconds(mv_t* psec, char* format, + timezone_handling_t timezone_handling) +{ double seconds_since_the_epoch = 0.0; if (psec->type == MT_FLOAT) { if (isinf(psec->u.fltv) || isnan(psec->u.fltv)) { @@ -339,13 +497,16 @@ seconds_since_the_epoch = psec->u.intv; } - char* string = mlr_alloc_time_string_from_seconds(seconds_since_the_epoch, format); + char* string = mlr_alloc_time_string_from_seconds(seconds_since_the_epoch, format, + timezone_handling); return mv_from_string_with_free(string); } // ---------------------------------------------------------------- -static mv_t sec2gmt_s_n(mv_t* pa) { return time_string_from_seconds(pa, ISO8601_TIME_FORMAT); } +static mv_t sec2gmt_s_n(mv_t* pa) { + return time_string_from_seconds(pa, ISO8601_TIME_FORMAT, TIMEZONE_HANDLING_GMT); +} static mv_unary_func_t* sec2gmt_dispositions[MT_DIM] = { /*ERROR*/ _err1, @@ -361,15 +522,15 @@ // Precondition: val2 is already asserted int static mv_t sec2gmt_s_ni(mv_t* pa, mv_t* pb) { switch (pb->u.intv) { - case 1: return time_string_from_seconds(pa, ISO8601_TIME_FORMAT_1); break; - case 2: return time_string_from_seconds(pa, ISO8601_TIME_FORMAT_2); break; - case 3: return time_string_from_seconds(pa, ISO8601_TIME_FORMAT_3); break; - case 4: return time_string_from_seconds(pa, ISO8601_TIME_FORMAT_4); break; - case 5: return time_string_from_seconds(pa, ISO8601_TIME_FORMAT_5); break; - case 6: return time_string_from_seconds(pa, ISO8601_TIME_FORMAT_6); break; - case 7: return time_string_from_seconds(pa, ISO8601_TIME_FORMAT_7); break; - case 8: return time_string_from_seconds(pa, ISO8601_TIME_FORMAT_8); break; - case 9: return time_string_from_seconds(pa, ISO8601_TIME_FORMAT_9); break; + case 1: return time_string_from_seconds(pa, ISO8601_TIME_FORMAT_1, TIMEZONE_HANDLING_GMT); break; + case 2: return 
time_string_from_seconds(pa, ISO8601_TIME_FORMAT_2, TIMEZONE_HANDLING_GMT); break; + case 3: return time_string_from_seconds(pa, ISO8601_TIME_FORMAT_3, TIMEZONE_HANDLING_GMT); break; + case 4: return time_string_from_seconds(pa, ISO8601_TIME_FORMAT_4, TIMEZONE_HANDLING_GMT); break; + case 5: return time_string_from_seconds(pa, ISO8601_TIME_FORMAT_5, TIMEZONE_HANDLING_GMT); break; + case 6: return time_string_from_seconds(pa, ISO8601_TIME_FORMAT_6, TIMEZONE_HANDLING_GMT); break; + case 7: return time_string_from_seconds(pa, ISO8601_TIME_FORMAT_7, TIMEZONE_HANDLING_GMT); break; + case 8: return time_string_from_seconds(pa, ISO8601_TIME_FORMAT_8, TIMEZONE_HANDLING_GMT); break; + case 9: return time_string_from_seconds(pa, ISO8601_TIME_FORMAT_9, TIMEZONE_HANDLING_GMT); break; default: return mv_error(); } } @@ -386,7 +547,9 @@ mv_t s_xi_sec2gmt_func(mv_t* pval1, mv_t* pval2) { return (sec2gmtn_dispositions[pval1->type])(pval1, pval2); } // ---------------------------------------------------------------- -static mv_t sec2gmtdate_s_n(mv_t* pa) { return time_string_from_seconds(pa, ISO8601_DATE_FORMAT); } +static mv_t sec2gmtdate_s_n(mv_t* pa) { + return time_string_from_seconds(pa, ISO8601_DATE_FORMAT, TIMEZONE_HANDLING_GMT); +} static mv_unary_func_t* sec2gmtdate_dispositions[MT_DIM] = { /*ERROR*/ _err1, @@ -401,29 +564,112 @@ mv_t s_x_sec2gmtdate_func(mv_t* pval1) { return (sec2gmtdate_dispositions[pval1->type])(pval1); } // ---------------------------------------------------------------- +static mv_t sec2localtime_s_n(mv_t* pa) { + return time_string_from_seconds(pa, ISO8601_LOCAL_TIME_FORMAT, TIMEZONE_HANDLING_LOCAL); +} + +static mv_unary_func_t* sec2localtime_dispositions[MT_DIM] = { + /*ERROR*/ _err1, + /*ABSENT*/ _a1, + /*EMPTY*/ _emt1, + /*STRING*/ _0, + /*INT*/ sec2localtime_s_n, + /*FLOAT*/ sec2localtime_s_n, + /*BOOL*/ _0, +}; +mv_t s_x_sec2localtime_func(mv_t* pval1) { return (sec2localtime_dispositions[pval1->type])(pval1); } + +// Precondition: val2 is 
already asserted int +static mv_t sec2localtime_s_ni(mv_t* pa, mv_t* pb) { + switch (pb->u.intv) { + case 1: return time_string_from_seconds(pa, ISO8601_LOCAL_TIME_FORMAT_1, TIMEZONE_HANDLING_LOCAL); break; + case 2: return time_string_from_seconds(pa, ISO8601_LOCAL_TIME_FORMAT_2, TIMEZONE_HANDLING_LOCAL); break; + case 3: return time_string_from_seconds(pa, ISO8601_LOCAL_TIME_FORMAT_3, TIMEZONE_HANDLING_LOCAL); break; + case 4: return time_string_from_seconds(pa, ISO8601_LOCAL_TIME_FORMAT_4, TIMEZONE_HANDLING_LOCAL); break; + case 5: return time_string_from_seconds(pa, ISO8601_LOCAL_TIME_FORMAT_5, TIMEZONE_HANDLING_LOCAL); break; + case 6: return time_string_from_seconds(pa, ISO8601_LOCAL_TIME_FORMAT_6, TIMEZONE_HANDLING_LOCAL); break; + case 7: return time_string_from_seconds(pa, ISO8601_LOCAL_TIME_FORMAT_7, TIMEZONE_HANDLING_LOCAL); break; + case 8: return time_string_from_seconds(pa, ISO8601_LOCAL_TIME_FORMAT_8, TIMEZONE_HANDLING_LOCAL); break; + case 9: return time_string_from_seconds(pa, ISO8601_LOCAL_TIME_FORMAT_9, TIMEZONE_HANDLING_LOCAL); break; + default: return mv_error(); + } +} + +static mv_binary_func_t* sec2localn_dispositions[MT_DIM] = { + /*ERROR*/ _err, + /*ABSENT*/ _a, + /*EMPTY*/ _emt, + /*STRING*/ _1, + /*INT*/ sec2localtime_s_ni, + /*FLOAT*/ sec2localtime_s_ni, + /*BOOL*/ _1, +}; +mv_t s_xi_sec2localtime_func(mv_t* pval1, mv_t* pval2) { return (sec2localn_dispositions[pval1->type])(pval1, pval2); } + +// ---------------------------------------------------------------- +static mv_t sec2localdate_s_n(mv_t* pa) { + return time_string_from_seconds(pa, ISO8601_DATE_FORMAT, TIMEZONE_HANDLING_LOCAL); +} + +static mv_unary_func_t* sec2localdate_dispositions[MT_DIM] = { + /*ERROR*/ _err1, + /*ABSENT*/ _a1, + /*EMPTY*/ _emt1, + /*STRING*/ _0, + /*INT*/ sec2localdate_s_n, + /*FLOAT*/ sec2localdate_s_n, + /*BOOL*/ _0, +}; + +mv_t s_x_sec2localdate_func(mv_t* pval1) { return (sec2localdate_dispositions[pval1->type])(pval1); } + + +// 
---------------------------------------------------------------- mv_t s_ns_strftime_func(mv_t* pval1, mv_t* pval2) { - mv_t rv = time_string_from_seconds(pval1, pval2->u.strv); + mv_t rv = time_string_from_seconds(pval1, pval2->u.strv, TIMEZONE_HANDLING_GMT); mv_free(pval2); return rv; } // ---------------------------------------------------------------- -static mv_t seconds_from_time_string(char* string, char* format) { +mv_t s_ns_strftime_local_func(mv_t* pval1, mv_t* pval2) { + mv_t rv = time_string_from_seconds(pval1, pval2->u.strv, TIMEZONE_HANDLING_LOCAL); + mv_free(pval2); + return rv; +} + +// ---------------------------------------------------------------- +static mv_t seconds_from_time_string(char* string, char* format, + timezone_handling_t timezone_handling) +{ if (*string == '\0') { return mv_empty(); } else { - return mv_from_float(mlr_seconds_from_time_string(string, format)); + return mv_from_float(mlr_seconds_from_time_string(string, format, timezone_handling)); } } mv_t i_s_gmt2sec_func(mv_t* pval1) { - mv_t rv = seconds_from_time_string(pval1->u.strv, ISO8601_TIME_FORMAT); + mv_t rv = seconds_from_time_string(pval1->u.strv, ISO8601_TIME_FORMAT, TIMEZONE_HANDLING_GMT); + mv_free(pval1); + return rv; +} + +mv_t i_s_localtime2sec_func(mv_t* pval1) { + mv_t rv = seconds_from_time_string(pval1->u.strv, ISO8601_LOCAL_TIME_FORMAT, TIMEZONE_HANDLING_LOCAL); mv_free(pval1); return rv; } mv_t i_ss_strptime_func(mv_t* pval1, mv_t* pval2) { - mv_t rv = seconds_from_time_string(pval1->u.strv, pval2->u.strv); + mv_t rv = seconds_from_time_string(pval1->u.strv, pval2->u.strv, TIMEZONE_HANDLING_GMT); + mv_free(pval1); + mv_free(pval2); + return rv; +} + +mv_t i_ss_strptime_local_func(mv_t* pval1, mv_t* pval2) { + mv_t rv = seconds_from_time_string(pval1->u.strv, pval2->u.strv, TIMEZONE_HANDLING_LOCAL); mv_free(pval1); mv_free(pval2); return rv; diff -Nru miller-5.3.0/c/lib/mvfuncs.h miller-5.4.0/c/lib/mvfuncs.h --- miller-5.3.0/c/lib/mvfuncs.h 2018-01-06 
22:49:24.000000000 +0000 +++ miller-5.4.0/c/lib/mvfuncs.h 2018-10-14 20:17:52.000000000 +0000 @@ -28,6 +28,17 @@ #define ISO8601_TIME_FORMAT_9 "%Y-%m-%dT%H:%M:%9SZ" #define ISO8601_DATE_FORMAT "%Y-%m-%d" +#define ISO8601_LOCAL_TIME_FORMAT "%Y-%m-%d %H:%M:%S" +#define ISO8601_LOCAL_TIME_FORMAT_1 "%Y-%m-%d %H:%M:%1S" +#define ISO8601_LOCAL_TIME_FORMAT_2 "%Y-%m-%d %H:%M:%2S" +#define ISO8601_LOCAL_TIME_FORMAT_3 "%Y-%m-%d %H:%M:%3S" +#define ISO8601_LOCAL_TIME_FORMAT_4 "%Y-%m-%d %H:%M:%4S" +#define ISO8601_LOCAL_TIME_FORMAT_5 "%Y-%m-%d %H:%M:%5S" +#define ISO8601_LOCAL_TIME_FORMAT_6 "%Y-%m-%d %H:%M:%6S" +#define ISO8601_LOCAL_TIME_FORMAT_7 "%Y-%m-%d %H:%M:%7S" +#define ISO8601_LOCAL_TIME_FORMAT_8 "%Y-%m-%d %H:%M:%8S" +#define ISO8601_LOCAL_TIME_FORMAT_9 "%Y-%m-%d %H:%M:%9S" + // ---------------------------------------------------------------- typedef mv_t mv_variadic_func_t(mv_t* pvals, int nvals); typedef mv_t mv_zary_func_t(); @@ -35,8 +46,10 @@ typedef mv_t mv_binary_func_t(mv_t* pval1, mv_t* pval2); typedef mv_t mv_binary_arg3_capture_func_t(mv_t* pval1, mv_t* pval2, string_array_t** ppregex_captures); typedef mv_t mv_binary_arg2_regex_func_t(mv_t* pval1, regex_t* pregex, string_builder_t* psb, string_array_t** ppregex_captures); +typedef mv_t mv_binary_arg2_regextract_func_t(mv_t* pval1, regex_t* pregex); typedef mv_t mv_ternary_func_t(mv_t* pval1, mv_t* pval2, mv_t* pval3); typedef mv_t mv_ternary_arg2_regex_func_t(mv_t* pval1, regex_t* pregex, string_builder_t* psb, mv_t* pval3); +typedef mv_t mv_ternary_arg2_regextract_func_t(mv_t* pval1, regex_t* pregex, mv_t* pval3); // ---------------------------------------------------------------- static inline mv_t b_b_not_func(mv_t* pval1) { @@ -182,10 +195,15 @@ mv_t i_iii_modexp_func(mv_t* pval1, mv_t* pval2, mv_t* pval3); // ---------------------------------------------------------------- -mv_t s_s_tolower_func(mv_t* pval1); -mv_t s_s_toupper_func(mv_t* pval1); mv_t i_s_strlen_func(mv_t* pval1); mv_t 
s_x_typeof_func(mv_t* pval1); +mv_t s_s_tolower_func(mv_t* pval1); +mv_t s_s_toupper_func(mv_t* pval1); +mv_t s_s_lstrip_func(mv_t* pval1); +mv_t s_s_rstrip_func(mv_t* pval1); +mv_t s_s_strip_func(mv_t* pval1); +mv_t s_s_collapse_whitespace_func(mv_t* pval1); +mv_t s_s_clean_whitespace_func(mv_t* pval1); mv_t s_xx_dot_func(mv_t* pval1, mv_t* pval2); @@ -193,14 +211,30 @@ mv_t sub_precomp_func(mv_t* pval1, regex_t* pregex, string_builder_t* psb, mv_t* pval3); mv_t gsub_no_precomp_func(mv_t* pval1, mv_t* pval2, mv_t* pval3); mv_t gsub_precomp_func(mv_t* pval1, regex_t* pregex, string_builder_t* psb, mv_t* pval3); +mv_t regextract_no_precomp_func(mv_t* pval1, mv_t* pval2); +mv_t regextract_precomp_func(mv_t* pval1, regex_t* pregex); +mv_t regextract_or_else_no_precomp_func(mv_t* pval1, mv_t* pval2, mv_t* pval3); +mv_t regextract_or_else_precomp_func(mv_t* pval1, regex_t* pregex, mv_t* pval3); +// String-substitution with no regexes or special characters. +mv_t s_sss_ssub_func(mv_t* pstring, mv_t* pold, mv_t* pnew); // ---------------------------------------------------------------- mv_t s_x_sec2gmt_func(mv_t* pval1); mv_t s_xi_sec2gmt_func(mv_t* pval1, mv_t* pval2); mv_t s_x_sec2gmtdate_func(mv_t* pval1); + +mv_t s_x_sec2localtime_func(mv_t* pval1); +mv_t s_xi_sec2localtime_func(mv_t* pval1, mv_t* pval2); +mv_t s_x_sec2localdate_func(mv_t* pval1); + mv_t i_s_gmt2sec_func(mv_t* pval1); +mv_t i_s_localtime2sec_func(mv_t* pval1); + mv_t s_ns_strftime_func(mv_t* pval1, mv_t* pval2); +mv_t s_ns_strftime_local_func(mv_t* pval1, mv_t* pval2); + mv_t i_ss_strptime_func(mv_t* pval1, mv_t* pval2); +mv_t i_ss_strptime_local_func(mv_t* pval1, mv_t* pval2); mv_t s_i_sec2hms_func(mv_t* pval1); mv_t s_f_fsec2hms_func(mv_t* pval1); @@ -211,7 +245,8 @@ mv_t i_s_dhms2sec_func(mv_t* pval1); mv_t f_s_dhms2fsec_func(mv_t* pval1); -mv_t time_string_from_seconds(mv_t* psec, char* format); +mv_t time_string_from_seconds(mv_t* psec, char* format, + timezone_handling_t timezone_handling); // 
---------------------------------------------------------------- // arg2 evaluates to string via compound expression; regexes compiled on each call diff -Nru miller-5.3.0/c/Makefile.no-autoconfig miller-5.4.0/c/Makefile.no-autoconfig --- miller-5.3.0/c/Makefile.no-autoconfig 2018-01-06 22:49:24.000000000 +0000 +++ miller-5.4.0/c/Makefile.no-autoconfig 2018-10-14 20:17:52.000000000 +0000 @@ -420,8 +420,10 @@ top: mlr mlrg tests install: mlr tests + mkdir -p $(INSTALLDIR) cp mlr $(INSTALLDIR) installhome: mlr tests + mkdir -p $(HOME)/bin cp mlr $(HOME)/bin # ================================================================ diff -Nru miller-5.3.0/c/mapping/Makefile.am miller-5.4.0/c/mapping/Makefile.am --- miller-5.3.0/c/mapping/Makefile.am 2018-01-06 22:49:24.000000000 +0000 +++ miller-5.4.0/c/mapping/Makefile.am 2018-10-14 20:17:52.000000000 +0000 @@ -1,13 +1,16 @@ noinst_LTLIBRARIES= libmapping.la libmapping_la_SOURCES= \ mapper.h \ + mapper_altkv.c \ mapper_bar.c \ mapper_bootstrap.c \ mapper_cat.c \ mapper_check.c \ + mapper_clean_whitespace.c \ mapper_count_similar.c \ mapper_cut.c \ mapper_decimate.c \ + mapper_fill_down.c \ mapper_grep.c \ mapper_group_like.c \ mapper_having_fields.c \ diff -Nru miller-5.3.0/c/mapping/mapper_altkv.c miller-5.4.0/c/mapping/mapper_altkv.c --- miller-5.3.0/c/mapping/mapper_altkv.c 1970-01-01 00:00:00.000000000 +0000 +++ miller-5.4.0/c/mapping/mapper_altkv.c 2018-10-14 20:17:52.000000000 +0000 @@ -0,0 +1,106 @@ +#include +#include "cli/argparse.h" +#include "lib/mlrutil.h" +#include "lib/mtrand.h" +#include "containers/sllv.h" +#include "mapping/mappers.h" + +typedef struct _mapper_altkv_state_t { + ap_state_t* pargp; +} mapper_altkv_state_t; + +static void mapper_altkv_usage(FILE* o, char* argv0, char* verb); +static mapper_t* mapper_altkv_parse_cli(int* pargi, int argc, char** argv, + cli_reader_opts_t* _, cli_writer_opts_t* __); +static mapper_t* mapper_altkv_alloc(ap_state_t* pargp); +static void mapper_altkv_free(mapper_t* 
pmapper, context_t* _); +static sllv_t* mapper_altkv_process(lrec_t* pinrec, context_t* pctx, void* pvstate); + +// ---------------------------------------------------------------- +mapper_setup_t mapper_altkv_setup = { + .verb = "altkv", + .pusage_func = mapper_altkv_usage, + .pparse_func = mapper_altkv_parse_cli, + .ignores_input = FALSE, +}; + +// ---------------------------------------------------------------- +static void mapper_altkv_usage(FILE* o, char* argv0, char* verb) { + fprintf(o, "Usage: %s %s [no options]\n", argv0, verb); + fprintf(o, "Given fields with values of the form a,b,c,d,e,f emits a=b,c=d,e=f pairs.\n"); +} + +static mapper_t* mapper_altkv_parse_cli(int* pargi, int argc, char** argv, + cli_reader_opts_t* _, cli_writer_opts_t* __) +{ + if ((argc - *pargi) < 1) { + mapper_altkv_usage(stderr, argv[0], argv[*pargi]); + return NULL; + } + + char* verb = argv[*pargi]; + *pargi += 1; + + ap_state_t* pstate = ap_alloc(); + + if (!ap_parse(pstate, verb, pargi, argc, argv)) { + mapper_altkv_usage(stderr, argv[0], verb); + return NULL; + } + + mapper_t* pmapper = mapper_altkv_alloc(pstate); + return pmapper; +} + +// ---------------------------------------------------------------- +static mapper_t* mapper_altkv_alloc(ap_state_t* pargp) { + mapper_t* pmapper = mlr_malloc_or_die(sizeof(mapper_t)); + + mapper_altkv_state_t* pstate = mlr_malloc_or_die(sizeof(mapper_altkv_state_t)); + pstate->pargp = pargp; + + pmapper->pvstate = pstate; + pmapper->pprocess_func = mapper_altkv_process; + pmapper->pfree_func = mapper_altkv_free; + + return pmapper; +} + +static void mapper_altkv_free(mapper_t* pmapper, context_t* _) { + mapper_altkv_state_t* pstate = pmapper->pvstate; + // Free the container + ap_free(pstate->pargp); + free(pstate); + free(pmapper); +} + +// ---------------------------------------------------------------- +static sllv_t* mapper_altkv_process(lrec_t* pinrec, context_t* pctx, void* pvstate) { + if (pinrec == NULL) { // End of input stream: 
emit null. + return sllv_single(NULL); + } + + lrec_t* poutrec = lrec_unbacked_alloc(); + int output_field_number = 1; + for (lrece_t* pe = pinrec->phead; pe != NULL; /* increment in loop */) { + + if (pe->pnext == NULL) { // Odd-numbered field count + char* key = mlr_alloc_string_from_int(output_field_number); + char* value = mlr_strdup_or_die(pe->value); + lrec_put(poutrec, key, value, FREE_ENTRY_KEY | FREE_ENTRY_VALUE); + } else { + char* key = mlr_strdup_or_die(pe->value); + char* value = mlr_strdup_or_die(pe->pnext->value); + lrec_put(poutrec, key, value, FREE_ENTRY_KEY | FREE_ENTRY_VALUE); + } + + output_field_number++; + pe = pe->pnext; + if (pe == NULL) {// Odd-numbered field count + break; + } + pe = pe->pnext; + } + + return sllv_single(poutrec); +} diff -Nru miller-5.3.0/c/mapping/mapper_clean_whitespace.c miller-5.4.0/c/mapping/mapper_clean_whitespace.c --- miller-5.3.0/c/mapping/mapper_clean_whitespace.c 1970-01-01 00:00:00.000000000 +0000 +++ miller-5.4.0/c/mapping/mapper_clean_whitespace.c 2018-10-14 20:17:52.000000000 +0000 @@ -0,0 +1,171 @@ +#include "lib/mlrutil.h" +#include "lib/mvfuncs.h" +#include "containers/sllv.h" +#include "mapping/mappers.h" +#include "cli/argparse.h" + +#define RENAME_SB_ALLOC_LENGTH 16 + +typedef struct _mapper_clean_whitespace_state_t { + ap_state_t* pargp; + int do_keys; + int do_values; +} mapper_clean_whitespace_state_t; + +static void mapper_clean_whitespace_usage(FILE* o, char* argv0, char* verb); +static mapper_t* mapper_clean_whitespace_parse_cli(int* pargi, int argc, char** argv, + cli_reader_opts_t* _, cli_writer_opts_t* __); +static mapper_t* mapper_clean_whitespace_alloc(ap_state_t* pargp, int do_keys, int do_values); +static void mapper_clean_whitespace_free(mapper_t* pmapper, context_t* _); +static sllv_t* mapper_clean_whitespace_kvprocess(lrec_t* pinrec, context_t* pctx, void* pvstate); +static sllv_t* mapper_clean_whitespace_kprocess(lrec_t* pinrec, context_t* pctx, void* pvstate); +static sllv_t* 
mapper_clean_whitespace_vprocess(lrec_t* pinrec, context_t* pctx, void* pvstate); + +// ---------------------------------------------------------------- +mapper_setup_t mapper_clean_whitespace_setup = { + .verb = "clean-whitespace", + .pusage_func = mapper_clean_whitespace_usage, + .pparse_func = mapper_clean_whitespace_parse_cli, + .ignores_input = FALSE, +}; + +// ---------------------------------------------------------------- +static void mapper_clean_whitespace_usage(FILE* o, char* argv0, char* verb) { + fprintf(o, "Usage: %s %s [options] {old1,new1,old2,new2,...}\n", argv0, verb); + fprintf(o, "For each record, for each field in the record, whitespace-cleans the keys and\n"); + fprintf(o, "values. Whitespace-cleaning entails stripping leading and trailing whitespace,\n"); + fprintf(o, "and replacing multiple whitespace with singles. For finer-grained control,\n"); + fprintf(o, "please see the DSL functions lstrip, rstrip, strip, collapse_whitespace,\n"); + fprintf(o, "and clean_whitespace.\n"); + fprintf(o, "\n"); + fprintf(o, "Options:\n"); + fprintf(o, "-k|--keys-only Do not touch values.\n"); + fprintf(o, "-v|--values-only Do not touch keys.\n"); + fprintf(o, "It is an error to specify -k as well as -v.\n"); +} + +static mapper_t* mapper_clean_whitespace_parse_cli(int* pargi, int argc, char** argv, + cli_reader_opts_t* _, cli_writer_opts_t* __) +{ + int kflag = FALSE; + int vflag = FALSE; + + char* verb = argv[(*pargi)++]; + + ap_state_t* pstate = ap_alloc(); + ap_define_true_flag(pstate, "-k", &kflag); + ap_define_true_flag(pstate, "--keys-only", &kflag); + ap_define_true_flag(pstate, "-v", &vflag); + ap_define_true_flag(pstate, "--values-only", &vflag); + + if (!ap_parse(pstate, verb, pargi, argc, argv)) { + mapper_clean_whitespace_usage(stderr, argv[0], verb); + return NULL; + } + + int do_keys = TRUE; + int do_values = TRUE; + if (kflag && vflag) { + mapper_clean_whitespace_usage(stderr, argv[0], verb); + return NULL; + } else if (kflag) { + do_values 
= FALSE; + } else if (vflag) { + do_keys = FALSE; + } + + return mapper_clean_whitespace_alloc(pstate, do_keys, do_values); +} + +// ---------------------------------------------------------------- +static mapper_t* mapper_clean_whitespace_alloc(ap_state_t* pargp, int do_keys, int do_values) { + mapper_t* pmapper = mlr_malloc_or_die(sizeof(mapper_t)); + + mapper_clean_whitespace_state_t* pstate = mlr_malloc_or_die(sizeof(mapper_clean_whitespace_state_t)); + + pstate->pargp = pargp; + if (do_keys && do_values) { + pmapper->pprocess_func = mapper_clean_whitespace_kvprocess; + } else if (do_keys) { + pmapper->pprocess_func = mapper_clean_whitespace_kprocess; + } else if (do_values) { + pmapper->pprocess_func = mapper_clean_whitespace_vprocess; + } + pmapper->pfree_func = mapper_clean_whitespace_free; + + pmapper->pvstate = (void*)pstate; + return pmapper; +} + +static void mapper_clean_whitespace_free(mapper_t* pmapper, context_t* _) { + mapper_clean_whitespace_state_t* pstate = pmapper->pvstate; + ap_free(pstate->pargp); + free(pstate); + free(pmapper); +} + +// ---------------------------------------------------------------- +static sllv_t* mapper_clean_whitespace_kvprocess(lrec_t* pinrec, context_t* pctx, void* pvstate) { + if (pinrec != NULL) { + lrec_t* poutrec = lrec_unbacked_alloc(); + for (lrece_t* pe = pinrec->phead; pe != NULL; pe = pe->pnext) { + mv_t old_key = mv_from_string_with_free(mlr_strdup_or_die(pe->key)); + mv_t old_value = mv_from_string_with_free(mlr_strdup_or_die(pe->value)); + mv_t new_key = s_s_clean_whitespace_func(&old_key); + mv_t new_value = s_s_clean_whitespace_func(&old_value); + char free_flags = 0; + if (new_key.free_flags & FREE_ENTRY_VALUE) + free_flags |= FREE_ENTRY_KEY; + if (new_value.free_flags & FREE_ENTRY_VALUE) + free_flags |= FREE_ENTRY_VALUE; + lrec_put(poutrec, new_key.u.strv, new_value.u.strv, free_flags); + } + lrec_free(pinrec); + return sllv_single(poutrec); + } + else { + return sllv_single(NULL); + } +} + +// 
---------------------------------------------------------------- +static sllv_t* mapper_clean_whitespace_kprocess(lrec_t* pinrec, context_t* pctx, void* pvstate) { + if (pinrec != NULL) { + lrec_t* poutrec = lrec_unbacked_alloc(); + for (lrece_t* pe = pinrec->phead; pe != NULL; pe = pe->pnext) { + mv_t old_key = mv_from_string_with_free(mlr_strdup_or_die(pe->key)); + mv_t value = mv_from_string_with_free(mlr_strdup_or_die(pe->value)); + mv_t new_key = s_s_clean_whitespace_func(&old_key); + char free_flags = FREE_ENTRY_VALUE; + if (new_key.free_flags & FREE_ENTRY_VALUE) + free_flags |= FREE_ENTRY_KEY; + lrec_put(poutrec, new_key.u.strv, value.u.strv, free_flags); + } + lrec_free(pinrec); + return sllv_single(poutrec); + } + else { + return sllv_single(NULL); + } +} + +// ---------------------------------------------------------------- +static sllv_t* mapper_clean_whitespace_vprocess(lrec_t* pinrec, context_t* pctx, void* pvstate) { + if (pinrec != NULL) { + lrec_t* poutrec = lrec_unbacked_alloc(); + for (lrece_t* pe = pinrec->phead; pe != NULL; pe = pe->pnext) { + mv_t key = mv_from_string_with_free(mlr_strdup_or_die(pe->key)); + mv_t old_value = mv_from_string_with_free(mlr_strdup_or_die(pe->value)); + mv_t new_value = s_s_clean_whitespace_func(&old_value); + char free_flags = FREE_ENTRY_KEY; + if (new_value.free_flags & FREE_ENTRY_VALUE) + free_flags |= FREE_ENTRY_VALUE; + lrec_put(poutrec, key.u.strv, new_value.u.strv, free_flags); + } + lrec_free(pinrec); + return sllv_single(poutrec); + } + else { + return sllv_single(NULL); + } +} diff -Nru miller-5.3.0/c/mapping/mapper_fill_down.c miller-5.4.0/c/mapping/mapper_fill_down.c --- miller-5.3.0/c/mapping/mapper_fill_down.c 1970-01-01 00:00:00.000000000 +0000 +++ miller-5.4.0/c/mapping/mapper_fill_down.c 2018-10-14 20:17:52.000000000 +0000 @@ -0,0 +1,127 @@ +#include +#include +#include +#include +#include "lib/mlrutil.h" +#include "containers/sllv.h" +#include "containers/lhmss.h" +#include "containers/mixutil.h" 
+#include "mapping/mappers.h" +#include "cli/argparse.h" + +typedef struct _mapper_fill_down_state_t { + ap_state_t* pargp; + slls_t* pfill_down_field_names; + lhmss_t* plast_non_null_values; + int only_if_absent; +} mapper_fill_down_state_t; + +static void mapper_fill_down_usage(FILE* o, char* argv0, char* verb); +static mapper_t* mapper_fill_down_parse_cli(int* pargi, int argc, char** argv, + cli_reader_opts_t* _, cli_writer_opts_t* __); +static mapper_t* mapper_fill_down_alloc(ap_state_t* pargp, slls_t* pfill_down_field_names, int only_if_absent); +static void mapper_fill_down_free(mapper_t* pmapper, context_t* _); +static sllv_t* mapper_fill_down_process(lrec_t* pinrec, context_t* pctx, void* pvstate); + +// ---------------------------------------------------------------- +mapper_setup_t mapper_fill_down_setup = { + .verb = "fill-down", + .pusage_func = mapper_fill_down_usage, + .pparse_func = mapper_fill_down_parse_cli, + .ignores_input = FALSE, +}; + +// ---------------------------------------------------------------- +static void mapper_fill_down_usage(FILE* o, char* argv0, char* verb) { + fprintf(o, "Usage: %s %s [options]\n", argv0, verb); + fprintf(o, "-f {a,b,c} Field names for fill-down\n"); + fprintf(o, "-a|--only-if-absent Field names for fill-down\n"); + fprintf(o, "If a given record has a missing value for a given field, fill that from\n"); + fprintf(o, "the corresponding value from a previous record, if any.\n"); + fprintf(o, "By default, a 'missing' field either is absent, or has the empty-string value.\n"); + fprintf(o, "With -a, a field is 'missing' only if it is absent.\n"); +} + +static mapper_t* mapper_fill_down_parse_cli(int* pargi, int argc, char** argv, + cli_reader_opts_t* _, cli_writer_opts_t* __) +{ + slls_t* pfill_down_field_names = NULL; + int only_if_absent = FALSE; + + char* verb = argv[(*pargi)++]; + + ap_state_t* pstate = ap_alloc(); + ap_define_string_list_flag(pstate, "-f", &pfill_down_field_names); + ap_define_true_flag(pstate, 
"-a", &only_if_absent); + ap_define_true_flag(pstate, "--only-if-absent", &only_if_absent); + + if (!ap_parse(pstate, verb, pargi, argc, argv)) { + mapper_fill_down_usage(stderr, argv[0], verb); + return NULL; + } + if (pfill_down_field_names == NULL) { + mapper_fill_down_usage(stderr, argv[0], verb); + return NULL; + } + + return mapper_fill_down_alloc(pstate, pfill_down_field_names, only_if_absent); +} + +// ---------------------------------------------------------------- +static mapper_t* mapper_fill_down_alloc(ap_state_t* pargp, slls_t* pfill_down_field_names, int only_if_absent) { + mapper_t* pmapper = mlr_malloc_or_die(sizeof(mapper_t)); + + mapper_fill_down_state_t* pstate = mlr_malloc_or_die(sizeof(mapper_fill_down_state_t)); + + pstate->pargp = pargp; + pstate->pfill_down_field_names = pfill_down_field_names; + pstate->plast_non_null_values = lhmss_alloc(); + pstate->only_if_absent = only_if_absent; + + pmapper->pvstate = pstate; + pmapper->pprocess_func = mapper_fill_down_process; + pmapper->pfree_func = mapper_fill_down_free; + + return pmapper; +} + +static void mapper_fill_down_free(mapper_t* pmapper, context_t* _) { + mapper_fill_down_state_t* pstate = pmapper->pvstate; + slls_free(pstate->pfill_down_field_names); + lhmss_free(pstate->plast_non_null_values); + ap_free(pstate->pargp); + free(pstate); + free(pmapper); +} + +// ---------------------------------------------------------------- +static sllv_t* mapper_fill_down_process(lrec_t* pinrec, context_t* pctx, void* pvstate) { + mapper_fill_down_state_t* pstate = pvstate; + if (pinrec == NULL) { // end of record stream + return sllv_single(NULL); + } + + for (sllse_t* pe = pstate->pfill_down_field_names->phead; pe != NULL; pe = pe->pnext) { + char* pkey = pe->value; + char* pvalue = lrec_get(pinrec, pkey); + int present = (pstate->only_if_absent) + ? 
(pvalue != NULL) + : (pvalue != NULL && *pvalue); + if (present) { + // Remember it for a subsequent record lacking this field + lhmss_put(pstate->plast_non_null_values, + mlr_strdup_or_die(pkey), + mlr_strdup_or_die(pvalue), + FREE_ENTRY_KEY | FREE_ENTRY_VALUE); + } else { + // Reuse previously seen value, if any + char* prev = lhmss_get(pstate->plast_non_null_values, pkey); + if (prev != NULL) { + lrec_put(pinrec, mlr_strdup_or_die(pkey), mlr_strdup_or_die(prev), + FREE_ENTRY_KEY | FREE_ENTRY_VALUE); + } + } + } + + return sllv_single(pinrec); +} diff -Nru miller-5.3.0/c/mapping/mapper_put_or_filter.c miller-5.4.0/c/mapping/mapper_put_or_filter.c --- miller-5.3.0/c/mapping/mapper_put_or_filter.c 2018-01-06 22:49:24.000000000 +0000 +++ miller-5.4.0/c/mapping/mapper_put_or_filter.c 2018-10-14 20:17:52.000000000 +0000 @@ -42,6 +42,7 @@ } expression_info_t; // ---------------------------------------------------------------- +static void mapper_put_or_filter_usage(FILE* o, char* argv0, char* verb); static void mapper_put_usage(FILE* o, char* argv0, char* verb); static void mapper_filter_usage(FILE* o, char* argv0, char* verb); static void shared_usage(FILE* o, char* argv0, char* verb); @@ -90,6 +91,19 @@ }; // ---------------------------------------------------------------- +static void mapper_put_or_filter_usage(FILE* o, char* argv0, char* verb) { + if (streq(verb, "filter")) { + mapper_filter_usage(o, argv0, verb); + } else if (streq(verb, "put")) { + mapper_put_usage(o, argv0, verb); + } else { + fprintf(stderr, "%s: internal coding error detected at file %s line %d.\n", + MLR_GLOBALS.bargv0, __FILE__, __LINE__); + exit(1); + } +} + +// ---------------------------------------------------------------- static void mapper_put_usage(FILE* o, char* argv0, char* verb) { fprintf(o, "Usage: %s %s [options] {expression}\n", argv0, verb); fprintf(o, "Adds/updates specified field(s). 
Expressions are semicolon-separated and must\n"); @@ -261,7 +275,7 @@ int argi = *pargi; if ((argc - argi) < 1) { - mapper_put_usage(stderr, argv[0], argv[argi]); + mapper_put_or_filter_usage(stderr, argv[0], argv[argi]); return NULL; } char* verb = argv[argi++]; @@ -280,7 +294,7 @@ } else if (streq(argv[argi], "-f")) { if ((argc - argi) < 2) { - mapper_put_usage(stderr, argv[0], verb); + mapper_put_or_filter_usage(stderr, argv[0], verb); return NULL; } expression_info_t* pexpression_info = mlr_malloc_or_die(sizeof(expression_info_t)); @@ -291,7 +305,7 @@ } else if (streq(argv[argi], "-e")) { if ((argc - argi) < 2) { - mapper_put_usage(stderr, argv[0], verb); + mapper_put_or_filter_usage(stderr, argv[0], verb); return NULL; } expression_info_t* pexpression_info = mlr_malloc_or_die(sizeof(expression_info_t)); @@ -328,7 +342,7 @@ argi += 1; } else if (streq(argv[argi], "--oflatsep")) { if ((argc - argi) < 2) { - mapper_put_usage(stderr, argv[0], verb); + mapper_put_or_filter_usage(stderr, argv[0], verb); return NULL; } oosvar_flatten_separator = argv[argi+1]; @@ -338,14 +352,14 @@ argi += 1; } else { - mapper_put_usage(stderr, argv[0], verb); + mapper_put_or_filter_usage(stderr, argv[0], verb); return NULL; } } if (expression_infos->length == 0) { if ((argc - argi) < 1) { - mapper_put_usage(stderr, argv[0], verb); + mapper_put_or_filter_usage(stderr, argv[0], verb); return NULL; } mlr_dsl_expression = mlr_strdup_or_die(argv[argi++]); diff -Nru miller-5.3.0/c/mapping/mapper_sec2gmt.c miller-5.4.0/c/mapping/mapper_sec2gmt.c --- miller-5.3.0/c/mapping/mapper_sec2gmt.c 2018-01-06 22:49:24.000000000 +0000 +++ miller-5.4.0/c/mapping/mapper_sec2gmt.c 2018-10-14 20:17:52.000000000 +0000 @@ -133,7 +133,7 @@ } else { mv_t mval = mv_scan_number_nullable(sval); if (!mv_is_error(&mval)) { - mv_t stamp = time_string_from_seconds(&mval, pstate->format_string); + mv_t stamp = time_string_from_seconds(&mval, pstate->format_string, TIMEZONE_HANDLING_GMT); lrec_put(pinrec, name, 
stamp.u.strv, FREE_ENTRY_VALUE); } } diff -Nru miller-5.3.0/c/mapping/mapper_sec2gmtdate.c miller-5.4.0/c/mapping/mapper_sec2gmtdate.c --- miller-5.3.0/c/mapping/mapper_sec2gmtdate.c 2018-01-06 22:49:24.000000000 +0000 +++ miller-5.4.0/c/mapping/mapper_sec2gmtdate.c 2018-10-14 20:17:52.000000000 +0000 @@ -91,7 +91,8 @@ } else { mv_t mval = mv_scan_number_nullable(sval); if (!mv_is_error(&mval)) { - mv_t stamp = time_string_from_seconds(&mval, ISO8601_DATE_FORMAT); + mv_t stamp = time_string_from_seconds(&mval, ISO8601_DATE_FORMAT, + TIMEZONE_HANDLING_GMT); lrec_put(pinrec, name, stamp.u.strv, FREE_ENTRY_VALUE); } } diff -Nru miller-5.3.0/c/mapping/mappers.h miller-5.4.0/c/mapping/mappers.h --- miller-5.3.0/c/mapping/mappers.h 2018-01-06 22:49:24.000000000 +0000 +++ miller-5.4.0/c/mapping/mappers.h 2018-10-14 20:17:52.000000000 +0000 @@ -4,14 +4,17 @@ #include "containers/slls.h" #include "mapping/mapper.h" +extern mapper_setup_t mapper_altkv_setup; extern mapper_setup_t mapper_bar_setup; extern mapper_setup_t mapper_bootstrap_setup; extern mapper_setup_t mapper_cat_setup; extern mapper_setup_t mapper_check_setup; +extern mapper_setup_t mapper_clean_whitespace_setup; extern mapper_setup_t mapper_count_distinct_setup; extern mapper_setup_t mapper_count_similar_setup; extern mapper_setup_t mapper_cut_setup; extern mapper_setup_t mapper_decimate_setup; +extern mapper_setup_t mapper_fill_down_setup; extern mapper_setup_t mapper_filter_setup; extern mapper_setup_t mapper_fraction_setup; extern mapper_setup_t mapper_grep_setup; diff -Nru miller-5.3.0/c/mapping/mapper_uniq.c miller-5.4.0/c/mapping/mapper_uniq.c --- miller-5.3.0/c/mapping/mapper_uniq.c 2018-01-06 22:49:24.000000000 +0000 +++ miller-5.4.0/c/mapping/mapper_uniq.c 2018-10-14 20:17:52.000000000 +0000 @@ -4,6 +4,7 @@ #include #include "lib/mlrutil.h" #include "containers/sllv.h" +#include "containers/lhmsll.h" #include "containers/lhmslv.h" #include "containers/lhmsv.h" #include "containers/lhmsll.h" @@ -15,28 
+16,88 @@ typedef struct _mapper_uniq_state_t { ap_state_t* pargp; - slls_t* pgroup_by_field_names; - int show_counts; - int show_num_distinct_only; + slls_t* pgroup_by_field_names; + int show_counts; + int show_num_distinct_only; + lhmsll_t* puniqified_record_counts; // lrec_sprintf -> counts + lhmsv_t* puniqified_records; // lrec_sprintf -> records lhmslv_t* pcounts_by_group; - lhmsv_t* pcounts_unlashed; // string field name -> string field value -> long long count + lhmsv_t* pcounts_unlashed; // string field name -> string field value -> long long count char* output_field_name; } mapper_uniq_state_t; -static void mapper_uniq_usage(FILE* o, char* argv0, char* verb); -static mapper_t* mapper_uniq_parse_cli(int* pargi, int argc, char** argv, - cli_reader_opts_t* _, cli_writer_opts_t* __); -static void mapper_count_distinct_usage(FILE* o, char* argv0, char* verb); -static mapper_t* mapper_count_distinct_parse_cli(int* pargi, int argc, char** argv, - cli_reader_opts_t* _, cli_writer_opts_t* __); -static mapper_t* mapper_uniq_alloc(ap_state_t* pargp, slls_t* pgroup_by_field_names, int do_lashed, - int show_counts, int show_num_distinct_only, char* output_field_name); -static void mapper_uniq_free(mapper_t* pmapper, context_t* _); - -static sllv_t* mapper_uniq_process_unlashed(lrec_t* pinrec, context_t* pctx, void* pvstate); -static sllv_t* mapper_uniq_process_num_distinct_only(lrec_t* pinrec, context_t* pctx, void* pvstate); -static sllv_t* mapper_uniq_process_with_counts(lrec_t* pinrec, context_t* pctx, void* pvstate); -static sllv_t* mapper_uniq_process_no_counts(lrec_t* pinrec, context_t* pctx, void* pvstate); +// ---------------------------------------------------------------- +static void mapper_uniq_usage( + FILE* o, + char* argv0, + char* verb); + +static mapper_t* mapper_uniq_parse_cli( + int* pargi, + int argc, + char** argv, + cli_reader_opts_t* _, + cli_writer_opts_t* __); + +static void mapper_count_distinct_usage( + FILE* o, + char* argv0, + char* verb); 
+ +static mapper_t* mapper_count_distinct_parse_cli( + int* pargi, + int argc, + char** argv, + cli_reader_opts_t* _, + cli_writer_opts_t* __); + +static mapper_t* mapper_uniq_alloc( + ap_state_t* pargp, + slls_t* pgroup_by_field_names, + int do_lashed, + int show_counts, + int show_num_distinct_only, + char* output_field_name, + int uniqify_entire_records); + +static void mapper_uniq_free( + mapper_t* pmapper, + context_t* _); + +static sllv_t* mapper_uniq_process_uniqify_entire_records( + lrec_t* pinrec, + context_t* pctx, + void* pvstate); + +static sllv_t* mapper_uniq_process_uniqify_entire_records_show_counts( + lrec_t* pinrec, + context_t* pctx, + void* pvstate); + +static sllv_t* mapper_uniq_process_uniqify_entire_records_show_num_distinct_only( + lrec_t* pinrec, + context_t* pctx, + void* pvstate); + +static sllv_t* mapper_uniq_process_unlashed( + lrec_t* pinrec, + context_t* pctx, + void* pvstate); + +static sllv_t* mapper_uniq_process_num_distinct_only( + lrec_t* pinrec, + context_t* pctx, + void* pvstate); + +static sllv_t* mapper_uniq_process_with_counts( + lrec_t* pinrec, + context_t* pctx, + void* pvstate); + +static sllv_t* mapper_uniq_process_no_counts( + lrec_t* pinrec, + context_t* pctx, + void* pvstate); // ---------------------------------------------------------------- mapper_setup_t mapper_count_distinct_setup = { @@ -54,8 +115,16 @@ }; // ---------------------------------------------------------------- -static void mapper_count_distinct_usage(FILE* o, char* argv0, char* verb) { +static void mapper_count_distinct_usage( + FILE* o, + char* argv0, + char* verb) +{ fprintf(o, "Usage: %s %s [options]\n", argv0, verb); + fprintf(o, "Prints number of records having distinct values for specified field names.\n"); + fprintf(o, "Same as uniq -c.\n"); + fprintf(o, "\n"); + fprintf(o, "Options:\n"); fprintf(o, "-f {a,b,c} Field names for distinct count.\n"); fprintf(o, "-n Show only the number of distinct values. 
Not compatible with -u.\n"); fprintf(o, "-o {name} Field name for output count. Default \"%s\".\n", DEFAULT_OUTPUT_FIELD_NAME); @@ -65,13 +134,15 @@ fprintf(o, " and b field values. With -f a,b and with -u, computes counts\n"); fprintf(o, " for distinct a field values and counts for distinct b field\n"); fprintf(o, " values separately.\n"); - fprintf(o, "Prints number of records having distinct values for specified field names.\n"); - fprintf(o, "Same as uniq -c.\n"); } // ---------------------------------------------------------------- -static mapper_t* mapper_count_distinct_parse_cli(int* pargi, int argc, char** argv, - cli_reader_opts_t* _, cli_writer_opts_t* __) +static mapper_t* mapper_count_distinct_parse_cli( + int* pargi, + int argc, + char** argv, + cli_reader_opts_t* _, + cli_writer_opts_t* __) { slls_t* pfield_names = NULL; int show_num_distinct_only = FALSE; @@ -101,28 +172,43 @@ } return mapper_uniq_alloc(pstate, pfield_names, do_lashed, TRUE, show_num_distinct_only, - output_field_name); + output_field_name, FALSE); } // ---------------------------------------------------------------- -static void mapper_uniq_usage(FILE* o, char* argv0, char* verb) { +static void mapper_uniq_usage( + FILE* o, + char* argv0, + char* verb) +{ fprintf(o, "Usage: %s %s [options]\n", argv0, verb); + fprintf(o, "Prints distinct values for specified field names. With -c, same as\n"); + fprintf(o, "count-distinct. For uniq, -f is a synonym for -g.\n"); + fprintf(o, "\n"); + fprintf(o, "Options:\n"); fprintf(o, "-g {d,e,f} Group-by-field names for uniq counts.\n"); fprintf(o, "-c Show repeat counts in addition to unique values.\n"); fprintf(o, "-n Show only the number of distinct values.\n"); fprintf(o, "-o {name} Field name for output count. Default \"%s\".\n", DEFAULT_OUTPUT_FIELD_NAME); - fprintf(o, "Prints distinct values for specified field names. With -c, same as\n"); - fprintf(o, "count-distinct. 
For uniq, -f is a synonym for -g.\n"); + fprintf(o, "-a Output each unique record only once. Incompatible with -g.\n"); + fprintf(o, " With -c, produces unique records, with repeat counts for each.\n"); + fprintf(o, " With -n, produces only one record which is the unique-record count.\n"); + fprintf(o, " With neither -c nor -n, produces unique records.\n"); } -static mapper_t* mapper_uniq_parse_cli(int* pargi, int argc, char** argv, - cli_reader_opts_t* _, cli_writer_opts_t* __) +static mapper_t* mapper_uniq_parse_cli( + int* pargi, + int argc, + char** argv, + cli_reader_opts_t* _, + cli_writer_opts_t* __) { slls_t* pgroup_by_field_names = NULL; int show_counts = FALSE; int show_num_distinct_only = FALSE; char* output_field_name = DEFAULT_OUTPUT_FIELD_NAME; int do_lashed = TRUE; + int uniqify_entire_records = FALSE; char* verb = argv[(*pargi)++]; @@ -132,39 +218,66 @@ ap_define_true_flag(pstate, "-c", &show_counts); ap_define_true_flag(pstate, "-n", &show_num_distinct_only); ap_define_string_flag(pstate, "-o", &output_field_name); + ap_define_true_flag(pstate, "-a", &uniqify_entire_records); if (!ap_parse(pstate, verb, pargi, argc, argv)) { mapper_uniq_usage(stderr, argv[0], verb); return NULL; } - if (pgroup_by_field_names == NULL) { - mapper_uniq_usage(stderr, argv[0], verb); - return NULL; + if (uniqify_entire_records) { + if (pgroup_by_field_names != NULL) { + mapper_uniq_usage(stderr, argv[0], verb); + return NULL; + } + if (show_counts && show_num_distinct_only) { + mapper_uniq_usage(stderr, argv[0], verb); + return NULL; + } + } else { + if (pgroup_by_field_names == NULL) { + mapper_uniq_usage(stderr, argv[0], verb); + return NULL; + } } return mapper_uniq_alloc(pstate, pgroup_by_field_names, do_lashed, show_counts, show_num_distinct_only, - output_field_name); + output_field_name, uniqify_entire_records); } // ---------------------------------------------------------------- -static mapper_t* mapper_uniq_alloc(ap_state_t* pargp, slls_t* 
pgroup_by_field_names, int do_lashed, - int show_counts, int show_num_distinct_only, char* output_field_name) +static mapper_t* mapper_uniq_alloc( + ap_state_t* pargp, + slls_t* pgroup_by_field_names, + int do_lashed, + int show_counts, + int show_num_distinct_only, + char* output_field_name, + int uniqify_entire_records) { mapper_t* pmapper = mlr_malloc_or_die(sizeof(mapper_t)); mapper_uniq_state_t* pstate = mlr_malloc_or_die(sizeof(mapper_uniq_state_t)); - pstate->pargp = pargp; - pstate->pgroup_by_field_names = pgroup_by_field_names; - pstate->show_counts = show_counts; - pstate->show_num_distinct_only = show_num_distinct_only; - pstate->pcounts_by_group = lhmslv_alloc(); - pstate->pcounts_unlashed = lhmsv_alloc(); - pstate->output_field_name = output_field_name; + pstate->pargp = pargp; + pstate->pgroup_by_field_names = pgroup_by_field_names; + pstate->show_counts = show_counts; + pstate->show_num_distinct_only = show_num_distinct_only; + pstate->puniqified_record_counts = lhmsll_alloc(); + pstate->puniqified_records = lhmsv_alloc(); + pstate->pcounts_by_group = lhmslv_alloc(); + pstate->pcounts_unlashed = lhmsv_alloc(); + pstate->output_field_name = output_field_name; pmapper->pvstate = pstate; - if (!do_lashed) + if (uniqify_entire_records) { + if (show_counts) + pmapper->pprocess_func = mapper_uniq_process_uniqify_entire_records_show_counts; + else if (show_num_distinct_only) + pmapper->pprocess_func = mapper_uniq_process_uniqify_entire_records_show_num_distinct_only; + else + pmapper->pprocess_func = mapper_uniq_process_uniqify_entire_records; + } else if (!do_lashed) pmapper->pprocess_func = mapper_uniq_process_unlashed; else if (show_num_distinct_only) pmapper->pprocess_func = mapper_uniq_process_num_distinct_only; @@ -177,30 +290,142 @@ return pmapper; } -static void mapper_uniq_free(mapper_t* pmapper, context_t* _) { +static void mapper_uniq_free( + mapper_t* pmapper, + context_t* _) +{ mapper_uniq_state_t* pstate = pmapper->pvstate; + 
slls_free(pstate->pgroup_by_field_names); + + lhmsll_free(pstate->puniqified_record_counts); + pstate->puniqified_record_counts = NULL; + + // lhmslv_free will free the keys: we only need to free the void-star values. + for (lhmsve_t* pa = pstate->puniqified_records->phead; pa != NULL; pa = pa->pnext) { + lrec_t* prec = pa->pvvalue; + lrec_free(prec); + } + lhmsv_free(pstate->puniqified_records); + pstate->puniqified_records = NULL; + // lhmslv_free will free the keys: we only need to free the void-star values. for (lhmslve_t* pa = pstate->pcounts_by_group->phead; pa != NULL; pa = pa->pnext) { unsigned long long* pcount = pa->pvvalue; free(pcount); } lhmslv_free(pstate->pcounts_by_group); + pstate->pcounts_by_group = NULL; + for (lhmsve_t* pb = pstate->pcounts_unlashed->phead; pb != NULL; pb = pb->pnext) { lhmsll_t* pmap = pb->pvvalue; lhmsll_free(pmap); } lhmsv_free(pstate->pcounts_unlashed); + pstate->pcounts_unlashed = NULL; + pstate->pgroup_by_field_names = NULL; pstate->pcounts_by_group = NULL; - pstate->pcounts_unlashed = NULL; + ap_free(pstate->pargp); free(pstate); free(pmapper); } // ---------------------------------------------------------------- -static sllv_t* mapper_uniq_process_unlashed(lrec_t* pinrec, context_t* pctx, void* pvstate) { +// Print each unique record only once (on first occurrence). 
+static sllv_t* mapper_uniq_process_uniqify_entire_records( + lrec_t* pinrec, + context_t* pctx, + void* pvstate) +{ + mapper_uniq_state_t* pstate = pvstate; + if (pinrec != NULL) { + char* lrec_as_string = lrec_sprint(pinrec, "\xfc", "\xfd", "\xfe"); + if (lhmsll_has_key(pstate->puniqified_record_counts, lrec_as_string)) { + // have seen + free(lrec_as_string); + lrec_free(pinrec); + return sllv_single(NULL); + } else { + lhmsll_put(pstate->puniqified_record_counts, lrec_as_string, 1LL, FREE_ENTRY_KEY); + return sllv_single(pinrec); + } + } else { // end of record stream + return sllv_single(NULL); + } +} + +// Print each unique record only once, with uniqueness counts. This means +// non-streaming, with output at end of stream. +static sllv_t* mapper_uniq_process_uniqify_entire_records_show_counts( + lrec_t* pinrec, + context_t* pctx, + void* pvstate) +{ + mapper_uniq_state_t* pstate = pvstate; + if (pinrec != NULL) { + char* lrec_as_string = lrec_sprint(pinrec, "\xfc", "\xfd", "\xfe"); + if (lhmsll_test_and_increment(pstate->puniqified_record_counts, lrec_as_string)) { + // have seen; count was just incremented + free(lrec_as_string); + lrec_free(pinrec); + return sllv_single(NULL); + } else { + lhmsll_put(pstate->puniqified_record_counts, lrec_as_string, 1LL, FREE_ENTRY_KEY); + lhmsv_put(pstate->puniqified_records, mlr_strdup_or_die(lrec_as_string), pinrec, FREE_ENTRY_KEY); + return sllv_single(NULL); + } + } else { // end of record stream + sllv_t* poutrecs = sllv_alloc(); + + for (lhmsve_t* pe = pstate->puniqified_records->phead; pe != NULL; pe = pe->pnext) { + lrec_t* prec = pe->pvvalue; + long long count = lhmsll_get(pstate->puniqified_record_counts, pe->key); + lrec_prepend(prec, pstate->output_field_name, mlr_alloc_string_from_ll(count), FREE_ENTRY_VALUE); + sllv_append(poutrecs, prec); + pe->pvvalue = NULL; // transfer ownership to poutrecs + } + + return poutrecs; + } +} + +// Print count of unique records. 
This means non-streaming, with output at end +// of stream. +static sllv_t* mapper_uniq_process_uniqify_entire_records_show_num_distinct_only( + lrec_t* pinrec, + context_t* pctx, + void* pvstate) +{ + mapper_uniq_state_t* pstate = pvstate; + if (pinrec != NULL) { + char* lrec_as_string = lrec_sprint(pinrec, "\xfc", "\xfd", "\xfe"); + if (lhmsll_has_key(pstate->puniqified_record_counts, lrec_as_string)) { + free(lrec_as_string); + lrec_free(pinrec); + return sllv_single(NULL); + } else { + lhmsll_put(pstate->puniqified_record_counts, lrec_as_string, 1LL, FREE_ENTRY_KEY); + lrec_free(pinrec); + return sllv_single(NULL); + } + } else { // end of record stream + sllv_t* poutrecs = sllv_alloc(); + lrec_t* poutrec = lrec_unbacked_alloc(); + long long count = pstate->puniqified_record_counts->num_occupied; + lrec_put(poutrec, pstate->output_field_name, mlr_alloc_string_from_ll(count), FREE_ENTRY_VALUE); + sllv_append(poutrecs, poutrec); + return poutrecs; + } +} + +// ---------------------------------------------------------------- +static sllv_t* mapper_uniq_process_unlashed( + lrec_t* pinrec, + context_t* pctx, + void* pvstate) +{ mapper_uniq_state_t* pstate = pvstate; if (pinrec != NULL) { for (sllse_t* pe = pstate->pgroup_by_field_names->phead; pe != NULL; pe = pe->pnext) { @@ -239,7 +464,11 @@ } } -static sllv_t* mapper_uniq_process_num_distinct_only(lrec_t* pinrec, context_t* pctx, void* pvstate) { +static sllv_t* mapper_uniq_process_num_distinct_only( + lrec_t* pinrec, + context_t* pctx, + void* pvstate) +{ mapper_uniq_state_t* pstate = pvstate; if (pinrec != NULL) { slls_t* pgroup_by_field_values = mlr_reference_selected_values_from_record(pinrec, @@ -271,7 +500,11 @@ } } -static sllv_t* mapper_uniq_process_with_counts(lrec_t* pinrec, context_t* pctx, void* pvstate) { +static sllv_t* mapper_uniq_process_with_counts( + lrec_t* pinrec, + context_t* pctx, + void* pvstate) +{ mapper_uniq_state_t* pstate = pvstate; if (pinrec != NULL) { slls_t* pgroup_by_field_values 
= mlr_reference_selected_values_from_record(pinrec, @@ -315,7 +548,11 @@ } } -static sllv_t* mapper_uniq_process_no_counts(lrec_t* pinrec, context_t* pctx, void* pvstate) { +static sllv_t* mapper_uniq_process_no_counts( + lrec_t* pinrec, + context_t* pctx, + void* pvstate) +{ mapper_uniq_state_t* pstate = pvstate; if (pinrec == NULL) { return sllv_single(NULL); diff -Nru miller-5.3.0/c/mlrvers.h miller-5.4.0/c/mlrvers.h --- miller-5.3.0/c/mlrvers.h 2018-01-06 22:49:24.000000000 +0000 +++ miller-5.4.0/c/mlrvers.h 2018-10-14 20:17:52.000000000 +0000 @@ -1,5 +1,5 @@ #ifndef MLRVERS_H #define MLRVERS_H // Manually increment on updates to https://github.com/johnkerl/miller/releases -#define MLR_VERSION "v5.3.0" +#define MLR_VERSION "v5.4.0" #endif // MLRVERS_H diff -Nru miller-5.3.0/c/oo miller-5.4.0/c/oo --- miller-5.3.0/c/oo 2018-01-06 22:49:24.000000000 +0000 +++ miller-5.4.0/c/oo 2018-10-14 20:17:52.000000000 +0000 @@ -51,39 +51,3 @@ vee= # ---------------------------------------------------------------- -# It's annoying trying to check in text files (especially CSV) with CRLF -# to Git, given that it likes to 'fix' line endings for multi-platform use. -# It's easy to simply create CRLF on the fly. 
-run_mlr_for_auxents termcvt --lf2crlf < $indir/comments/comments1.json > $outdir/comments1-crlf.json - -echo "AAA" -ls -l $indir/comments/comments1.json -ls -l $outdir/comments1-crlf.json -hex $outdir/comments1-crlf.json - -mention input comments1-crlf.json -run_cat $outdir/comments1-crlf.json - -echo "BBB" -ls -l $outdir/comments1-crlf.json -hex $outdir/comments1-crlf.json - -mention skip comments1-crlf.json -run_mlr --skip-comments --ijson --odkvp cat < $outdir/comments1-crlf.json -echo "CCC" -ls -l $outdir/comments1-crlf.json -hex $outdir/comments1-crlf.json -run_mlr --skip-comments --ijson --odkvp cat $outdir/comments1-crlf.json -echo "DDD" -ls -l $outdir/comments1-crlf.json -hex $outdir/comments1-crlf.json - -mention pass comments1-crlf.json -run_mlr --pass-comments --ijson --odkvp cat < $outdir/comments1-crlf.json -echo "EEE" -ls -l $outdir/comments1-crlf.json -hex $outdir/comments1-crlf.json -run_mlr --pass-comments --ijson --odkvp cat $outdir/comments1-crlf.json -echo "FFF" -ls -l $outdir/comments1-crlf.json -hex $outdir/comments1-crlf.json diff -Nru miller-5.3.0/c/regdiff miller-5.4.0/c/regdiff --- miller-5.3.0/c/regdiff 2018-01-06 22:49:24.000000000 +0000 +++ miller-5.4.0/c/regdiff 2018-10-14 20:17:52.000000000 +0000 @@ -1,2 +1,2 @@ #!/bin/bash -diff -I '^mlr ' "$@" reg_test/expected/out output-regtest/out +diff -a -I '^mlr ' -I '^cat ' "$@" reg_test/expected/out output-regtest/out diff -Nru miller-5.3.0/c/reg_test/expected/out miller-5.4.0/c/reg_test/expected/out --- miller-5.3.0/c/reg_test/expected/out 2018-01-06 22:49:24.000000000 +0000 +++ miller-5.4.0/c/reg_test/expected/out 2018-10-14 20:17:52.000000000 +0000 @@ -468,6 +468,12 @@ hat wye 9 [0.0314419]#.........[0.75868] [0.134189]*******...[0.976181] pan wye 10 [0.0314419]******....[0.75868] [0.134189]*********.[0.976181] +mlr altkv +a=b,c=d,e=f + +mlr altkv +a=b,c=d,e=f,4=g + ================================================================ TRIVIAL RETAINERS @@ -979,6 +985,39 @@ a=zee,b=wye,foo=1 
a=pan,b=wye,foo=1 +mlr uniq -a ./reg_test/input/repeats.dkvp +color=red,shape=square,flag=0 +color=purple,shape=triangle,flag=0 +color=yellow,shape=circle,flag=1 +color=red,shape=circle,flag=1 +color=purple,shape=square,flag=0 +color=red,shape=square,flag=1 +color=yellow,shape=triangle,flag=1 + +mlr uniq -a -c ./reg_test/input/repeats.dkvp +count=17,color=red,shape=square,flag=0 +count=11,color=purple,shape=triangle,flag=0 +count=11,color=yellow,shape=circle,flag=1 +count=6,color=red,shape=circle,flag=1 +count=7,color=purple,shape=square,flag=0 +count=3,color=red,shape=square,flag=1 +count=2,color=yellow,shape=triangle,flag=1 + +mlr uniq -a -c -o foo ./reg_test/input/repeats.dkvp +foo=17,color=red,shape=square,flag=0 +foo=11,color=purple,shape=triangle,flag=0 +foo=11,color=yellow,shape=circle,flag=1 +foo=6,color=red,shape=circle,flag=1 +foo=7,color=purple,shape=square,flag=0 +foo=3,color=red,shape=square,flag=1 +foo=2,color=yellow,shape=triangle,flag=1 + +mlr uniq -a -n ./reg_test/input/repeats.dkvp +count=7 + +mlr uniq -a -n -o bar ./reg_test/input/repeats.dkvp +bar=7 + mlr count-distinct -f a -o foo ./reg_test/input/small ./reg_test/input/abixy a=pan,foo=4 a=eks,foo=6 @@ -1053,6 +1092,172 @@ ================================================================ +WHITESPACE-REDUCTION + +mlr --icsv --ojson cat ./reg_test/input/clean-whitespace.csv +{ "n": 1, "a": "xy", "b ": 2, " c": 3 } +{ "n": 2, "a": "xy ", "b ": 2, " c": 3 } +{ "n": 3, "a": "xy ", "b ": 2, " c": 3 } +{ "n": 4, "a": "xy ", "b ": 2, " c": 3 } +{ "n": 5, "a": "xy", "b ": 2, " c": 3 } +{ "n": 6, "a": " xy", "b ": 2, " c": 3 } +{ "n": 7, "a": " xy", "b ": 2, " c": 3 } +{ "n": 8, "a": " xy", "b ": 2, " c": 3 } +{ "n": 9, "a": "xy", "b ": 2, " c": 3 } +{ "n": 10, "a": " xy ", "b ": 2, " c": 3 } +{ "n": 11, "a": " xy ", "b ": 2, " c": 3 } +{ "n": 12, "a": " xy ", "b ": 2, " c": 3 } +{ "n": 13, "a": "", "b ": 2, " c": 3 } +{ "n": 14, "a": " ", "b ": 2, " c": 3 } +{ "n": 15, "a": " ", "b ": 2, " c": 3 } +{ 
"n": 16, "a": " ", "b ": 2, " c": 3 } + +mlr --icsv --ojson put $a = lstrip($a) ./reg_test/input/clean-whitespace.csv +{ "n": 1, "a": "xy", "b ": 2, " c": 3 } +{ "n": 2, "a": "xy ", "b ": 2, " c": 3 } +{ "n": 3, "a": "xy ", "b ": 2, " c": 3 } +{ "n": 4, "a": "xy ", "b ": 2, " c": 3 } +{ "n": 5, "a": "xy", "b ": 2, " c": 3 } +{ "n": 6, "a": "xy", "b ": 2, " c": 3 } +{ "n": 7, "a": "xy", "b ": 2, " c": 3 } +{ "n": 8, "a": "xy", "b ": 2, " c": 3 } +{ "n": 9, "a": "xy", "b ": 2, " c": 3 } +{ "n": 10, "a": "xy ", "b ": 2, " c": 3 } +{ "n": 11, "a": "xy ", "b ": 2, " c": 3 } +{ "n": 12, "a": "xy ", "b ": 2, " c": 3 } +{ "n": 13, "a": "", "b ": 2, " c": 3 } +{ "n": 14, "a": "", "b ": 2, " c": 3 } +{ "n": 15, "a": "", "b ": 2, " c": 3 } +{ "n": 16, "a": "", "b ": 2, " c": 3 } + +mlr --icsv --ojson put $a = rstrip($a) ./reg_test/input/clean-whitespace.csv +{ "n": 1, "a": "xy", "b ": 2, " c": 3 } +{ "n": 2, "a": "xy", "b ": 2, " c": 3 } +{ "n": 3, "a": "xy", "b ": 2, " c": 3 } +{ "n": 4, "a": "xy", "b ": 2, " c": 3 } +{ "n": 5, "a": "xy", "b ": 2, " c": 3 } +{ "n": 6, "a": " xy", "b ": 2, " c": 3 } +{ "n": 7, "a": " xy", "b ": 2, " c": 3 } +{ "n": 8, "a": " xy", "b ": 2, " c": 3 } +{ "n": 9, "a": "xy", "b ": 2, " c": 3 } +{ "n": 10, "a": " xy", "b ": 2, " c": 3 } +{ "n": 11, "a": " xy", "b ": 2, " c": 3 } +{ "n": 12, "a": " xy", "b ": 2, " c": 3 } +{ "n": 13, "a": "", "b ": 2, " c": 3 } +{ "n": 14, "a": "", "b ": 2, " c": 3 } +{ "n": 15, "a": "", "b ": 2, " c": 3 } +{ "n": 16, "a": "", "b ": 2, " c": 3 } + +mlr --icsv --ojson put $a = strip($a) ./reg_test/input/clean-whitespace.csv +{ "n": 1, "a": "xy", "b ": 2, " c": 3 } +{ "n": 2, "a": "xy", "b ": 2, " c": 3 } +{ "n": 3, "a": "xy", "b ": 2, " c": 3 } +{ "n": 4, "a": "xy", "b ": 2, " c": 3 } +{ "n": 5, "a": "xy", "b ": 2, " c": 3 } +{ "n": 6, "a": "xy", "b ": 2, " c": 3 } +{ "n": 7, "a": "xy", "b ": 2, " c": 3 } +{ "n": 8, "a": "xy", "b ": 2, " c": 3 } +{ "n": 9, "a": "xy", "b ": 2, " c": 3 } +{ "n": 10, "a": "xy", "b ": 2, 
" c": 3 } +{ "n": 11, "a": "xy", "b ": 2, " c": 3 } +{ "n": 12, "a": "xy", "b ": 2, " c": 3 } +{ "n": 13, "a": "", "b ": 2, " c": 3 } +{ "n": 14, "a": "", "b ": 2, " c": 3 } +{ "n": 15, "a": "", "b ": 2, " c": 3 } +{ "n": 16, "a": "", "b ": 2, " c": 3 } + +mlr --icsv --ojson put $a = collapse_whitespace($a) ./reg_test/input/clean-whitespace.csv +{ "n": 1, "a": "xy", "b ": 2, " c": 3 } +{ "n": 2, "a": "xy ", "b ": 2, " c": 3 } +{ "n": 3, "a": "xy ", "b ": 2, " c": 3 } +{ "n": 4, "a": "xy ", "b ": 2, " c": 3 } +{ "n": 5, "a": "xy", "b ": 2, " c": 3 } +{ "n": 6, "a": " xy", "b ": 2, " c": 3 } +{ "n": 7, "a": " xy", "b ": 2, " c": 3 } +{ "n": 8, "a": " xy", "b ": 2, " c": 3 } +{ "n": 9, "a": "xy", "b ": 2, " c": 3 } +{ "n": 10, "a": " xy ", "b ": 2, " c": 3 } +{ "n": 11, "a": " xy ", "b ": 2, " c": 3 } +{ "n": 12, "a": " xy ", "b ": 2, " c": 3 } +{ "n": 13, "a": "", "b ": 2, " c": 3 } +{ "n": 14, "a": " ", "b ": 2, " c": 3 } +{ "n": 15, "a": " ", "b ": 2, " c": 3 } +{ "n": 16, "a": " ", "b ": 2, " c": 3 } + +mlr --icsv --ojson put $a = clean_whitespace($a) ./reg_test/input/clean-whitespace.csv +{ "n": 1, "a": "xy", "b ": 2, " c": 3 } +{ "n": 2, "a": "xy", "b ": 2, " c": 3 } +{ "n": 3, "a": "xy", "b ": 2, " c": 3 } +{ "n": 4, "a": "xy", "b ": 2, " c": 3 } +{ "n": 5, "a": "xy", "b ": 2, " c": 3 } +{ "n": 6, "a": "xy", "b ": 2, " c": 3 } +{ "n": 7, "a": "xy", "b ": 2, " c": 3 } +{ "n": 8, "a": "xy", "b ": 2, " c": 3 } +{ "n": 9, "a": "xy", "b ": 2, " c": 3 } +{ "n": 10, "a": "xy", "b ": 2, " c": 3 } +{ "n": 11, "a": "xy", "b ": 2, " c": 3 } +{ "n": 12, "a": "xy", "b ": 2, " c": 3 } +{ "n": 13, "a": "", "b ": 2, " c": 3 } +{ "n": 14, "a": "", "b ": 2, " c": 3 } +{ "n": 15, "a": "", "b ": 2, " c": 3 } +{ "n": 16, "a": "", "b ": 2, " c": 3 } + +mlr --icsv --ojson clean-whitespace -k ./reg_test/input/clean-whitespace.csv +{ "n": 1, "a": "xy", "b": 2, "c": 3 } +{ "n": 2, "a": "xy ", "b": 2, "c": 3 } +{ "n": 3, "a": "xy ", "b": 2, "c": 3 } +{ "n": 4, "a": "xy ", "b": 2, "c": 3 
} +{ "n": 5, "a": "xy", "b": 2, "c": 3 } +{ "n": 6, "a": " xy", "b": 2, "c": 3 } +{ "n": 7, "a": " xy", "b": 2, "c": 3 } +{ "n": 8, "a": " xy", "b": 2, "c": 3 } +{ "n": 9, "a": "xy", "b": 2, "c": 3 } +{ "n": 10, "a": " xy ", "b": 2, "c": 3 } +{ "n": 11, "a": " xy ", "b": 2, "c": 3 } +{ "n": 12, "a": " xy ", "b": 2, "c": 3 } +{ "n": 13, "a": "", "b": 2, "c": 3 } +{ "n": 14, "a": " ", "b": 2, "c": 3 } +{ "n": 15, "a": " ", "b": 2, "c": 3 } +{ "n": 16, "a": " ", "b": 2, "c": 3 } + +mlr --icsv --ojson clean-whitespace -v ./reg_test/input/clean-whitespace.csv +{ "n": 1, "a": "xy", "b ": 2, " c": 3 } +{ "n": 2, "a": "xy", "b ": 2, " c": 3 } +{ "n": 3, "a": "xy", "b ": 2, " c": 3 } +{ "n": 4, "a": "xy", "b ": 2, " c": 3 } +{ "n": 5, "a": "xy", "b ": 2, " c": 3 } +{ "n": 6, "a": "xy", "b ": 2, " c": 3 } +{ "n": 7, "a": "xy", "b ": 2, " c": 3 } +{ "n": 8, "a": "xy", "b ": 2, " c": 3 } +{ "n": 9, "a": "xy", "b ": 2, " c": 3 } +{ "n": 10, "a": "xy", "b ": 2, " c": 3 } +{ "n": 11, "a": "xy", "b ": 2, " c": 3 } +{ "n": 12, "a": "xy", "b ": 2, " c": 3 } +{ "n": 13, "a": "", "b ": 2, " c": 3 } +{ "n": 14, "a": "", "b ": 2, " c": 3 } +{ "n": 15, "a": "", "b ": 2, " c": 3 } +{ "n": 16, "a": "", "b ": 2, " c": 3 } + +mlr --icsv --ojson clean-whitespace ./reg_test/input/clean-whitespace.csv +{ "n": 1, "a": "xy", "b": 2, "c": 3 } +{ "n": 2, "a": "xy", "b": 2, "c": 3 } +{ "n": 3, "a": "xy", "b": 2, "c": 3 } +{ "n": 4, "a": "xy", "b": 2, "c": 3 } +{ "n": 5, "a": "xy", "b": 2, "c": 3 } +{ "n": 6, "a": "xy", "b": 2, "c": 3 } +{ "n": 7, "a": "xy", "b": 2, "c": 3 } +{ "n": 8, "a": "xy", "b": 2, "c": 3 } +{ "n": 9, "a": "xy", "b": 2, "c": 3 } +{ "n": 10, "a": "xy", "b": 2, "c": 3 } +{ "n": 11, "a": "xy", "b": 2, "c": 3 } +{ "n": 12, "a": "xy", "b": 2, "c": 3 } +{ "n": 13, "a": "", "b": 2, "c": 3 } +{ "n": 14, "a": "", "b": 2, "c": 3 } +{ "n": 15, "a": "", "b": 2, "c": 3 } +{ "n": 16, "a": "", "b": 2, "c": 3 } + + +================================================================ SORT mlr sort 
-f a ./reg_test/input/abixy @@ -3568,6 +3773,22 @@ ================================================================ +FILL-DOWN + +mlr --csv fill-down -f a,b,c ./reg_test/input/fill-down.csv +a,b,c +1,,3 +4,5,6 +7,5,9 + +mlr --csv fill-down -f a,b,c -a ./reg_test/input/fill-down.csv +a,b,c +1,,3 +4,5,6 +7,,9 + + +================================================================ SEQGEN mlr seqgen --start 1 --stop 5 --step 1 @@ -23403,7 +23624,7 @@ ================================================================ -DSL DATETIME FUNCTIONS +DSL GMT DATE/TIME FUNCTIONS mlr --csvlite put $gmt = sec2gmt($sec) ./reg_test/input/sec2gmt n,sec,gmt @@ -23927,7 +24148,68 @@ ================================================================ -DSL SUB/GSUB +DSL LOCAL DATE/TIME FUNCTIONS + +mlr --opprint put $b=localtime2sec($a); $c=sec2localtime($b); $d=sec2localdate($b) +a b c d +2017-02-18 23:00:00 1487469600.000000 2017-02-18 23:00:00 2017-02-18 +2017-02-18 23:59:59 1487473199.000000 2017-02-18 23:59:59 2017-02-18 +2017-02-19 00:00:00 1487473200.000000 2017-02-19 00:00:00 2017-02-19 +2017-02-19 00:30:00 1487475000.000000 2017-02-19 00:30:00 2017-02-19 +2017-02-19 01:00:00 1487476800.000000 2017-02-19 01:00:00 2017-02-19 +2017-10-14 23:00:00 1508032800.000000 2017-10-14 23:00:00 2017-10-14 +2017-10-14 23:59:59 1508036399.000000 2017-10-14 23:59:59 2017-10-14 +2017-10-15 00:00:00 1508036400.000000 2017-10-15 01:00:00 2017-10-15 +2017-10-15 00:30:00 1508038200.000000 2017-10-15 01:30:00 2017-10-15 +2017-10-15 01:00:00 1508040000.000000 2017-10-15 02:00:00 2017-10-15 + +mlr --opprint put $b=localtime2sec($a); $c=sec2localtime($b); $d=sec2localdate($b) +a b c d +2017-02-14 00:00:00 1487041200.000000 2017-02-14 01:00:00 2017-02-14 +2017-02-15 00:00:00 1487127600.000000 2017-02-15 01:00:00 2017-02-15 +2017-02-16 00:00:00 1487214000.000000 2017-02-16 01:00:00 2017-02-16 +2017-02-17 00:00:00 1487300400.000000 2017-02-17 01:00:00 2017-02-17 +2017-02-18 00:00:00 1487386800.000000 2017-02-18 
01:00:00 2017-02-18 +2017-02-19 00:00:00 1487473200.000000 2017-02-19 00:00:00 2017-02-19 +2017-02-20 00:00:00 1487559600.000000 2017-02-20 00:00:00 2017-02-20 +2017-10-12 00:00:00 1507777200.000000 2017-10-12 00:00:00 2017-10-12 +2017-10-13 00:00:00 1507863600.000000 2017-10-13 00:00:00 2017-10-13 +2017-10-14 00:00:00 1507950000.000000 2017-10-14 00:00:00 2017-10-14 +2017-10-15 00:00:00 1508036400.000000 2017-10-15 01:00:00 2017-10-15 +2017-10-16 00:00:00 1508122800.000000 2017-10-16 01:00:00 2017-10-16 +2017-10-17 00:00:00 1508209200.000000 2017-10-17 01:00:00 2017-10-17 +2017-10-18 00:00:00 1508295600.000000 2017-10-18 01:00:00 2017-10-18 +2017-10-19 00:00:00 1508382000.000000 2017-10-19 01:00:00 2017-10-19 + +mlr --opprint put $b=strptime_local($a, "%Y-%m-%d %H:%M:%S"); $c=strftime_local($b, "%Y-%m-%d %H:%M:%S") +a b c +2017-02-18 23:00:00 1487469600.000000 2017-02-18 23:00:00 +2017-02-18 23:59:59 1487473199.000000 2017-02-18 23:59:59 +2017-02-19 00:00:00 1487473200.000000 2017-02-19 00:00:00 +2017-02-19 00:30:00 1487475000.000000 2017-02-19 00:30:00 +2017-02-19 01:00:00 1487476800.000000 2017-02-19 01:00:00 +2017-10-14 23:00:00 1508032800.000000 2017-10-14 23:00:00 +2017-10-14 23:59:59 1508036399.000000 2017-10-14 23:59:59 +2017-10-15 00:00:00 1508036400.000000 2017-10-15 01:00:00 +2017-10-15 00:30:00 1508038200.000000 2017-10-15 01:30:00 +2017-10-15 01:00:00 1508040000.000000 2017-10-15 02:00:00 + +mlr --opprint put $b=strptime_local($a, "%Y-%m-%d %H:%M:%S"); $c=strftime_local($b, "%Y-%m-%d %H:%M:%S") +a b c +2017-02-18 23:00:00 1487469600.000000 2017-02-18 23:00:00 +2017-02-18 23:59:59 1487473199.000000 2017-02-18 23:59:59 +2017-02-19 00:00:00 1487473200.000000 2017-02-19 00:00:00 +2017-02-19 00:30:00 1487475000.000000 2017-02-19 00:30:00 +2017-02-19 01:00:00 1487476800.000000 2017-02-19 01:00:00 +2017-10-14 23:00:00 1508032800.000000 2017-10-14 23:00:00 +2017-10-14 23:59:59 1508036399.000000 2017-10-14 23:59:59 +2017-10-15 00:00:00 1508036400.000000 
2017-10-15 01:00:00 +2017-10-15 00:30:00 1508038200.000000 2017-10-15 01:30:00 +2017-10-15 01:00:00 1508040000.000000 2017-10-15 02:00:00 + + +================================================================ +DSL SUB/GSUB/REGEX_EXTRACT mlr --opprint put $y = sub($x, "e.*l", "") ./reg_test/input/sub.dat x y @@ -24176,6 +24458,421 @@ x y +mlr --opprint put $y = ssub($x, "HE", "") ./reg_test/input/sub.dat +x y +hello hello +HELLO LLO +world world +WORLD WORLD + +mlr --opprint put $y = ssub($x, "HE", "HE") ./reg_test/input/sub.dat +x y +hello hello +HELLO HELLO +world world +WORLD WORLD + +mlr --opprint put $y = ssub($x, "HE", "12345") ./reg_test/input/sub.dat +x y +hello hello +HELLO 12345LLO +world world +WORLD WORLD + +mlr --opprint put $y = ssub($x, "LL", "1") ./reg_test/input/sub.dat +x y +hello hello +HELLO HE1O +world world +WORLD WORLD + +mlr --opprint put $y = ssub($x, "LL", "12") ./reg_test/input/sub.dat +x y +hello hello +HELLO HE12O +world world +WORLD WORLD + +mlr --opprint put $y = ssub($x, "LL", "12345") ./reg_test/input/sub.dat +x y +hello hello +HELLO HE12345O +world world +WORLD WORLD + +mlr --opprint put $y = ssub($x, "LLO", "") ./reg_test/input/sub.dat +x y +hello hello +HELLO HE +world world +WORLD WORLD + +mlr --opprint put $y = ssub($x, "LLO", "12") ./reg_test/input/sub.dat +x y +hello hello +HELLO HE12 +world world +WORLD WORLD + +mlr --opprint put $y = ssub($x, "LLO", "123") ./reg_test/input/sub.dat +x y +hello hello +HELLO HE123 +world world +WORLD WORLD + +mlr --opprint put $y = ssub($x, "LLO", "123456") ./reg_test/input/sub.dat +x y +hello hello +HELLO HE123456 +world world +WORLD WORLD + +mlr --opprint put $y = ssub($x, "HELLO", "") ./reg_test/input/sub.dat +x y +hello hello +HELLO - +world world +WORLD WORLD + +mlr --opprint put $y = ssub($x, "HELLO", "1234") ./reg_test/input/sub.dat +x y +hello hello +HELLO 1234 +world world +WORLD WORLD + +mlr --opprint put $y = ssub($x, "HELLO", "12345") ./reg_test/input/sub.dat +x y +hello hello +HELLO 
12345 +world world +WORLD WORLD + +mlr --opprint put $y = ssub($x, "HELLO", "1234678") ./reg_test/input/sub.dat +x y +hello hello +HELLO 1234678 +world world +WORLD WORLD + +mlr --opprint put $y = ssub($x, "nonesuch", "") ./reg_test/input/sub.dat +x y +hello hello +HELLO HELLO +world world +WORLD WORLD + +mlr --opprint put $y = ssub($x, "nonesuch", "1234") ./reg_test/input/sub.dat +x y +hello hello +HELLO HELLO +world world +WORLD WORLD + +mlr --opprint put $y = ssub($x, "nonesuch", "1234567890") ./reg_test/input/sub.dat +x y +hello hello +HELLO HELLO +world world +WORLD WORLD + +mlr --oxtab put $y = regextract($x, "[A-Z]+") ./reg_test/input/sub.dat +x hello + +x HELLO +y HELLO + +x world + +x WORLD +y WORLD + +mlr --oxtab put $y = regextract($x, "[A-Z]*") ./reg_test/input/sub.dat +x hello +y + +x HELLO +y HELLO + +x world +y + +x WORLD +y WORLD + +mlr --oxtab put $y = regextract($x, "[a-z]+") ./reg_test/input/sub.dat +x hello +y hello + +x HELLO + +x world +y world + +x WORLD + +mlr --oxtab put $y = regextract($x, "[a-z]*") ./reg_test/input/sub.dat +x hello +y hello + +x HELLO +y + +x world +y world + +x WORLD +y + +mlr --oxtab put $y = regextract($x, "[0-9]+") ./reg_test/input/sub.dat +x hello + +x HELLO + +x world + +x WORLD + +mlr --oxtab put $y = regextract($x, "[0-9]*") ./reg_test/input/sub.dat +x hello +y + +x HELLO +y + +x world +y + +x WORLD +y + +mlr --oxtab put $y = regextract($x, "[ef]+") ./reg_test/input/sub.dat +x hello +y e + +x HELLO + +x world + +x WORLD + +mlr --oxtab put $y = regextract($x, "[ef]*") ./reg_test/input/sub.dat +x hello +y + +x HELLO +y + +x world +y + +x WORLD +y + +mlr --oxtab put $y = regextract($x, "[hi]+") ./reg_test/input/sub.dat +x hello +y h + +x HELLO + +x world + +x WORLD + +mlr --oxtab put $y = regextract($x, "[hi]*") ./reg_test/input/sub.dat +x hello +y h + +x HELLO +y + +x world +y + +x WORLD +y + +mlr --oxtab put $y = regextract($x, "[op]+") ./reg_test/input/sub.dat +x hello +y o + +x HELLO + +x world +y o + +x WORLD + 
+mlr --oxtab put $y = regextract($x, "[op]*") ./reg_test/input/sub.dat +x hello +y + +x HELLO +y + +x world +y + +x WORLD +y + +mlr --oxtab put $y = regextract_or_else($x, "[A-Z]+", "DEFAULT") ./reg_test/input/sub.dat +x hello +y DEFAULT + +x HELLO +y HELLO + +x world +y DEFAULT + +x WORLD +y WORLD + +mlr --oxtab put $y = regextract_or_else($x, "[A-Z]*", "DEFAULT") ./reg_test/input/sub.dat +x hello +y + +x HELLO +y HELLO + +x world +y + +x WORLD +y WORLD + +mlr --oxtab put $y = regextract_or_else($x, "[a-z]+", "DEFAULT") ./reg_test/input/sub.dat +x hello +y hello + +x HELLO +y DEFAULT + +x world +y world + +x WORLD +y DEFAULT + +mlr --oxtab put $y = regextract_or_else($x, "[a-z]*", "DEFAULT") ./reg_test/input/sub.dat +x hello +y hello + +x HELLO +y + +x world +y world + +x WORLD +y + +mlr --oxtab put $y = regextract_or_else($x, "[0-9]+", "DEFAULT") ./reg_test/input/sub.dat +x hello +y DEFAULT + +x HELLO +y DEFAULT + +x world +y DEFAULT + +x WORLD +y DEFAULT + +mlr --oxtab put $y = regextract_or_else($x, "[0-9]*", "DEFAULT") ./reg_test/input/sub.dat +x hello +y + +x HELLO +y + +x world +y + +x WORLD +y + +mlr --oxtab put $y = regextract_or_else($x, "[ef]+", "DEFAULT") ./reg_test/input/sub.dat +x hello +y e + +x HELLO +y DEFAULT + +x world +y DEFAULT + +x WORLD +y DEFAULT + +mlr --oxtab put $y = regextract_or_else($x, "[ef]*", "DEFAULT") ./reg_test/input/sub.dat +x hello +y + +x HELLO +y + +x world +y + +x WORLD +y + +mlr --oxtab put $y = regextract_or_else($x, "[hi]+", "DEFAULT") ./reg_test/input/sub.dat +x hello +y h + +x HELLO +y DEFAULT + +x world +y DEFAULT + +x WORLD +y DEFAULT + +mlr --oxtab put $y = regextract_or_else($x, "[hi]*", "DEFAULT") ./reg_test/input/sub.dat +x hello +y h + +x HELLO +y + +x world +y + +x WORLD +y + +mlr --oxtab put $y = regextract_or_else($x, "[op]+", "DEFAULT") ./reg_test/input/sub.dat +x hello +y o + +x HELLO +y DEFAULT + +x world +y o + +x WORLD +y DEFAULT + +mlr --oxtab put $y = regextract_or_else($x, "[op]*", "DEFAULT") 
./reg_test/input/sub.dat +x hello +y + +x HELLO +y + +x world +y + +x WORLD +y + ================================================================ DSL SUBSTR diff -Nru miller-5.3.0/c/reg_test/input/clean-whitespace.csv miller-5.4.0/c/reg_test/input/clean-whitespace.csv --- miller-5.3.0/c/reg_test/input/clean-whitespace.csv 1970-01-01 00:00:00.000000000 +0000 +++ miller-5.4.0/c/reg_test/input/clean-whitespace.csv 2018-10-14 20:17:52.000000000 +0000 @@ -0,0 +1,17 @@ +n,a,b , c +1,xy,2,3 +2,xy ,2,3 +3,xy ,2,3 +4,xy ,2,3 +5,xy,2,3 +6, xy,2,3 +7, xy,2,3 +8, xy,2,3 +9,xy,2,3 +10, xy ,2,3 +11, xy ,2,3 +12, xy ,2,3 +13,,2,3 +14, ,2,3 +15, ,2,3 +16, ,2,3 diff -Nru miller-5.3.0/c/reg_test/input/fill-down.csv miller-5.4.0/c/reg_test/input/fill-down.csv --- miller-5.3.0/c/reg_test/input/fill-down.csv 1970-01-01 00:00:00.000000000 +0000 +++ miller-5.4.0/c/reg_test/input/fill-down.csv 2018-10-14 20:17:52.000000000 +0000 @@ -0,0 +1,4 @@ +a,b,c +1,,3 +4,5,6 +7,,9 diff -Nru miller-5.3.0/c/reg_test/input/Makefile.am miller-5.4.0/c/reg_test/input/Makefile.am --- miller-5.3.0/c/reg_test/input/Makefile.am 2018-01-06 22:49:24.000000000 +0000 +++ miller-5.4.0/c/reg_test/input/Makefile.am 2018-10-14 20:17:52.000000000 +0000 @@ -27,6 +27,7 @@ c.pprint \ capture-lengths.dkvp \ capture.dkvp \ + clean-whitespace.csv \ comments \ d.csv \ d.pprint \ @@ -43,6 +44,7 @@ escapes.json \ f.csv \ f.pprint \ + fill-down.csv \ filter-example.dsl \ filter-script-piece-1 \ filter-script-piece-2 \ @@ -149,6 +151,7 @@ regex.dkvp \ regularize.dkvp \ repeat-input.dat \ + repeats.dkvp \ reshape-long-ragged.dkvp \ reshape-long.tbl \ reshape-wide-ragged.dkvp \ diff -Nru miller-5.3.0/c/reg_test/input/repeats.dkvp miller-5.4.0/c/reg_test/input/repeats.dkvp --- miller-5.3.0/c/reg_test/input/repeats.dkvp 1970-01-01 00:00:00.000000000 +0000 +++ miller-5.4.0/c/reg_test/input/repeats.dkvp 2018-10-14 20:17:52.000000000 +0000 @@ -0,0 +1,57 @@ +color=red,shape=square,flag=0 +color=purple,shape=triangle,flag=0 
+color=yellow,shape=circle,flag=1 +color=red,shape=circle,flag=1 +color=red,shape=square,flag=0 +color=yellow,shape=circle,flag=1 +color=red,shape=square,flag=0 +color=red,shape=square,flag=0 +color=yellow,shape=circle,flag=1 +color=red,shape=circle,flag=1 +color=yellow,shape=circle,flag=1 +color=yellow,shape=circle,flag=1 +color=purple,shape=triangle,flag=0 +color=yellow,shape=circle,flag=1 +color=yellow,shape=circle,flag=1 +color=red,shape=circle,flag=1 +color=red,shape=square,flag=0 +color=purple,shape=triangle,flag=0 +color=yellow,shape=circle,flag=1 +color=red,shape=square,flag=0 +color=purple,shape=square,flag=0 +color=red,shape=square,flag=0 +color=red,shape=square,flag=1 +color=red,shape=square,flag=0 +color=red,shape=square,flag=0 +color=purple,shape=triangle,flag=0 +color=red,shape=square,flag=0 +color=purple,shape=triangle,flag=0 +color=red,shape=square,flag=0 +color=red,shape=square,flag=0 +color=purple,shape=square,flag=0 +color=red,shape=square,flag=0 +color=red,shape=square,flag=0 +color=purple,shape=triangle,flag=0 +color=yellow,shape=triangle,flag=1 +color=purple,shape=square,flag=0 +color=yellow,shape=circle,flag=1 +color=purple,shape=triangle,flag=0 +color=red,shape=circle,flag=1 +color=purple,shape=triangle,flag=0 +color=purple,shape=triangle,flag=0 +color=red,shape=square,flag=0 +color=red,shape=circle,flag=1 +color=red,shape=square,flag=1 +color=red,shape=square,flag=0 +color=red,shape=circle,flag=1 +color=purple,shape=square,flag=0 +color=purple,shape=square,flag=0 +color=red,shape=square,flag=1 +color=purple,shape=triangle,flag=0 +color=purple,shape=triangle,flag=0 +color=purple,shape=square,flag=0 +color=yellow,shape=circle,flag=1 +color=red,shape=square,flag=0 +color=yellow,shape=triangle,flag=1 +color=yellow,shape=circle,flag=1 +color=purple,shape=square,flag=0 diff -Nru miller-5.3.0/c/reg_test/run miller-5.4.0/c/reg_test/run --- miller-5.3.0/c/reg_test/run 2018-01-06 22:49:24.000000000 +0000 +++ miller-5.4.0/c/reg_test/run 2018-10-14 
20:17:52.000000000 +0000 @@ -213,6 +213,13 @@ run_mlr --opprint bar -f x,y -c c -x x -b b --lo 0.1 --hi 0.9 -w 20 $indir/abixy run_mlr --opprint bar --auto -f x,y -w 10 $indir/abixy +run_mlr altkv < 0.5) {$z = "flag"}' $indir/abixy # ---------------------------------------------------------------- -announce DSL DATETIME FUNCTIONS +announce DSL GMT DATE/TIME FUNCTIONS run_mlr --csvlite put '$gmt = sec2gmt($sec)' $indir/sec2gmt run_mlr --csvlite put '$gmt = sec2gmt($sec,1)' $indir/sec2gmt @@ -1945,7 +1978,83 @@ run_mlr --csvlite sec2gmtdate sec $indir/sec2gmt # ---------------------------------------------------------------- -announce DSL SUB/GSUB +announce DSL LOCAL DATE/TIME FUNCTIONS + +# See also the system date command: +# export TZ=America/Sao_Paulo; date -j -f "%Y-%m-%d %H:%M:%S %Z" "2017-02-19 00:30:00" +%s +# export TZ=America/Sao_Paulo; date -r 86400 +"%Y-%m-%d %H:%M:%S %Z" + +export TZ=America/Sao_Paulo +echo TZ=$TZ +run_mlr --opprint put '$b=localtime2sec($a); $c=sec2localtime($b); $d=sec2localdate($b)' <<_EOF +a=2017-02-18 23:00:00 +a=2017-02-18 23:59:59 +a=2017-02-19 00:00:00 +a=2017-02-19 00:30:00 +a=2017-02-19 01:00:00 +a=2017-10-14 23:00:00 +a=2017-10-14 23:59:59 +a=2017-10-15 00:00:00 +a=2017-10-15 00:30:00 +a=2017-10-15 01:00:00 +_EOF +export TZ= + +export TZ=America/Sao_Paulo +echo TZ=$TZ +run_mlr --opprint put '$b=localtime2sec($a); $c=sec2localtime($b); $d=sec2localdate($b)' <<_EOF +a=2017-02-14 00:00:00 +a=2017-02-15 00:00:00 +a=2017-02-16 00:00:00 +a=2017-02-17 00:00:00 +a=2017-02-18 00:00:00 +a=2017-02-19 00:00:00 +a=2017-02-20 00:00:00 +a=2017-10-12 00:00:00 +a=2017-10-13 00:00:00 +a=2017-10-14 00:00:00 +a=2017-10-15 00:00:00 +a=2017-10-16 00:00:00 +a=2017-10-17 00:00:00 +a=2017-10-18 00:00:00 +a=2017-10-19 00:00:00 +_EOF +export TZ= + +export TZ=America/Sao_Paulo +echo TZ=$TZ +run_mlr --opprint put '$b=strptime_local($a, "%Y-%m-%d %H:%M:%S"); $c=strftime_local($b, "%Y-%m-%d %H:%M:%S")' <<_EOF +a=2017-02-18 23:00:00 +a=2017-02-18 23:59:59 
+a=2017-02-19 00:00:00 +a=2017-02-19 00:30:00 +a=2017-02-19 01:00:00 +a=2017-10-14 23:00:00 +a=2017-10-14 23:59:59 +a=2017-10-15 00:00:00 +a=2017-10-15 00:30:00 +a=2017-10-15 01:00:00 +_EOF +export TZ= + +export TZ=America/Sao_Paulo +echo TZ=$TZ +run_mlr --opprint put '$b=strptime_local($a, "%Y-%m-%d %H:%M:%S"); $c=strftime_local($b, "%Y-%m-%d %H:%M:%S")' <<_EOF +a=2017-02-18 23:00:00 +a=2017-02-18 23:59:59 +a=2017-02-19 00:00:00 +a=2017-02-19 00:30:00 +a=2017-02-19 01:00:00 +a=2017-10-14 23:00:00 +a=2017-10-14 23:59:59 +a=2017-10-15 00:00:00 +a=2017-10-15 00:30:00 +a=2017-10-15 01:00:00 +_EOF +export TZ= + +# ---------------------------------------------------------------- +announce DSL SUB/GSUB/REGEX_EXTRACT run_mlr --opprint put '$y = sub($x, "e.*l", "")' $indir/sub.dat run_mlr --opprint put '$y = sub($x, "e.*l"i, "")' $indir/sub.dat @@ -1976,6 +2085,52 @@ run_mlr --oxtab put -f $indir/subtab3.mlr $indir/subtab.dkvp run_mlr --oxtab put -f $indir/subtab4.mlr $indir/subtab.dkvp +run_mlr --opprint put '$y = ssub($x, "HE", "")' $indir/sub.dat +run_mlr --opprint put '$y = ssub($x, "HE", "HE")' $indir/sub.dat +run_mlr --opprint put '$y = ssub($x, "HE", "12345")' $indir/sub.dat +run_mlr --opprint put '$y = ssub($x, "LL", "1")' $indir/sub.dat +run_mlr --opprint put '$y = ssub($x, "LL", "12")' $indir/sub.dat +run_mlr --opprint put '$y = ssub($x, "LL", "12345")' $indir/sub.dat +run_mlr --opprint put '$y = ssub($x, "LLO", "")' $indir/sub.dat +run_mlr --opprint put '$y = ssub($x, "LLO", "12")' $indir/sub.dat +run_mlr --opprint put '$y = ssub($x, "LLO", "123")' $indir/sub.dat +run_mlr --opprint put '$y = ssub($x, "LLO", "123456")' $indir/sub.dat +run_mlr --opprint put '$y = ssub($x, "HELLO", "")' $indir/sub.dat +run_mlr --opprint put '$y = ssub($x, "HELLO", "1234")' $indir/sub.dat +run_mlr --opprint put '$y = ssub($x, "HELLO", "12345")' $indir/sub.dat +run_mlr --opprint put '$y = ssub($x, "HELLO", "1234678")' $indir/sub.dat +run_mlr --opprint put '$y = ssub($x, "nonesuch", 
"")' $indir/sub.dat +run_mlr --opprint put '$y = ssub($x, "nonesuch", "1234")' $indir/sub.dat +run_mlr --opprint put '$y = ssub($x, "nonesuch", "1234567890")' $indir/sub.dat + +run_mlr --oxtab put '$y = regextract($x, "[A-Z]+")' $indir/sub.dat +run_mlr --oxtab put '$y = regextract($x, "[A-Z]*")' $indir/sub.dat +run_mlr --oxtab put '$y = regextract($x, "[a-z]+")' $indir/sub.dat +run_mlr --oxtab put '$y = regextract($x, "[a-z]*")' $indir/sub.dat +run_mlr --oxtab put '$y = regextract($x, "[0-9]+")' $indir/sub.dat +run_mlr --oxtab put '$y = regextract($x, "[0-9]*")' $indir/sub.dat + +run_mlr --oxtab put '$y = regextract($x, "[ef]+")' $indir/sub.dat +run_mlr --oxtab put '$y = regextract($x, "[ef]*")' $indir/sub.dat +run_mlr --oxtab put '$y = regextract($x, "[hi]+")' $indir/sub.dat +run_mlr --oxtab put '$y = regextract($x, "[hi]*")' $indir/sub.dat +run_mlr --oxtab put '$y = regextract($x, "[op]+")' $indir/sub.dat +run_mlr --oxtab put '$y = regextract($x, "[op]*")' $indir/sub.dat + +run_mlr --oxtab put '$y = regextract_or_else($x, "[A-Z]+", "DEFAULT")' $indir/sub.dat +run_mlr --oxtab put '$y = regextract_or_else($x, "[A-Z]*", "DEFAULT")' $indir/sub.dat +run_mlr --oxtab put '$y = regextract_or_else($x, "[a-z]+", "DEFAULT")' $indir/sub.dat +run_mlr --oxtab put '$y = regextract_or_else($x, "[a-z]*", "DEFAULT")' $indir/sub.dat +run_mlr --oxtab put '$y = regextract_or_else($x, "[0-9]+", "DEFAULT")' $indir/sub.dat +run_mlr --oxtab put '$y = regextract_or_else($x, "[0-9]*", "DEFAULT")' $indir/sub.dat + +run_mlr --oxtab put '$y = regextract_or_else($x, "[ef]+", "DEFAULT")' $indir/sub.dat +run_mlr --oxtab put '$y = regextract_or_else($x, "[ef]*", "DEFAULT")' $indir/sub.dat +run_mlr --oxtab put '$y = regextract_or_else($x, "[hi]+", "DEFAULT")' $indir/sub.dat +run_mlr --oxtab put '$y = regextract_or_else($x, "[hi]*", "DEFAULT")' $indir/sub.dat +run_mlr --oxtab put '$y = regextract_or_else($x, "[op]+", "DEFAULT")' $indir/sub.dat +run_mlr --oxtab put '$y = regextract_or_else($x, 
"[op]*", "DEFAULT")' $indir/sub.dat + # ---------------------------------------------------------------- announce DSL SUBSTR @@ -7028,7 +7183,7 @@ # when the diff is long. set +e -diff -I '^mlr' -I '^cat' -C5 $expfile $outfile +diff -a -I '^mlr' -I '^cat' -C5 $expfile $outfile status=$? if [ $status -eq 0 ]; then diff -Nru miller-5.3.0/c/todo.txt miller-5.4.0/c/todo.txt --- miller-5.3.0/c/todo.txt 2018-01-06 22:49:24.000000000 +0000 +++ miller-5.4.0/c/todo.txt 2018-10-14 20:17:52.000000000 +0000 @@ -6,11 +6,23 @@ * synctool alias/flag handling ... ----------------------------------------------------------------- +================================================================ + +* uniq: + o UT cases x all + o faqent w/ ref to head -n 1 -g + - xxx + - xxx + - xxx + +! APPVEYOR ! + ? bare ./configure ? ! dirname/basename functions ! +* MILLER_{MAJOR/MINOR/MICRO} builtins ? + * sort fcn / issue77 airable: diff -Nru miller-5.3.0/c/unit_test/test_mlrregex.c miller-5.4.0/c/unit_test/test_mlrregex.c --- miller-5.3.0/c/unit_test/test_mlrregex.c 2018-01-06 22:49:24.000000000 +0000 +++ miller-5.4.0/c/unit_test/test_mlrregex.c 2018-10-14 20:17:52.000000000 +0000 @@ -133,11 +133,124 @@ return 0; } +// ---------------------------------------------------------------- +static char * test_regextract() { + char* input = NULL; + char* sregex = NULL; + char* output = NULL; + regex_t regex; + int cflags = 0; + + input = "abcdef"; + sregex = ".+"; + regcomp_or_die(®ex, sregex, cflags); + output = regextract(input, ®ex); + mu_assert_lf(output != NULL); + mu_assert_lf(streq(output, input)); + printf("regextract input=\"%s\" regex=\"%s\" output=\"%s\"\n", input, sregex, output); + free(output); + + input = "abcdef"; + sregex = "[a-z]+"; + regcomp_or_die(®ex, sregex, cflags); + output = regextract(input, ®ex); + mu_assert_lf(output != NULL); + mu_assert_lf(streq(output, input)); + printf("regextract input=\"%s\" regex=\"%s\" output=\"%s\"\n", input, sregex, output); + free(output); + + 
input = "abcdef"; + sregex = "[0-9]+"; + regcomp_or_die(&regex, sregex, cflags); + output = regextract(input, &regex); + mu_assert_lf(output == NULL); + printf("regextract input=\"%s\" regex=\"%s\" output=NULL\n", input, sregex); + free(output); + + input = "abc345"; + sregex = "[0-9]+"; + regcomp_or_die(&regex, sregex, cflags); + output = regextract(input, &regex); + printf("regextract input=\"%s\" regex=\"%s\" output=\"%s\"\n", input, sregex, output); + mu_assert_lf(output != NULL); + mu_assert_lf(streq(output, "345")); + free(output); + + input = "789xyz"; + sregex = "[0-9]+"; + regcomp_or_die(&regex, sregex, cflags); + output = regextract(input, &regex); + printf("regextract input=\"%s\" regex=\"%s\" output=\"%s\"\n", input, sregex, output); + mu_assert_lf(output != NULL); + mu_assert_lf(streq(output, "789")); + free(output); + + return 0; +} + +// ---------------------------------------------------------------- +static char * test_regextract_or_else() { + char* input = NULL; + char* sregex = NULL; + char* default_value = "DEFAULT"; + char* output = NULL; + regex_t regex; + int cflags = 0; + + input = "abcdef"; + sregex = ".+"; + regcomp_or_die(&regex, sregex, cflags); + output = regextract_or_else(input, &regex, default_value); + mu_assert_lf(output != NULL); + mu_assert_lf(streq(output, input)); + printf("regextract_or_else input=\"%s\" regex=\"%s\" default=\"%s\" output=\"%s\"\n", input, sregex, default_value, output); + free(output); + + input = "abcdef"; + sregex = "[a-z]+"; + regcomp_or_die(&regex, sregex, cflags); + output = regextract_or_else(input, &regex, default_value); + mu_assert_lf(output != NULL); + mu_assert_lf(streq(output, input)); + printf("regextract_or_else input=\"%s\" regex=\"%s\" default=\"%s\" output=\"%s\"\n", input, sregex, default_value, output); + free(output); + + input = "abcdef"; + sregex = "[0-9]+"; + regcomp_or_die(&regex, sregex, cflags); + output = regextract_or_else(input, &regex, default_value); + mu_assert_lf(output != NULL); + mu_assert_lf(streq(output, 
default_value)); + printf("regextract_or_else input=\"%s\" regex=\"%s\" default=\"%s\" output=NULL\n", input, sregex, default_value); + free(output); + + input = "abc345"; + sregex = "[0-9]+"; + regcomp_or_die(&regex, sregex, cflags); + output = regextract_or_else(input, &regex, default_value); + printf("regextract_or_else input=\"%s\" regex=\"%s\" default=\"%s\" output=\"%s\"\n", input, sregex, default_value, output); + mu_assert_lf(output != NULL); + mu_assert_lf(streq(output, "345")); + free(output); + + input = "789xyz"; + sregex = "[0-9]+"; + regcomp_or_die(&regex, sregex, cflags); + output = regextract_or_else(input, &regex, default_value); + printf("regextract_or_else input=\"%s\" regex=\"%s\" default=\"%s\" output=\"%s\"\n", input, sregex, default_value, output); + mu_assert_lf(output != NULL); + mu_assert_lf(streq(output, "789")); + free(output); + + return 0; +} // ================================================================ static char * all_tests() { mu_run_test(test_save_regex_captures); mu_run_test(test_interpolate_regex_captures); + mu_run_test(test_regextract); + mu_run_test(test_regextract_or_else); return 0; } diff -Nru miller-5.3.0/configure.ac miller-5.4.0/configure.ac --- miller-5.3.0/configure.ac 2018-01-06 22:49:24.000000000 +0000 +++ miller-5.4.0/configure.ac 2018-10-14 20:17:52.000000000 +0000 @@ -1,6 +1,6 @@ AC_PREREQ([2.60]) # Manually increment on updates to https://github.com/johnkerl/miller/releases -AC_INIT([mlr],[5.3.0]) +AC_INIT([mlr],[5.4.0]) AC_CONFIG_SRCDIR([c/mlrmain.c]) AC_CONFIG_HEADERS([config.h]) AC_CONFIG_AUX_DIR([autotools]) diff -Nru miller-5.3.0/data/gen.c miller-5.4.0/data/gen.c --- miller-5.3.0/data/gen.c 1970-01-01 00:00:00.000000000 +0000 +++ miller-5.4.0/data/gen.c 2018-10-14 20:17:52.000000000 +0000 @@ -0,0 +1,12 @@ +#include <stdio.h> + +int main(int argc, char** argv) { + long long n = 100; + if (argc == 2) { + (void)sscanf(argv[1], "%lld", &n); + } + for (long long i = 0; i < n; i++) { + printf("i=%lld\n", i); + } + return 0; +}
diff -Nru miller-5.3.0/debian/changelog miller-5.4.0/debian/changelog --- miller-5.3.0/debian/changelog 2018-05-13 16:26:14.000000000 +0000 +++ miller-5.4.0/debian/changelog 2018-10-20 19:23:02.000000000 +0000 @@ -1,3 +1,11 @@ +miller (5.4.0-1) unstable; urgency=medium + + * New upstream release. + * Set “Rules-Requires-Root: no”. + * Standards-Version 4.2.1, no further change required. + + -- Stephen Kitt Sat, 20 Oct 2018 21:23:02 +0200 + miller (5.3.0-2) unstable; urgency=medium * Migrate to Salsa. diff -Nru miller-5.3.0/debian/control miller-5.4.0/debian/control --- miller-5.3.0/debian/control 2018-05-13 16:25:49.000000000 +0000 +++ miller-5.4.0/debian/control 2018-10-20 19:21:55.000000000 +0000 @@ -4,10 +4,11 @@ Priority: optional Build-Depends: debhelper (>= 11~), flex -Standards-Version: 4.1.4 +Standards-Version: 4.2.1 Vcs-Browser: https://salsa.debian.org/debian/miller Vcs-Git: https://salsa.debian.org/debian/miller.git Homepage: https://github.com/johnkerl/miller +Rules-Requires-Root: no Package: miller Architecture: any diff -Nru miller-5.3.0/doc/build.html miller-5.4.0/doc/build.html --- miller-5.3.0/doc/build.html 2018-01-06 22:49:24.000000000 +0000 +++ miller-5.4.0/doc/build.html 2018-10-14 20:17:52.000000000 +0000 @@ -516,8 +516,9 @@
  • Create the Github release tag:
      +
    • Don’t forget the v in v3.4.0
    • Write the release notes -
    • Attach the release tarball, SRPM, and binaries +
    • Attach the release tarball, SRPM, and binaries. Double-check assets were successfully uploaded.
    • Publish the release
    @@ -547,11 +548,18 @@ git checkout -b miller-3.4.0 shasum -a 256 /path/to/mlr-3.4.0.tar.gz edit Formula/miller.rb +# Test the URL from the line like +# url "https://github.com/johnkerl/miller/releases/download/v3.4.0/mlr-3.4.0.tar.gz" +# in a browser for typos +# A '@BrewTestBot Test this please' comment within the homebrew-core pull request will restart the homebrew travis build git add Formula/miller.rb git commit -m 'miller 3.4.0' git push -u origin miller-3.4.0 (submit the pull request) +
  • Update + https://github.com/johnkerl/miller/issues/163 + (release-tracker issue).
  • Social-media updates. diff -Nru miller-5.3.0/doc/content-for-build.html miller-5.4.0/doc/content-for-build.html --- miller-5.3.0/doc/content-for-build.html 2018-01-06 22:49:24.000000000 +0000 +++ miller-5.4.0/doc/content-for-build.html 2018-10-14 20:17:52.000000000 +0000 @@ -293,12 +293,15 @@ mlr.static to ../mlr.{arch}.
  • Pull back release tarball mlr-3.4.0.tar.gz and SRPM mlr-3.4.0-1.el6.src.rpm from buildbox, and mlr.{arch} binaries from whatever buildboxes. +
  • Download mlr.exe and msys-2.0.dll from + https://ci.appveyor.com/project/johnkerl/miller/build/artifacts.
  • Create the Github release tag:
      +
    • Don’t forget the v in v3.4.0
    • Write the release notes -
    • Attach the release tarball, SRPM, and binaries +
    • Attach the release tarball, SRPM, and binaries. Double-check assets were successfully uploaded.
    • Publish the release
    @@ -328,11 +331,18 @@ git checkout -b miller-3.4.0 shasum -a 256 /path/to/mlr-3.4.0.tar.gz edit Formula/miller.rb +# Test the URL from the line like +# url "https://github.com/johnkerl/miller/releases/download/v3.4.0/mlr-3.4.0.tar.gz" +# in a browser for typos +# A '@BrewTestBot Test this please' comment within the homebrew-core pull request will restart the homebrew travis build git add Formula/miller.rb git commit -m 'miller 3.4.0' git push -u origin miller-3.4.0 (submit the pull request) +
  • Update + https://github.com/johnkerl/miller/issues/163 + (release-tracker issue).
  • Social-media updates. diff -Nru miller-5.3.0/doc/content-for-cookbook.html miller-5.4.0/doc/content-for-cookbook.html --- miller-5.3.0/doc/content-for-cookbook.html 2018-01-06 22:49:24.000000000 +0000 +++ miller-5.4.0/doc/content-for-cookbook.html 2018-10-14 20:17:52.000000000 +0000 @@ -106,6 +106,22 @@ +

    Options for dealing with duplicate rows

    + +
    + +

    If your data has records appearing multiple times, you can use +mlr uniq to show and/or count the unique +records. + +

    If you want to look at partial uniqueness — for example, show only +the first record for each unique combination of the account_id and +account_status fields — you might use mlr head -n 1 -g +account_id,account_status. Please also see mlr head. + + +

    Data-cleaning examples

    diff -Nru miller-5.3.0/doc/content-for-faq.html miller-5.4.0/doc/content-for-faq.html --- miller-5.3.0/doc/content-for-faq.html 2018-01-06 22:49:24.000000000 +0000 +++ miller-5.4.0/doc/content-for-faq.html 2018-10-14 20:17:52.000000000 +0000 @@ -272,6 +272,32 @@ right file.
    +

    How to rectangularize after joins with unpaired?

    + +
    + +

    Suppose you have the following two data files: + +POKI_INCLUDE_ESCAPED(data/color-codes.csv)HERE +POKI_INCLUDE_ESCAPED(data/color-names.csv)HERE + +

    Joining on id, the results are as expected: + +POKI_RUN_COMMAND{{mlr --csv join -j id -f data/color-codes.csv data/color-names.csv}}HERE + +

    However, if we ask for left-unpaireds, since there’s no +color column, we get a row not having the same column names as the +other: + +POKI_RUN_COMMAND{{mlr --csv join --ul -j id -f data/color-codes.csv data/color-names.csv}}HERE + +

    To fix this, we can use unsparsify: + +POKI_RUN_COMMAND{{mlr --csv join --ul -j id -f data/color-codes.csv then unsparsify --fill-with "" data/color-names.csv}}HERE + +

    Thanks to @aborruso for the tip! + +

    What about XML or JSON file formats?

    diff -Nru miller-5.3.0/doc/content-for-reference-verbs.html miller-5.4.0/doc/content-for-reference-verbs.html --- miller-5.3.0/doc/content-for-reference-verbs.html 2018-01-06 22:49:24.000000000 +0000 +++ miller-5.4.0/doc/content-for-reference-verbs.html 2018-10-14 20:17:52.000000000 +0000 @@ -49,6 +49,19 @@
    +

    altkv

    + +
    + +

    Map list of values to alternating key/value pairs. + +POKI_RUN_COMMAND{{mlr altkv -h}}HERE + +POKI_RUN_COMMAND{{echo 'a,b,c,d,e,f' | mlr altkv}}HERE +POKI_RUN_COMMAND{{echo 'a,b,c,d,e,f,g' | mlr altkv}}HERE + +

    +

    bar

    @@ -161,6 +174,28 @@
    +

    clean-whitespace

    + +
    + +POKI_RUN_COMMAND{{mlr clean-whitespace --help}}HERE + +POKI_RUN_COMMAND{{mlr --icsv --ojson cat data/clean-whitespace.csv}}HERE +POKI_RUN_COMMAND{{mlr --icsv --ojson clean-whitespace -k data/clean-whitespace.csv}}HERE +POKI_RUN_COMMAND{{mlr --icsv --ojson clean-whitespace -v data/clean-whitespace.csv}}HERE +POKI_RUN_COMMAND{{mlr --icsv --ojson clean-whitespace data/clean-whitespace.csv}}HERE + +

    Function links: +

    + +
    +

    count-distinct

    @@ -219,6 +254,18 @@
    +

    fill-down

    + +
    + +POKI_RUN_COMMAND{{mlr fill-down --help}}HERE + +POKI_RUN_COMMAND{{cat data/fill-down.csv}}HERE +POKI_RUN_COMMAND{{mlr --csv fill-down -f b data/fill-down.csv}}HERE +POKI_RUN_COMMAND{{mlr --csv fill-down -a -f b data/fill-down.csv}}HERE + +
    +

    filter

    @@ -876,6 +923,8 @@ POKI_RUN_COMMAND{{mlr uniq --help}}HERE +

    There are two main ways to use mlr uniq: the first way is with -g to specify group-by columns. +
    POKI_RUN_COMMAND{{wc -l data/colored-shapes.dkvp}}HERE
    @@ -888,6 +937,33 @@ POKI_RUN_COMMAND{{mlr --opprint uniq -n -g color,shape data/colored-shapes.dkvp}}HERE
    +

    The second main way to use mlr uniq is without group-by columns, using -a instead: + + + + + + + + + + + + + + +
    +POKI_RUN_COMMAND{{cat data/repeats.dkvp}}HERE +
    + POKI_RUN_COMMAND{{wc -l data/repeats.dkvp}}HERE + + POKI_RUN_COMMAND{{mlr --opprint uniq -a data/repeats.dkvp}}HERE +
    + POKI_RUN_COMMAND{{mlr --opprint uniq -a -n data/repeats.dkvp}}HERE + + POKI_RUN_COMMAND{{mlr --opprint uniq -a -c data/repeats.dkvp}}HERE +
    +

    unsparsify

    diff -Nru miller-5.3.0/doc/content-for-release-docs.html miller-5.4.0/doc/content-for-release-docs.html --- miller-5.3.0/doc/content-for-release-docs.html 2018-01-06 22:49:24.000000000 +0000 +++ miller-5.4.0/doc/content-for-release-docs.html 2018-10-14 20:17:52.000000000 +0000 @@ -23,6 +23,7 @@