diff -Nru memcached-1.6.18/assoc.h memcached-1.6.19/assoc.h --- memcached-1.6.18/assoc.h 2022-08-25 22:48:57.000000000 +0000 +++ memcached-1.6.19/assoc.h 2023-03-08 21:34:27.000000000 +0000 @@ -1,16 +1,17 @@ /* associative array */ void assoc_init(const int hashpower_init); + item *assoc_find(const char *key, const size_t nkey, const uint32_t hv); int assoc_insert(item *item, const uint32_t hv); void assoc_delete(const char *key, const size_t nkey, const uint32_t hv); -void do_assoc_move_next_bucket(void); + int start_assoc_maintenance_thread(void); void stop_assoc_maintenance_thread(void); void assoc_start_expand(uint64_t curr_items); + /* walk functions */ void *assoc_get_iterator(void); bool assoc_iterate(void *iterp, item **it); void assoc_iterate_final(void *iterp); extern unsigned int hashpower; -extern unsigned int item_lock_hashpower; diff -Nru memcached-1.6.18/configure memcached-1.6.19/configure --- memcached-1.6.18/configure 2023-01-11 06:17:57.000000000 +0000 +++ memcached-1.6.19/configure 2023-03-08 21:42:12.000000000 +0000 @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.71 for memcached 1.6.18. +# Generated by GNU Autoconf 2.71 for memcached 1.6.19. # # Report bugs to . # @@ -611,8 +611,8 @@ # Identity of this package. PACKAGE_NAME='memcached' PACKAGE_TARNAME='memcached' -PACKAGE_VERSION='1.6.18' -PACKAGE_STRING='memcached 1.6.18' +PACKAGE_VERSION='1.6.19' +PACKAGE_STRING='memcached 1.6.19' PACKAGE_BUGREPORT='memcached@googlegroups.com' PACKAGE_URL='' @@ -814,6 +814,7 @@ enable_unix_socket enable_proxy enable_proxy_uring +enable_werror enable_dtrace enable_coverage enable_64bit @@ -1383,7 +1384,7 @@ # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures memcached 1.6.18 to adapt to many kinds of systems. 
+\`configure' configures memcached 1.6.19 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1454,7 +1455,7 @@ if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of memcached 1.6.18:";; + short | recursive ) echo "Configuration of memcached 1.6.19:";; esac cat <<\_ACEOF @@ -1478,6 +1479,7 @@ --disable-unix-socket Disable unix domain socket --enable-proxy Enable proxy code EXPERIMENTAL --enable-proxy-uring Enable proxy io_uring code EXPERIMENTAL + --enable-werror Enable -Werror --enable-dtrace Enable dtrace probes --disable-coverage Disable code coverage --enable-64bit build 64bit version @@ -1575,7 +1577,7 @@ test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -memcached configure 1.6.18 +memcached configure 1.6.19 generated by GNU Autoconf 2.71 Copyright (C) 2021 Free Software Foundation, Inc. @@ -2172,7 +2174,7 @@ This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by memcached $as_me 1.6.18, which was +It was created by memcached $as_me 1.6.19, which was generated by GNU Autoconf 2.71. Invocation command line was $ $0$ac_configure_args_raw @@ -3517,7 +3519,7 @@ # Define the identity of the package. PACKAGE='memcached' - VERSION='1.6.18' + VERSION='1.6.19' printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h @@ -5633,6 +5635,13 @@ fi +# Check whether --enable-werror was given. 
+if test ${enable_werror+y} +then : + enableval=$enable_werror; +fi + + @@ -8412,16 +8421,20 @@ +if test "x$enable_werror" = "xyes"; then + CFLAGS="$CFLAGS -Werror" +fi + if test "$ICC" = "yes" then - CFLAGS="$CFLAGS -diag-disable 187 -Wall -Werror" + CFLAGS="$CFLAGS -diag-disable 187 -Wall" printf "%s\n" "#define _GNU_SOURCE 1" >>confdefs.h elif test "$GCC" = "yes" then GCC_VERSION=`$CC -dumpversion` - CFLAGS="$CFLAGS -Wall -Werror -pedantic -Wmissing-prototypes -Wmissing-declarations -Wredundant-decls" + CFLAGS="$CFLAGS -Wall -pedantic -Wmissing-prototypes -Wmissing-declarations -Wredundant-decls" if test "x$enable_asan" = "xyes"; then CFLAGS="$CFLAGS -fsanitize=address" fi @@ -9035,7 +9048,7 @@ # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by memcached $as_me 1.6.18, which was +This file was extended by memcached $as_me 1.6.19, which was generated by GNU Autoconf 2.71. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -9103,7 +9116,7 @@ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config='$ac_cs_config_escaped' ac_cs_version="\\ -memcached config.status 1.6.18 +memcached config.status 1.6.19 configured by $0, generated by GNU Autoconf 2.71, with options \\"\$ac_cs_config\\" diff -Nru memcached-1.6.18/configure.ac memcached-1.6.19/configure.ac --- memcached-1.6.18/configure.ac 2022-11-25 00:28:47.000000000 +0000 +++ memcached-1.6.19/configure.ac 2023-03-08 21:34:27.000000000 +0000 @@ -130,6 +130,9 @@ AC_ARG_ENABLE(proxy-uring, [AS_HELP_STRING([--enable-proxy-uring], [Enable proxy io_uring code EXPERIMENTAL])]) +AC_ARG_ENABLE(werror, + [AS_HELP_STRING([--enable-werror], [Enable -Werror])]) + dnl ********************************************************************** dnl DETECT_SASL_CB_GETCONF dnl @@ -829,6 +832,10 @@ [test "x$enable_docs" != "xno" -a "x$XML2RFC" != "xno" -a "x$XSLTPROC" != "xno"]) +if test "x$enable_werror" = "xyes"; then + CFLAGS="$CFLAGS 
-Werror" +fi + dnl Let the compiler be a bit more picky. Please note that you cannot dnl specify these flags to the compiler before AC_CHECK_FUNCS, because dnl the test program will generate a compilation warning and hence fail @@ -836,12 +843,12 @@ if test "$ICC" = "yes" then dnl ICC trying to be gcc. - CFLAGS="$CFLAGS -diag-disable 187 -Wall -Werror" + CFLAGS="$CFLAGS -diag-disable 187 -Wall" AC_DEFINE([_GNU_SOURCE],[1],[make sure IOV_MAX is defined]) elif test "$GCC" = "yes" then GCC_VERSION=`$CC -dumpversion` - CFLAGS="$CFLAGS -Wall -Werror -pedantic -Wmissing-prototypes -Wmissing-declarations -Wredundant-decls" + CFLAGS="$CFLAGS -Wall -pedantic -Wmissing-prototypes -Wmissing-declarations -Wredundant-decls" if test "x$enable_asan" = "xyes"; then CFLAGS="$CFLAGS -fsanitize=address" fi diff -Nru memcached-1.6.18/crawler.c memcached-1.6.19/crawler.c --- memcached-1.6.18/crawler.c 2023-01-11 06:10:10.000000000 +0000 +++ memcached-1.6.19/crawler.c 2023-03-08 21:34:27.000000000 +0000 @@ -22,13 +22,16 @@ #include #include +#include "base64.h" + #define LARGEST_ID POWER_LARGEST typedef struct { void *c; /* original connection structure. still with source thread attached. */ int sfd; /* client fd. 
*/ - bipbuf_t *buf; /* output buffer */ - char *cbuf; /* current buffer */ + int buflen; + int bufused; + char *buf; /* output buffer */ } crawler_client_t; typedef struct _crawler_module_t crawler_module_t; @@ -80,13 +83,26 @@ .needs_client = true }; -crawler_module_reg_t *crawler_mod_regs[3] = { +static void crawler_mgdump_eval(crawler_module_t *cm, item *search, uint32_t hv, int i); +static void crawler_mgdump_finalize(crawler_module_t *cm); + +crawler_module_reg_t crawler_mgdump_mod = { + .init = NULL, + .eval = crawler_mgdump_eval, + .doneclass = NULL, + .finalize = crawler_mgdump_finalize, + .needs_lock = false, + .needs_client = true +}; + +crawler_module_reg_t *crawler_mod_regs[4] = { &crawler_expired_mod, &crawler_expired_mod, - &crawler_metadump_mod + &crawler_metadump_mod, + &crawler_mgdump_mod, }; -static int lru_crawler_client_getbuf(crawler_client_t *c); +static int lru_crawler_write(crawler_client_t *c); crawler_module_t active_crawler_mod; enum crawler_run_type active_crawler_type; @@ -107,14 +123,13 @@ /*** LRU CRAWLER THREAD ***/ -#define LRU_CRAWLER_WRITEBUF 8192 +#define LRU_CRAWLER_MINBUFSPACE 8192 static void lru_crawler_close_client(crawler_client_t *c) { //fprintf(stderr, "CRAWLER: Closing client\n"); sidethread_conn_close(c->c); c->c = NULL; - c->cbuf = NULL; - bipbuf_free(c->buf); + free(c->buf); c->buf = NULL; } @@ -122,11 +137,20 @@ //fprintf(stderr, "CRAWLER: Closing client\n"); redispatch_conn(c->c); c->c = NULL; - c->cbuf = NULL; - bipbuf_free(c->buf); + free(c->buf); c->buf = NULL; } +static int lru_crawler_expand_buf(crawler_client_t *c) { + c->buflen *= 2; + char *nb = realloc(c->buf, c->buflen); + if (nb == NULL) { + return -1; + } + c->buf = nb; + return 0; +} + static int crawler_expired_init(crawler_module_t *cm, void *data) { struct crawler_expired_data *d; if (data != NULL) { @@ -236,7 +260,6 @@ } static void crawler_metadump_eval(crawler_module_t *cm, item *it, uint32_t hv, int i) { - //int slab_id = CLEAR_LRU(i); char 
keybuf[KEY_MAX_URI_ENCODED_LENGTH]; int is_flushed = item_is_flushed(it); /* Ignore expired content. */ @@ -247,7 +270,7 @@ } // TODO: uriencode directly into the buffer. uriencode(ITEM_key(it), keybuf, it->nkey, KEY_MAX_URI_ENCODED_LENGTH); - int total = snprintf(cm->c.cbuf, 4096, + int total = snprintf(cm->c.buf + cm->c.bufused, 4096, "key=%s exp=%ld la=%llu cas=%llu fetch=%s cls=%u size=%lu\n", keybuf, (it->exptime == 0) ? -1 : (long)(it->exptime + process_started), @@ -257,53 +280,98 @@ ITEM_clsid(it), (unsigned long) ITEM_ntotal(it)); refcount_decr(it); - // TODO: some way of tracking the errors. these are very unlikely though. - if (total >= LRU_CRAWLER_WRITEBUF - 1 || total <= 0) { - /* Failed to write, don't push it. */ + // TODO: some way of tracking the errors. these should be impossible given + // the space requirements. + if (total >= LRU_CRAWLER_MINBUFSPACE - 1 || total <= 0) { + // Failed to write, don't push it. return; } - bipbuf_push(cm->c.buf, total); + cm->c.bufused += total; } static void crawler_metadump_finalize(crawler_module_t *cm) { if (cm->c.c != NULL) { - // Ensure space for final message. - lru_crawler_client_getbuf(&cm->c); - memcpy(cm->c.cbuf, "END\r\n", 5); - bipbuf_push(cm->c.buf, 5); + lru_crawler_write(&cm->c); // empty the write buffer + memcpy(cm->c.buf, "END\r\n", 5); + cm->c.bufused += 5; } } -static int lru_crawler_poll(crawler_client_t *c) { - unsigned char *data; - unsigned int data_size = 0; +static void crawler_mgdump_eval(crawler_module_t *cm, item *it, uint32_t hv, int i) { + int is_flushed = item_is_flushed(it); + /* Ignore expired content. */ + if ((it->exptime != 0 && it->exptime < current_time) + || is_flushed) { + refcount_decr(it); + return; + } + + char *p = cm->c.buf + cm->c.bufused; // buffer offset. 
+ char *start = p; + memcpy(p, "mg ", 3); + p += 3; + if (it->it_flags & ITEM_KEY_BINARY) { + p += base64_encode((unsigned char *) ITEM_key(it), it->nkey, (unsigned char*) p, LRU_CRAWLER_MINBUFSPACE/2); + memcpy(p, " b\r\n", 4); + p += 4; + } else { + memcpy(p, ITEM_key(it), it->nkey); + p += it->nkey; + memcpy(p, "\r\n", 2); + p += 2; + } + int total = p - start; + + refcount_decr(it); + cm->c.bufused += total; +} + +static void crawler_mgdump_finalize(crawler_module_t *cm) { + if (cm->c.c != NULL) { + lru_crawler_write(&cm->c); // empty the write buffer + memcpy(cm->c.buf, "EN\r\n", 4); + cm->c.bufused += 4; + } +} + +// write the whole buffer out to the client socket. +static int lru_crawler_write(crawler_client_t *c) { + unsigned int data_size = c->bufused; + unsigned int sent = 0; struct pollfd to_poll[1]; to_poll[0].fd = c->sfd; to_poll[0].events = POLLOUT; - int ret = poll(to_poll, 1, 1000); - - if (ret < 0) { - // fatal. - return -1; - } + if (c->c == NULL) return -1; + if (data_size == 0) return 0; - if (ret == 0) return 0; + while (sent < data_size) { + int ret = poll(to_poll, 1, 1000); - if (to_poll[0].revents & POLLIN) { - char buf[1]; - int res = ((conn*)c->c)->read(c->c, buf, 1); - if (res == 0 || (res == -1 && (errno != EAGAIN && errno != EWOULDBLOCK))) { - lru_crawler_close_client(c); + if (ret < 0) { + // fatal. return -1; } - } - if ((data = bipbuf_peek_all(c->buf, &data_size)) != NULL) { + + if (ret == 0) return 0; + + // check if socket was closed on us. + if (to_poll[0].revents & POLLIN) { + char buf[1]; + int res = ((conn*)c->c)->read(c->c, buf, 1); + if (res == 0 || (res == -1 && (errno != EAGAIN && errno != EWOULDBLOCK))) { + lru_crawler_close_client(c); + return -1; + } + } + if (to_poll[0].revents & (POLLHUP|POLLERR)) { + // got socket hangup. lru_crawler_close_client(c); return -1; } else if (to_poll[0].revents & POLLOUT) { - int total = ((conn*)c->c)->write(c->c, data, data_size); + // socket is writeable. 
+ int total = ((conn*)c->c)->write(c->c, c->buf + sent, data_size - sent); if (total == -1) { if (errno != EAGAIN && errno != EWOULDBLOCK) { lru_crawler_close_client(c); @@ -312,29 +380,14 @@ } else if (total == 0) { lru_crawler_close_client(c); return -1; - } else { - bipbuf_poll(c->buf, total); } + sent += total; } - } - return 0; -} + } // while -/* Grab some space to work with, if none exists, run the poll() loop and wait - * for it to clear up or close. - * Return NULL if closed. - */ -static int lru_crawler_client_getbuf(crawler_client_t *c) { - void *buf = NULL; - if (c->c == NULL) return -1; - /* not enough space. */ - while ((buf = bipbuf_request(c->buf, LRU_CRAWLER_WRITEBUF)) == NULL) { - // TODO: max loops before closing. - int ret = lru_crawler_poll(c); - if (ret < 0) return ret; - } + // write buffer now empty. + c->bufused = 0; - c->cbuf = buf; return 0; } @@ -349,22 +402,39 @@ active_crawler_mod.mod->doneclass(&active_crawler_mod, i); } +// ensure we build the buffer a little bit to cut down on poll/write syscalls. +#define MIN_ITEMS_PER_WRITE 16 static void item_crawl_hash(void) { // get iterator from assoc. can hang for a long time. // - blocks hash expansion void *iter = assoc_get_iterator(); int crawls_persleep = settings.crawls_persleep; item *it = NULL; + int items = 0; // loop while iterator returns something // - iterator func handles bucket-walking // - iterator returns with bucket locked. while (assoc_iterate(iter, &it)) { // if iterator returns true but no item, we're inbetween buckets and - // can do sleep or cleanup work without holding a lock. + // can do cleanup work without holding an item lock. if (it == NULL) { + if (active_crawler_mod.c.c != NULL) { + if (items > MIN_ITEMS_PER_WRITE) { + int ret = lru_crawler_write(&active_crawler_mod.c); + items = 0; + if (ret != 0) { + // fail out and finalize. + break; + } + } + } else if (active_crawler_mod.mod->needs_client) { + // fail out and finalize. 
+ break; + } + // - sleep bits from orig loop - if (crawls_persleep-- <= 0 && settings.lru_crawler_sleep) { + if (crawls_persleep <= 0 && settings.lru_crawler_sleep) { pthread_mutex_unlock(&lru_crawler_lock); usleep(settings.lru_crawler_sleep); pthread_mutex_lock(&lru_crawler_lock); @@ -377,27 +447,29 @@ continue; } - /* Get memory from bipbuf, if client has no space, flush. */ - if (active_crawler_mod.c.c != NULL) { - int ret = lru_crawler_client_getbuf(&active_crawler_mod.c); - if (ret != 0) { - // fail out and finalize. - break; - } - } else if (active_crawler_mod.mod->needs_client) { - // fail out and finalize. - break; - } - // double check that the item isn't in a transitional state. if (refcount_incr(it) < 2) { refcount_decr(it); continue; } + // We're presently holding an item lock, so we cannot flush the + // buffer to the network socket as the syscall is both slow and could + // hang waiting for POLLOUT. Instead we must expand the buffer. + if (active_crawler_mod.c.c != NULL) { + crawler_client_t *c = &active_crawler_mod.c; + if (c->buflen - c->bufused < LRU_CRAWLER_MINBUFSPACE) { + if (lru_crawler_expand_buf(c) != 0) { + // failed to expand buffer, stop. + break; + } + } + } // FIXME: missing hv and i are fine for metadump eval, but not fine // for expire eval. active_crawler_mod.mod->eval(&active_crawler_mod, it, 0, 0); + crawls_persleep--; + items++; } // must finalize or we leave the hash table expansion blocked. @@ -430,12 +502,14 @@ continue; } - /* Get memory from bipbuf, if client has no space, flush. 
*/ if (active_crawler_mod.c.c != NULL) { - int ret = lru_crawler_client_getbuf(&active_crawler_mod.c); - if (ret != 0) { - lru_crawler_class_done(i); - continue; + crawler_client_t *c = &active_crawler_mod.c; + if (c->buflen - c->bufused < LRU_CRAWLER_MINBUFSPACE) { + int ret = lru_crawler_write(c); + if (ret != 0) { + lru_crawler_class_done(i); + continue; + } } } else if (active_crawler_mod.mod->needs_client) { lru_crawler_class_done(i); @@ -500,8 +574,8 @@ if (active_crawler_mod.mod != NULL) { if (active_crawler_mod.mod->finalize != NULL) active_crawler_mod.mod->finalize(&active_crawler_mod); - while (active_crawler_mod.c.c != NULL && bipbuf_used(active_crawler_mod.c.buf)) { - lru_crawler_poll(&active_crawler_mod.c); + while (active_crawler_mod.c.c != NULL && active_crawler_mod.c.bufused != 0) { + lru_crawler_write(&active_crawler_mod.c); } // Double checking in case the client closed during the poll if (active_crawler_mod.c.c != NULL) { @@ -626,10 +700,14 @@ crawlc->c = c; crawlc->sfd = sfd; - crawlc->buf = bipbuf_new(1024 * 128); + size_t size = LRU_CRAWLER_MINBUFSPACE * 16; + crawlc->buf = malloc(size); + if (crawlc->buf == NULL) { return -2; } + crawlc->buflen = size; + crawlc->bufused = 0; return 0; } @@ -661,7 +739,7 @@ } /* hash table walk only supported with metadump for now. */ - if (type != CRAWLER_METADUMP && ids == NULL) { + if (ids == NULL && type != CRAWLER_METADUMP && type != CRAWLER_MGDUMP) { pthread_mutex_unlock(&lru_crawler_lock); return -2; } diff -Nru memcached-1.6.18/debian/changelog memcached-1.6.19/debian/changelog --- memcached-1.6.18/debian/changelog 2023-01-12 00:02:22.000000000 +0000 +++ memcached-1.6.19/debian/changelog 2023-03-15 09:33:00.000000000 +0000 @@ -1,3 +1,11 @@ +memcached (1.6.19-1) unstable; urgency=medium + + * New upstream release. + - Refresh patches. + * memcached.conf: Also listen on IPv6 by default. 
+ + -- Chris Lamb Wed, 15 Mar 2023 09:33:00 +0000 + memcached (1.6.18-1) unstable; urgency=medium * New upstream release. (Closes: #1028497) diff -Nru memcached-1.6.18/debian/memcached.conf memcached-1.6.19/debian/memcached.conf --- memcached-1.6.18/debian/memcached.conf 2023-01-12 00:02:22.000000000 +0000 +++ memcached-1.6.19/debian/memcached.conf 2023-03-15 09:33:00.000000000 +0000 @@ -33,6 +33,7 @@ # This parameter is one of the only security measures that memcached has, so make sure # it's listening on a firewalled interface. -l 127.0.0.1 +-l ::1 # Limit the number of simultaneous incoming connections. The daemon default is 1024 # -c 1024 diff -Nru memcached-1.6.18/debian/patches/0007-uninitialized-variables.patch memcached-1.6.19/debian/patches/0007-uninitialized-variables.patch --- memcached-1.6.18/debian/patches/0007-uninitialized-variables.patch 2023-01-12 00:02:22.000000000 +0000 +++ memcached-1.6.19/debian/patches/0007-uninitialized-variables.patch 2023-03-15 09:33:00.000000000 +0000 @@ -12,7 +12,7 @@ 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/logger.c b/logger.c -index 79f4ed1..3f3ec3b 100644 +index ce97d26..abf3738 100644 --- a/logger.c +++ b/logger.c @@ -269,7 +269,7 @@ static int _logger_parse_extw(logentry *e, char *scratch) { diff -Nru memcached-1.6.18/doc/Makefile memcached-1.6.19/doc/Makefile --- memcached-1.6.18/doc/Makefile 2023-01-11 06:17:58.000000000 +0000 +++ memcached-1.6.19/doc/Makefile 2023-03-08 21:42:13.000000000 +0000 @@ -160,7 +160,7 @@ AWK = mawk CC = gcc CCDEPMODE = depmode=gcc3 -CFLAGS = -g -O2 -pthread -pthread -Wall -Werror -pedantic -Wmissing-prototypes -Wmissing-declarations -Wredundant-decls +CFLAGS = -g -O2 -pthread -pthread -Wall -pedantic -Wmissing-prototypes -Wmissing-declarations -Wredundant-decls CPP = gcc -E CPPFLAGS = CSCOPE = cscope @@ -195,10 +195,10 @@ PACKAGE = memcached PACKAGE_BUGREPORT = memcached@googlegroups.com PACKAGE_NAME = memcached -PACKAGE_STRING = memcached 1.6.18 +PACKAGE_STRING = 
memcached 1.6.19 PACKAGE_TARNAME = memcached PACKAGE_URL = -PACKAGE_VERSION = 1.6.18 +PACKAGE_VERSION = 1.6.19 PATH_SEPARATOR = : PKG_CONFIG = /usr/bin/pkg-config PKG_CONFIG_LIBDIR = @@ -209,7 +209,7 @@ SET_MAKE = SHELL = /bin/bash STRIP = -VERSION = 1.6.18 +VERSION = 1.6.19 XML2RFC = no XSLTPROC = /usr/bin/xsltproc abs_builddir = /home/dormando/d/p/danga/git/memcached/doc diff -Nru memcached-1.6.18/doc/protocol.txt memcached-1.6.19/doc/protocol.txt --- memcached-1.6.18/doc/protocol.txt 2023-01-11 06:10:10.000000000 +0000 +++ memcached-1.6.19/doc/protocol.txt 2023-03-08 21:34:27.000000000 +0000 @@ -883,10 +883,12 @@ - D(token): delta to apply (decimal unsigned 64-bit number, default 1) - T(token): update TTL on success - M(token): mode switch to change between incr and decr modes. +- O(token): opaque value, consumes a token and copies back with response - q: use noreply semantics for return codes (see details under mset) - t: return current TTL - c: return current CAS value if successful. - v: return new value +- k: return key as a token The flags are now repeated with detailed information where useful: @@ -1130,6 +1132,31 @@ - "BADCLASS [message]" to indicate an invalid class was specified. +lru_crawler mgdump + +- Similar in function to the above "lru_crawler crawl" command, this function + outputs one line for every valid item found in the matching slab classes. + + If "hash" is specified instead of a classid or "all", the crawler will dump + items by directly walking the hash table instead of the LRU's. This makes it + more likely all items will be visited once as LRU reordering and locking can + cause frequently accessed items to be missed. + + Lines are in a basic metaget format, like: "mg key\r\n". If a key is in + binary format: "mg base64encodedkey b\r\n" + A user may then take each line, append any flags they want, and run those + commands against the server to fetch exactly what they want to know. 
+ +The response line could be one of: + +- "OK" to indicate successful launch. + +- "BUSY [message]" to indicate the crawler is already processing a request. + +- "BADCLASS [message]" to indicate an invalid class was specified. + + + Watchers -------- diff -Nru memcached-1.6.18/items.c memcached-1.6.19/items.c --- memcached-1.6.18/items.c 2023-01-11 06:10:10.000000000 +0000 +++ memcached-1.6.19/items.c 2023-03-08 21:34:27.000000000 +0000 @@ -63,7 +63,6 @@ static uint64_t cas_id = 0; static volatile int do_run_lru_maintainer_thread = 0; -static int lru_maintainer_initialized = 0; static pthread_mutex_t lru_maintainer_lock = PTHREAD_MUTEX_INITIALIZER; static pthread_mutex_t cas_id_lock = PTHREAD_MUTEX_INITIALIZER; static pthread_mutex_t stats_sizes_lock = PTHREAD_MUTEX_INITIALIZER; @@ -259,7 +258,7 @@ return nch; } -item *do_item_alloc(char *key, const size_t nkey, const unsigned int flags, +item *do_item_alloc(const char *key, const size_t nkey, const unsigned int flags, const rel_time_t exptime, const int nbytes) { uint8_t nsuffix; item *it = NULL; @@ -975,7 +974,7 @@ } /** wrapper around assoc_find which does the lazy expiration logic */ -item *do_item_get(const char *key, const size_t nkey, const uint32_t hv, conn *c, const bool do_update) { +item *do_item_get(const char *key, const size_t nkey, const uint32_t hv, LIBEVENT_THREAD *t, const bool do_update) { item *it = assoc_find(key, nkey, hv); if (it != NULL) { refcount_incr(it); @@ -1006,7 +1005,7 @@ int ii; if (it == NULL) { fprintf(stderr, "> NOT FOUND "); - } else { + } else if (was_found) { fprintf(stderr, "> FOUND KEY "); } for (ii = 0; ii < nkey; ++ii) { @@ -1018,31 +1017,31 @@ was_found = 1; if (item_is_flushed(it)) { do_item_unlink(it, hv); - STORAGE_delete(c->thread->storage, it); + STORAGE_delete(t->storage, it); do_item_remove(it); it = NULL; - pthread_mutex_lock(&c->thread->stats.mutex); - c->thread->stats.get_flushed++; - pthread_mutex_unlock(&c->thread->stats.mutex); + 
pthread_mutex_lock(&t->stats.mutex); + t->stats.get_flushed++; + pthread_mutex_unlock(&t->stats.mutex); if (settings.verbose > 2) { fprintf(stderr, " -nuked by flush"); } was_found = 2; } else if (it->exptime != 0 && it->exptime <= current_time) { do_item_unlink(it, hv); - STORAGE_delete(c->thread->storage, it); + STORAGE_delete(t->storage, it); do_item_remove(it); it = NULL; - pthread_mutex_lock(&c->thread->stats.mutex); - c->thread->stats.get_expired++; - pthread_mutex_unlock(&c->thread->stats.mutex); + pthread_mutex_lock(&t->stats.mutex); + t->stats.get_expired++; + pthread_mutex_unlock(&t->stats.mutex); if (settings.verbose > 2) { fprintf(stderr, " -nuked by expire"); } was_found = 3; } else { if (do_update) { - do_item_bump(c, it, hv); + do_item_bump(t, it, hv); } DEBUG_REFCNT(it, '+'); } @@ -1051,8 +1050,8 @@ if (settings.verbose > 2) fprintf(stderr, "\n"); /* For now this is in addition to the above verbose logging. */ - LOGGER_LOG(c->thread->l, LOG_FETCHERS, LOGGER_ITEM_GET, NULL, was_found, key, - nkey, (it) ? it->nbytes : 0, (it) ? ITEM_clsid(it) : 0, c->sfd); + LOGGER_LOG(t->l, LOG_FETCHERS, LOGGER_ITEM_GET, NULL, was_found, key, + nkey, (it) ? it->nbytes : 0, (it) ? ITEM_clsid(it) : 0, t->cur_sfd); return it; } @@ -1060,7 +1059,7 @@ // Requires lock held for item. // Split out of do_item_get() to allow mget functions to look through header // data before losing state modified via the bump function. -void do_item_bump(conn *c, item *it, const uint32_t hv) { +void do_item_bump(LIBEVENT_THREAD *t, item *it, const uint32_t hv) { /* We update the hit markers only during fetches. * An item needs to be hit twice overall to be considered * ACTIVE, but only needs a single hit to maintain activity @@ -1075,7 +1074,7 @@ it->it_flags |= ITEM_ACTIVE; if (ITEM_lruid(it) != COLD_LRU) { it->time = current_time; // only need to bump time. 
- } else if (!lru_bump_async(c->thread->lru_bump_buf, it, hv)) { + } else if (!lru_bump_async(t->lru_bump_buf, it, hv)) { // add flag before async bump to avoid race. it->it_flags &= ~ITEM_ACTIVE; } @@ -1088,8 +1087,8 @@ } item *do_item_touch(const char *key, size_t nkey, uint32_t exptime, - const uint32_t hv, conn *c) { - item *it = do_item_get(key, nkey, hv, c, DO_UPDATE); + const uint32_t hv, LIBEVENT_THREAD *t) { + item *it = do_item_get(key, nkey, hv, t, DO_UPDATE); if (it != NULL) { it->exptime = exptime; } @@ -1745,11 +1744,6 @@ pthread_mutex_unlock(&lru_maintainer_lock); } -int init_lru_maintainer(void) { - lru_maintainer_initialized = 1; - return 0; -} - /* Tail linkers and crawler for the LRU crawler. */ void do_item_linktail_q(item *it) { /* item is the new tail */ item **head, **tail; diff -Nru memcached-1.6.18/items.h memcached-1.6.19/items.h --- memcached-1.6.18/items.h 2023-01-11 05:58:39.000000000 +0000 +++ memcached-1.6.19/items.h 2023-03-08 21:34:27.000000000 +0000 @@ -11,7 +11,7 @@ void set_cas_id(uint64_t new_cas); /*@null@*/ -item *do_item_alloc(char *key, const size_t nkey, const unsigned int flags, const rel_time_t exptime, const int nbytes); +item *do_item_alloc(const char *key, const size_t nkey, const unsigned int flags, const rel_time_t exptime, const int nbytes); item_chunk *do_item_alloc_chunk(item_chunk *ch, const size_t bytes_remain); item *do_item_alloc_pull(const size_t ntotal, const unsigned int id); void item_free(item *it); @@ -71,15 +71,14 @@ } item_stats_automove; void fill_item_stats_automove(item_stats_automove *am); -item *do_item_get(const char *key, const size_t nkey, const uint32_t hv, conn *c, const bool do_update); -item *do_item_touch(const char *key, const size_t nkey, uint32_t exptime, const uint32_t hv, conn *c); -void do_item_bump(conn *c, item *it, const uint32_t hv); +item *do_item_get(const char *key, const size_t nkey, const uint32_t hv, LIBEVENT_THREAD *t, const bool do_update); +item *do_item_touch(const char 
*key, const size_t nkey, uint32_t exptime, const uint32_t hv, LIBEVENT_THREAD *t); +void do_item_bump(LIBEVENT_THREAD *t, item *it, const uint32_t hv); void item_stats_reset(void); extern pthread_mutex_t lru_locks[POWER_LARGEST]; int start_lru_maintainer_thread(void *arg); int stop_lru_maintainer_thread(void); -int init_lru_maintainer(void); void lru_maintainer_pause(void); void lru_maintainer_resume(void); diff -Nru memcached-1.6.18/logger.c memcached-1.6.19/logger.c --- memcached-1.6.18/logger.c 2023-01-11 06:10:10.000000000 +0000 +++ memcached-1.6.19/logger.c 2023-03-08 21:34:27.000000000 +0000 @@ -160,6 +160,7 @@ static int _logger_util_addr_endpoint(struct sockaddr_in6 *addr, char *rip, size_t riplen, unsigned short *rport) { memset(rip, 0, riplen); + *rport = 0; switch (addr->sin6_family) { case AF_INET: @@ -177,7 +178,6 @@ case AF_UNSPEC: case AF_UNIX: strncpy(rip, "unix", strlen("unix") + 1); - *rport = 0; break; #endif // #ifndef DISABLE_UNIX_SOCKET } @@ -371,6 +371,64 @@ ); return total; } + +#define MAX_RBUF_READ 100 +static void _logger_log_proxy_errbe(logentry *e, const entry_details *d, const void *entry, va_list ap) { + char *errmsg = va_arg(ap, char *); + char *be_name = va_arg(ap, char *); + char *be_port = va_arg(ap, char *); + int be_depth = va_arg(ap, int); + char *be_rbuf = va_arg(ap, char *); + int be_rbuflen = va_arg(ap, int); + + struct logentry_proxy_errbe *le = (void *)e->data; + le->be_depth = be_depth; + le->errlen = strlen(errmsg); + if (be_name && be_port) { + le->be_namelen = strlen(be_name); + le->be_portlen = strlen(be_port); + } + + le->be_rbuflen = be_rbuflen; + if (be_rbuflen > MAX_RBUF_READ) { + le->be_rbuflen = MAX_RBUF_READ; + } + + char *data = le->data; + memcpy(data, errmsg, le->errlen); + data += le->errlen; + memcpy(data, be_name, le->be_namelen); + data += le->be_namelen; + memcpy(data, be_port, le->be_portlen); + data += le->be_portlen; + memcpy(data, be_rbuf, le->be_rbuflen); + data += le->be_rbuflen; + + e->size = 
sizeof(struct logentry_proxy_errbe) + (data - le->data); +} + +static int _logger_parse_prx_errbe(logentry *e, char *scratch) { + int total; + char rbuf[MAX_RBUF_READ * 3]; // x 3 for worst case URI encoding. + struct logentry_proxy_errbe *le = (void *)e->data; + char *data = le->data; + char *errmsg = data; + data += le->errlen; + char *be_name = data; + data += le->be_namelen; + char *be_port = data; + data += le->be_portlen; + char *be_rbuf = data; + + uriencode(be_rbuf, rbuf, le->be_rbuflen, MAX_RBUF_READ * 3); + total = snprintf(scratch, LOGGER_PARSE_SCRATCH, + "ts=%lld.%d gid=%llu type=proxy_backend error=%.*s name=%.*s port=%.*s depth=%d rbuf=%s\n", + (long long int)e->tv.tv_sec, (int)e->tv.tv_usec, (unsigned long long) e->gid, + (int)le->errlen, errmsg, (int)le->be_namelen, be_name, + (int)le->be_portlen, be_port, le->be_depth, rbuf); + + return total; +} #endif /* Should this go somewhere else? */ @@ -419,8 +477,8 @@ [LOGGER_PROXY_USER] = {512, LOG_PROXYUSER, _logger_log_text, _logger_parse_text, "type=proxy_user msg=%s" }, - [LOGGER_PROXY_BE_ERROR] = {512, LOG_PROXYEVENTS, _logger_log_text, _logger_parse_text, - "type=proxy_backend error=%s name=%s port=%s" + [LOGGER_PROXY_BE_ERROR] = {512, LOG_PROXYEVENTS, _logger_log_proxy_errbe, _logger_parse_prx_errbe, + NULL }, #endif @@ -913,8 +971,8 @@ /* Request a maximum length of data to write to */ e = (logentry *) bipbuf_request(buf, (sizeof(logentry) + reqlen)); if (e == NULL) { - pthread_mutex_unlock(&l->mutex); l->dropped++; + pthread_mutex_unlock(&l->mutex); return LOGGER_RET_NOSPACE; } e->event = event; diff -Nru memcached-1.6.18/logger.h memcached-1.6.19/logger.h --- memcached-1.6.18/logger.h 2022-11-25 00:28:47.000000000 +0000 +++ memcached-1.6.19/logger.h 2023-03-08 21:34:27.000000000 +0000 @@ -126,6 +126,15 @@ long elapsed; char data[]; }; + +struct logentry_proxy_errbe { + size_t errlen; + size_t be_namelen; + size_t be_portlen; + size_t be_rbuflen; + int be_depth; + char data[]; +}; #endif /* end 
intermediary structures */ diff -Nru memcached-1.6.18/Makefile.am memcached-1.6.19/Makefile.am --- memcached-1.6.18/Makefile.am 2023-01-11 05:58:39.000000000 +0000 +++ memcached-1.6.19/Makefile.am 2023-03-08 21:34:27.000000000 +0000 @@ -62,6 +62,7 @@ proxy_jump_hash.c proxy_request.c \ proxy_network.c proxy_lua.c \ proxy_config.c proxy_ring_hash.c \ + proxy_internal.c \ md5.c md5.h endif diff -Nru memcached-1.6.18/Makefile.in memcached-1.6.19/Makefile.in --- memcached-1.6.18/Makefile.in 2023-01-11 06:17:57.000000000 +0000 +++ memcached-1.6.19/Makefile.in 2023-03-08 21:42:12.000000000 +0000 @@ -104,6 +104,7 @@ @ENABLE_PROXY_TRUE@ proxy_jump_hash.c proxy_request.c \ @ENABLE_PROXY_TRUE@ proxy_network.c proxy_lua.c \ @ENABLE_PROXY_TRUE@ proxy_config.c proxy_ring_hash.c \ +@ENABLE_PROXY_TRUE@ proxy_internal.c \ @ENABLE_PROXY_TRUE@ md5.c md5.h @ENABLE_EXTSTORE_TRUE@am__append_8 = extstore.c extstore.h \ @@ -155,8 +156,8 @@ sasl_defs.c proto_proxy.c proto_proxy.h vendor/mcmc/mcmc.h \ proxy_xxhash.c proxy.h proxy_await.c proxy_ustats.c \ proxy_jump_hash.c proxy_request.c proxy_network.c proxy_lua.c \ - proxy_config.c proxy_ring_hash.c md5.c md5.h extstore.c \ - extstore.h crc32c.c crc32c.h storage.c storage.h \ + proxy_config.c proxy_ring_hash.c proxy_internal.c md5.c md5.h \ + extstore.c extstore.h crc32c.c crc32c.h storage.c storage.h \ slab_automove_extstore.c slab_automove_extstore.h tls.c tls.h @BUILD_SOLARIS_PRIVS_TRUE@am__objects_1 = \ @BUILD_SOLARIS_PRIVS_TRUE@ memcached-solaris_priv.$(OBJEXT) @@ -179,6 +180,7 @@ @ENABLE_PROXY_TRUE@ memcached-proxy_lua.$(OBJEXT) \ @ENABLE_PROXY_TRUE@ memcached-proxy_config.$(OBJEXT) \ @ENABLE_PROXY_TRUE@ memcached-proxy_ring_hash.$(OBJEXT) \ +@ENABLE_PROXY_TRUE@ memcached-proxy_internal.$(OBJEXT) \ @ENABLE_PROXY_TRUE@ memcached-md5.$(OBJEXT) @ENABLE_EXTSTORE_TRUE@am__objects_8 = memcached-extstore.$(OBJEXT) \ @ENABLE_EXTSTORE_TRUE@ memcached-crc32c.$(OBJEXT) \ @@ -216,9 +218,9 @@ vendor/mcmc/mcmc.h proxy_xxhash.c proxy.h 
proxy_await.c \ proxy_ustats.c proxy_jump_hash.c proxy_request.c \ proxy_network.c proxy_lua.c proxy_config.c proxy_ring_hash.c \ - md5.c md5.h extstore.c extstore.h crc32c.c crc32c.h storage.c \ - storage.h slab_automove_extstore.c slab_automove_extstore.h \ - tls.c tls.h + proxy_internal.c md5.c md5.h extstore.c extstore.h crc32c.c \ + crc32c.h storage.c storage.h slab_automove_extstore.c \ + slab_automove_extstore.h tls.c tls.h @BUILD_SOLARIS_PRIVS_TRUE@am__objects_10 = memcached_debug-solaris_priv.$(OBJEXT) @BUILD_LINUX_PRIVS_TRUE@am__objects_11 = \ @BUILD_LINUX_PRIVS_TRUE@ memcached_debug-linux_priv.$(OBJEXT) @@ -238,6 +240,7 @@ @ENABLE_PROXY_TRUE@ memcached_debug-proxy_lua.$(OBJEXT) \ @ENABLE_PROXY_TRUE@ memcached_debug-proxy_config.$(OBJEXT) \ @ENABLE_PROXY_TRUE@ memcached_debug-proxy_ring_hash.$(OBJEXT) \ +@ENABLE_PROXY_TRUE@ memcached_debug-proxy_internal.$(OBJEXT) \ @ENABLE_PROXY_TRUE@ memcached_debug-md5.$(OBJEXT) @ENABLE_EXTSTORE_TRUE@am__objects_17 = \ @ENABLE_EXTSTORE_TRUE@ memcached_debug-extstore.$(OBJEXT) \ @@ -324,6 +327,7 @@ ./$(DEPDIR)/memcached-proto_text.Po \ ./$(DEPDIR)/memcached-proxy_await.Po \ ./$(DEPDIR)/memcached-proxy_config.Po \ + ./$(DEPDIR)/memcached-proxy_internal.Po \ ./$(DEPDIR)/memcached-proxy_jump_hash.Po \ ./$(DEPDIR)/memcached-proxy_lua.Po \ ./$(DEPDIR)/memcached-proxy_network.Po \ @@ -367,6 +371,7 @@ ./$(DEPDIR)/memcached_debug-proto_text.Po \ ./$(DEPDIR)/memcached_debug-proxy_await.Po \ ./$(DEPDIR)/memcached_debug-proxy_config.Po \ + ./$(DEPDIR)/memcached_debug-proxy_internal.Po \ ./$(DEPDIR)/memcached_debug-proxy_jump_hash.Po \ ./$(DEPDIR)/memcached_debug-proxy_lua.Po \ ./$(DEPDIR)/memcached_debug-proxy_network.Po \ @@ -823,6 +828,7 @@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/memcached-proto_text.Po@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/memcached-proxy_await.Po@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ 
@am__quote@./$(DEPDIR)/memcached-proxy_config.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/memcached-proxy_internal.Po@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/memcached-proxy_jump_hash.Po@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/memcached-proxy_lua.Po@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/memcached-proxy_network.Po@am__quote@ # am--include-marker @@ -867,6 +873,7 @@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/memcached_debug-proto_text.Po@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/memcached_debug-proxy_await.Po@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/memcached_debug-proxy_config.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/memcached_debug-proxy_internal.Po@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/memcached_debug-proxy_jump_hash.Po@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/memcached_debug-proxy_lua.Po@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/memcached_debug-proxy_network.Po@am__quote@ # am--include-marker @@ -1444,6 +1451,20 @@ @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(memcached_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o memcached-proxy_ring_hash.obj `if test -f 'proxy_ring_hash.c'; then $(CYGPATH_W) 'proxy_ring_hash.c'; else $(CYGPATH_W) '$(srcdir)/proxy_ring_hash.c'; fi` +memcached-proxy_internal.o: proxy_internal.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(memcached_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT memcached-proxy_internal.o -MD -MP -MF 
$(DEPDIR)/memcached-proxy_internal.Tpo -c -o memcached-proxy_internal.o `test -f 'proxy_internal.c' || echo '$(srcdir)/'`proxy_internal.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/memcached-proxy_internal.Tpo $(DEPDIR)/memcached-proxy_internal.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='proxy_internal.c' object='memcached-proxy_internal.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(memcached_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o memcached-proxy_internal.o `test -f 'proxy_internal.c' || echo '$(srcdir)/'`proxy_internal.c + +memcached-proxy_internal.obj: proxy_internal.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(memcached_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT memcached-proxy_internal.obj -MD -MP -MF $(DEPDIR)/memcached-proxy_internal.Tpo -c -o memcached-proxy_internal.obj `if test -f 'proxy_internal.c'; then $(CYGPATH_W) 'proxy_internal.c'; else $(CYGPATH_W) '$(srcdir)/proxy_internal.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/memcached-proxy_internal.Tpo $(DEPDIR)/memcached-proxy_internal.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='proxy_internal.c' object='memcached-proxy_internal.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(memcached_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o memcached-proxy_internal.obj `if test -f 'proxy_internal.c'; then $(CYGPATH_W) 'proxy_internal.c'; else $(CYGPATH_W) '$(srcdir)/proxy_internal.c'; fi` + memcached-md5.o: md5.c @am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(memcached_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT memcached-md5.o -MD -MP 
-MF $(DEPDIR)/memcached-md5.Tpo -c -o memcached-md5.o `test -f 'md5.c' || echo '$(srcdir)/'`md5.c @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/memcached-md5.Tpo $(DEPDIR)/memcached-md5.Po @@ -2060,6 +2081,20 @@ @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(memcached_debug_CFLAGS) $(CFLAGS) -c -o memcached_debug-proxy_ring_hash.obj `if test -f 'proxy_ring_hash.c'; then $(CYGPATH_W) 'proxy_ring_hash.c'; else $(CYGPATH_W) '$(srcdir)/proxy_ring_hash.c'; fi` +memcached_debug-proxy_internal.o: proxy_internal.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(memcached_debug_CFLAGS) $(CFLAGS) -MT memcached_debug-proxy_internal.o -MD -MP -MF $(DEPDIR)/memcached_debug-proxy_internal.Tpo -c -o memcached_debug-proxy_internal.o `test -f 'proxy_internal.c' || echo '$(srcdir)/'`proxy_internal.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/memcached_debug-proxy_internal.Tpo $(DEPDIR)/memcached_debug-proxy_internal.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='proxy_internal.c' object='memcached_debug-proxy_internal.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(memcached_debug_CFLAGS) $(CFLAGS) -c -o memcached_debug-proxy_internal.o `test -f 'proxy_internal.c' || echo '$(srcdir)/'`proxy_internal.c + +memcached_debug-proxy_internal.obj: proxy_internal.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(memcached_debug_CFLAGS) $(CFLAGS) -MT memcached_debug-proxy_internal.obj -MD -MP -MF $(DEPDIR)/memcached_debug-proxy_internal.Tpo -c -o memcached_debug-proxy_internal.obj `if test -f 
'proxy_internal.c'; then $(CYGPATH_W) 'proxy_internal.c'; else $(CYGPATH_W) '$(srcdir)/proxy_internal.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/memcached_debug-proxy_internal.Tpo $(DEPDIR)/memcached_debug-proxy_internal.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='proxy_internal.c' object='memcached_debug-proxy_internal.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(memcached_debug_CFLAGS) $(CFLAGS) -c -o memcached_debug-proxy_internal.obj `if test -f 'proxy_internal.c'; then $(CYGPATH_W) 'proxy_internal.c'; else $(CYGPATH_W) '$(srcdir)/proxy_internal.c'; fi` + memcached_debug-md5.o: md5.c @am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(memcached_debug_CFLAGS) $(CFLAGS) -MT memcached_debug-md5.o -MD -MP -MF $(DEPDIR)/memcached_debug-md5.Tpo -c -o memcached_debug-md5.o `test -f 'md5.c' || echo '$(srcdir)/'`md5.c @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/memcached_debug-md5.Tpo $(DEPDIR)/memcached_debug-md5.Po @@ -2550,6 +2585,7 @@ -rm -f ./$(DEPDIR)/memcached-proto_text.Po -rm -f ./$(DEPDIR)/memcached-proxy_await.Po -rm -f ./$(DEPDIR)/memcached-proxy_config.Po + -rm -f ./$(DEPDIR)/memcached-proxy_internal.Po -rm -f ./$(DEPDIR)/memcached-proxy_jump_hash.Po -rm -f ./$(DEPDIR)/memcached-proxy_lua.Po -rm -f ./$(DEPDIR)/memcached-proxy_network.Po @@ -2594,6 +2630,7 @@ -rm -f ./$(DEPDIR)/memcached_debug-proto_text.Po -rm -f ./$(DEPDIR)/memcached_debug-proxy_await.Po -rm -f ./$(DEPDIR)/memcached_debug-proxy_config.Po + -rm -f ./$(DEPDIR)/memcached_debug-proxy_internal.Po -rm -f ./$(DEPDIR)/memcached_debug-proxy_jump_hash.Po -rm -f ./$(DEPDIR)/memcached_debug-proxy_lua.Po -rm -f ./$(DEPDIR)/memcached_debug-proxy_network.Po @@ -2694,6 +2731,7 @@ -rm -f 
./$(DEPDIR)/memcached-proto_text.Po -rm -f ./$(DEPDIR)/memcached-proxy_await.Po -rm -f ./$(DEPDIR)/memcached-proxy_config.Po + -rm -f ./$(DEPDIR)/memcached-proxy_internal.Po -rm -f ./$(DEPDIR)/memcached-proxy_jump_hash.Po -rm -f ./$(DEPDIR)/memcached-proxy_lua.Po -rm -f ./$(DEPDIR)/memcached-proxy_network.Po @@ -2738,6 +2776,7 @@ -rm -f ./$(DEPDIR)/memcached_debug-proto_text.Po -rm -f ./$(DEPDIR)/memcached_debug-proxy_await.Po -rm -f ./$(DEPDIR)/memcached_debug-proxy_config.Po + -rm -f ./$(DEPDIR)/memcached_debug-proxy_internal.Po -rm -f ./$(DEPDIR)/memcached_debug-proxy_jump_hash.Po -rm -f ./$(DEPDIR)/memcached_debug-proxy_lua.Po -rm -f ./$(DEPDIR)/memcached_debug-proxy_network.Po diff -Nru memcached-1.6.18/memcached.c memcached-1.6.19/memcached.c --- memcached-1.6.18/memcached.c 2023-01-11 06:10:10.000000000 +0000 +++ memcached-1.6.19/memcached.c 2023-03-08 21:34:27.000000000 +0000 @@ -424,9 +424,9 @@ if (c->rbuf == NULL) { c->rbuf = do_cache_alloc(c->thread->rbuf_cache); if (!c->rbuf) { - THR_STATS_LOCK(c); + THR_STATS_LOCK(c->thread); c->thread->stats.read_buf_oom++; - THR_STATS_UNLOCK(c); + THR_STATS_UNLOCK(c->thread); return false; } c->rsize = READ_BUFFER_SIZE; @@ -569,7 +569,7 @@ } } -void thread_io_queue_add(LIBEVENT_THREAD *t, int type, void *ctx, io_queue_stack_cb cb, io_queue_stack_cb com_cb, io_queue_cb ret_cb, io_queue_cb fin_cb) { +void thread_io_queue_add(LIBEVENT_THREAD *t, int type, void *ctx, io_queue_stack_cb cb) { io_queue_cb_t *q = t->io_queues; while (q->type != IO_QUEUE_NONE) { q++; @@ -577,9 +577,6 @@ q->type = type; q->ctx = ctx; q->submit_cb = cb; - q->complete_cb = com_cb; - q->finalize_cb = fin_cb; - q->return_cb = ret_cb; return; } @@ -626,26 +623,9 @@ return NULL; } -// called after returning to the main worker thread. -// users of the queue need to distinguish if the IO was actually consumed or -// not and handle appropriately. 
-static void conn_io_queue_complete(conn *c) { - io_queue_t *q = c->io_queues; - io_queue_cb_t *qcb = c->thread->io_queues; - while (q->type != IO_QUEUE_NONE) { - if (q->stack_ctx) { - qcb->complete_cb(q); - } - qcb++; - q++; - } -} - // called to return a single IO object to the original worker thread. void conn_io_queue_return(io_pending_t *io) { - io_queue_cb_t *q = thread_io_queue_get(io->thread, io->io_queue_type); - q->return_cb(io); - return; + io->return_cb(io); } conn *conn_new(const int sfd, enum conn_states init_state, @@ -1077,7 +1057,9 @@ if (resp != NULL) { b->refcount++; - resp->free = false; + memset(resp, 0, sizeof(*resp)); + resp->free = false; // redundant, for clarity. + resp->bundle = b; if (b->refcount == MAX_RESP_PER_BUNDLE) { assert(b->prev == NULL); // We only allocate off the head. Assign new head. @@ -1095,20 +1077,21 @@ assert(th->open_bundle == NULL); b = do_cache_alloc(th->rbuf_cache); if (b) { - THR_STATS_LOCK(c); - c->thread->stats.response_obj_bytes += READ_BUFFER_SIZE; - THR_STATS_UNLOCK(c); + THR_STATS_LOCK(th); + th->stats.response_obj_bytes += READ_BUFFER_SIZE; + THR_STATS_UNLOCK(th); b->next_check = 1; b->refcount = 1; for (int i = 0; i < MAX_RESP_PER_BUNDLE; i++) { - b->r[i].bundle = b; b->r[i].free = true; } b->next = 0; b->prev = 0; th->open_bundle = b; resp = &b->r[0]; - resp->free = false; + memset(resp, 0, sizeof(*resp)); + resp->free = false; // redundant. for clarity. + resp->bundle = b; } else { return NULL; } @@ -1117,8 +1100,7 @@ return resp; } -static void resp_free(conn *c, mc_resp *resp) { - LIBEVENT_THREAD *th = c->thread; +void resp_free(LIBEVENT_THREAD *th, mc_resp *resp) { mc_resp_bundle *b = resp->bundle; resp->free = true; @@ -1143,9 +1125,9 @@ // Now completely done with this buffer. 
do_cache_free(th->rbuf_cache, b); - THR_STATS_LOCK(c); - c->thread->stats.response_obj_bytes -= READ_BUFFER_SIZE; - THR_STATS_UNLOCK(c); + THR_STATS_LOCK(th); + th->stats.response_obj_bytes -= READ_BUFFER_SIZE; + THR_STATS_UNLOCK(th); } } else { mc_resp_bundle **head = &th->open_bundle; @@ -1161,30 +1143,25 @@ } } + THR_STATS_LOCK(th); + th->stats.response_obj_count--; + THR_STATS_UNLOCK(th); } bool resp_start(conn *c) { mc_resp *resp = resp_allocate(c); if (!resp) { - THR_STATS_LOCK(c); + THR_STATS_LOCK(c->thread); c->thread->stats.response_obj_oom++; - THR_STATS_UNLOCK(c); + THR_STATS_UNLOCK(c->thread); return false; } + // handling the stats counters here to simplify testing - THR_STATS_LOCK(c); + THR_STATS_LOCK(c->thread); c->thread->stats.response_obj_count++; - THR_STATS_UNLOCK(c); - // Skip zeroing the bundle pointer at the start. - // TODO: this line is here temporarily to make the code easy to disable. - // when it's more mature, move the memset into resp_allocate() and have it - // set the bundle pointer on allocate so this line isn't as complex. - memset((char *)resp + sizeof(mc_resp_bundle*), 0, sizeof(*resp) - sizeof(mc_resp_bundle*)); - // TODO: this next line works. memset _does_ show up significantly under - // perf reports due to zeroing out the entire resp->wbuf. before swapping - // the lines more validation work should be done to ensure wbuf's aren't - // accidentally reused without being written to. 
- //memset((char *)resp + sizeof(mc_resp_bundle*), 0, offsetof(mc_resp, wbuf)); + THR_STATS_UNLOCK(c->thread); + if (!c->resp_head) { c->resp_head = resp; } @@ -1203,6 +1180,30 @@ return true; } +mc_resp *resp_start_unlinked(conn *c) { + mc_resp *resp = resp_allocate(c); + if (!resp) { + THR_STATS_LOCK(c->thread); + c->thread->stats.response_obj_oom++; + THR_STATS_UNLOCK(c->thread); + return false; + } + + // handling the stats counters here to simplify testing + THR_STATS_LOCK(c->thread); + c->thread->stats.response_obj_count++; + THR_STATS_UNLOCK(c->thread); + + if (IS_UDP(c->transport)) { + // need to hold on to some data for async responses. + c->resp->request_id = c->request_id; + c->resp->request_addr = c->request_addr; + c->resp->request_addr_size = c->request_addr_size; + } + + return resp; +} + // returns next response in chain. mc_resp* resp_finish(conn *c, mc_resp *resp) { mc_resp *next = resp->next; @@ -1215,11 +1216,11 @@ free(resp->write_and_free); } if (resp->io_pending) { + io_pending_t *io = resp->io_pending; // If we had a pending IO, tell it to internally clean up then return // the main object back to our thread cache. - io_queue_cb_t *qcb = thread_io_queue_get(c->thread, resp->io_pending->io_queue_type); - qcb->finalize_cb(resp->io_pending); - do_cache_free(c->thread->io_cache, resp->io_pending); + io->finalize_cb(io); + do_cache_free(c->thread->io_cache, io); resp->io_pending = NULL; } if (c->resp_head == resp) { @@ -1228,10 +1229,7 @@ if (c->resp == resp) { c->resp = NULL; } - resp_free(c, resp); - THR_STATS_LOCK(c); - c->thread->stats.response_obj_count--; - THR_STATS_UNLOCK(c); + resp_free(c->thread, resp); return next; } @@ -1565,9 +1563,9 @@ * * Returns the state of storage. 
*/ -enum store_item_type do_store_item(item *it, int comm, conn *c, const uint32_t hv) { +enum store_item_type do_store_item(item *it, int comm, LIBEVENT_THREAD *t, const uint32_t hv, uint64_t *cas, bool cas_stale) { char *key = ITEM_key(it); - item *old_it = do_item_get(key, it->nkey, hv, c, DONT_UPDATE); + item *old_it = do_item_get(key, it->nkey, hv, t, DONT_UPDATE); enum store_item_type stored = NOT_STORED; enum cas_result { CAS_NONE, CAS_MATCH, CAS_BADVAL, CAS_STALE, CAS_MISS }; @@ -1587,7 +1585,7 @@ cas_res = CAS_NONE; } else if (it_cas == old_cas) { cas_res = CAS_MATCH; - } else if (c->set_stale && it_cas < old_cas) { + } else if (cas_stale && it_cas < old_cas) { cas_res = CAS_STALE; } else { cas_res = CAS_BADVAL; @@ -1603,9 +1601,9 @@ // cas validates // it and old_it may belong to different classes. // I'm updating the stats for the one that's getting pushed out - pthread_mutex_lock(&c->thread->stats.mutex); - c->thread->stats.slab_stats[ITEM_clsid(old_it)].cas_hits++; - pthread_mutex_unlock(&c->thread->stats.mutex); + pthread_mutex_lock(&t->stats.mutex); + t->stats.slab_stats[ITEM_clsid(old_it)].cas_hits++; + pthread_mutex_unlock(&t->stats.mutex); do_store = true; } else if (cas_res == CAS_STALE) { // if we're allowed to set a stale value, CAS must be lower than @@ -1618,15 +1616,15 @@ it->it_flags |= ITEM_TOKEN_SENT; } - pthread_mutex_lock(&c->thread->stats.mutex); - c->thread->stats.slab_stats[ITEM_clsid(old_it)].cas_hits++; - pthread_mutex_unlock(&c->thread->stats.mutex); + pthread_mutex_lock(&t->stats.mutex); + t->stats.slab_stats[ITEM_clsid(old_it)].cas_hits++; + pthread_mutex_unlock(&t->stats.mutex); do_store = true; } else { // NONE or BADVAL are the same for CAS cmd - pthread_mutex_lock(&c->thread->stats.mutex); - c->thread->stats.slab_stats[ITEM_clsid(old_it)].cas_badval++; - pthread_mutex_unlock(&c->thread->stats.mutex); + pthread_mutex_lock(&t->stats.mutex); + t->stats.slab_stats[ITEM_clsid(old_it)].cas_badval++; + 
pthread_mutex_unlock(&t->stats.mutex); if (settings.verbose > 1) { fprintf(stderr, "CAS: failure: expected %llu, got %llu\n", @@ -1674,7 +1672,7 @@ } if (do_store) { - STORAGE_delete(c->thread->storage, old_it); + STORAGE_delete(t->storage, old_it); item_replace(old_it, it, hv); stored = STORED; } @@ -1699,9 +1697,9 @@ case NREAD_CAS: // LRU expired stored = NOT_FOUND; - pthread_mutex_lock(&c->thread->stats.mutex); - c->thread->stats.cas_misses++; - pthread_mutex_unlock(&c->thread->stats.mutex); + pthread_mutex_lock(&t->stats.mutex); + t->stats.cas_misses++; + pthread_mutex_unlock(&t->stats.mutex); break; case NREAD_REPLACE: case NREAD_APPEND: @@ -1716,12 +1714,12 @@ } } - if (stored == STORED) { - c->cas = ITEM_get_cas(it); + if (stored == STORED && cas != NULL) { + *cas = ITEM_get_cas(it); } - LOGGER_LOG(c->thread->l, LOG_MUTATIONS, LOGGER_ITEM_STORE, NULL, + LOGGER_LOG(t->l, LOG_MUTATIONS, LOGGER_ITEM_STORE, NULL, stored, comm, ITEM_key(it), it->nkey, it->nbytes, it->exptime, - ITEM_clsid(it), c->sfd); + ITEM_clsid(it), t->cur_sfd); return stored; } @@ -2126,6 +2124,7 @@ if (c->state == conn_listening || (IS_UDP(c->transport) && c->state == conn_read)) { + memset(&local_addr, 0, sizeof(local_addr)); socklen_t local_addr_len = sizeof(local_addr); if (getsockname(c->sfd, @@ -2139,6 +2138,7 @@ if (c->state != conn_listening && !(IS_UDP(c->transport) && c->state == conn_read)) { struct sockaddr_storage svr_sock_addr; + memset(&svr_sock_addr, 0, sizeof(svr_sock_addr)); socklen_t svr_addr_len = sizeof(svr_sock_addr); getsockname(c->sfd, (struct sockaddr *)&svr_sock_addr, &svr_addr_len); get_conn_text(c, svr_sock_addr.ss_family, svr_addr, (struct sockaddr *)&svr_sock_addr); @@ -2187,12 +2187,12 @@ } #define IT_REFCOUNT_LIMIT 60000 -item* limited_get(char *key, size_t nkey, conn *c, uint32_t exptime, bool should_touch, bool do_update, bool *overflow) { +item* limited_get(const char *key, size_t nkey, LIBEVENT_THREAD *t, uint32_t exptime, bool should_touch, bool 
do_update, bool *overflow) { item *it; if (should_touch) { - it = item_touch(key, nkey, exptime, c); + it = item_touch(key, nkey, exptime, t); } else { - it = item_get(key, nkey, c, do_update); + it = item_get(key, nkey, t, do_update); } if (it && it->refcount > IT_REFCOUNT_LIMIT) { item_remove(it); @@ -2208,9 +2208,9 @@ // locked, caller can directly change what it needs. // though it might eventually be a better interface to sink it all into // items.c. -item* limited_get_locked(char *key, size_t nkey, conn *c, bool do_update, uint32_t *hv, bool *overflow) { +item* limited_get_locked(const char *key, size_t nkey, LIBEVENT_THREAD *t, bool do_update, uint32_t *hv, bool *overflow) { item *it; - it = item_get_locked(key, nkey, c, do_update, hv); + it = item_get_locked(key, nkey, t, do_update, hv); if (it && it->refcount > IT_REFCOUNT_LIMIT) { do_item_remove(it); it = NULL; @@ -2233,7 +2233,7 @@ * * returns a response string to send back to the client. */ -enum delta_result_type do_add_delta(conn *c, const char *key, const size_t nkey, +enum delta_result_type do_add_delta(LIBEVENT_THREAD *t, const char *key, const size_t nkey, const bool incr, const int64_t delta, char *buf, uint64_t *cas, const uint32_t hv, @@ -2243,7 +2243,7 @@ int res; item *it; - it = do_item_get(key, nkey, hv, c, DONT_UPDATE); + it = do_item_get(key, nkey, hv, t, DONT_UPDATE); if (!it) { return DELTA_ITEM_NOT_FOUND; } @@ -2273,23 +2273,23 @@ if (incr) { value += delta; - MEMCACHED_COMMAND_INCR(c->sfd, ITEM_key(it), it->nkey, value); + //MEMCACHED_COMMAND_INCR(c->sfd, ITEM_key(it), it->nkey, value); } else { if(delta > value) { value = 0; } else { value -= delta; } - MEMCACHED_COMMAND_DECR(c->sfd, ITEM_key(it), it->nkey, value); + //MEMCACHED_COMMAND_DECR(c->sfd, ITEM_key(it), it->nkey, value); } - pthread_mutex_lock(&c->thread->stats.mutex); + pthread_mutex_lock(&t->stats.mutex); if (incr) { - c->thread->stats.slab_stats[ITEM_clsid(it)].incr_hits++; + 
t->stats.slab_stats[ITEM_clsid(it)].incr_hits++; } else { - c->thread->stats.slab_stats[ITEM_clsid(it)].decr_hits++; + t->stats.slab_stats[ITEM_clsid(it)].decr_hits++; } - pthread_mutex_unlock(&c->thread->stats.mutex); + pthread_mutex_unlock(&t->stats.mutex); itoa_u64(value, buf); res = strlen(buf); @@ -3379,7 +3379,6 @@ break; case conn_io_queue: /* Complete our queued IO's from within the worker thread. */ - conn_io_queue_complete(c); conn_set_state(c, conn_mwrite); break; case conn_max_state: @@ -4908,9 +4907,6 @@ } #endif - /* Run regardless of initializing it later */ - init_lru_maintainer(); - /* set stderr non-buffering (for running under, say, daemontools) */ setbuf(stderr, NULL); @@ -6056,9 +6052,6 @@ #ifdef PROXY if (settings.proxy_enabled) { settings.proxy_ctx = proxy_init(settings.proxy_uring); - if (proxy_load_config(settings.proxy_ctx) != 0) { - exit(EXIT_FAILURE); - } } #endif #ifdef EXTSTORE @@ -6070,6 +6063,14 @@ init_lru_crawler(NULL); #endif +#ifdef PROXY + if (settings.proxy_enabled) { + if (proxy_first_confload(settings.proxy_ctx) != 0) { + exit(EXIT_FAILURE); + } + } +#endif + if (start_assoc_maint && start_assoc_maintenance_thread() == -1) { exit(EXIT_FAILURE); } diff -Nru memcached-1.6.18/memcached.h memcached-1.6.19/memcached.h --- memcached-1.6.18/memcached.h 2023-01-11 06:10:10.000000000 +0000 +++ memcached-1.6.19/memcached.h 2023-03-08 21:34:27.000000000 +0000 @@ -102,12 +102,6 @@ #define ITEM_SIZE_MAX_LOWER_LIMIT 1024 #define ITEM_SIZE_MAX_UPPER_LIMIT 1024 * 1024 * 1024 - -/* unistd.h is here */ -#if HAVE_UNISTD_H -# include -#endif - /* Slab sizing definitions. 
*/ #define POWER_SMALLEST 1 #define POWER_LARGEST 256 /* actual cap is 255 */ @@ -273,6 +267,9 @@ #define NREAD_PREPEND 5 #define NREAD_CAS 6 +#define CAS_ALLOW_STALE true +#define CAS_NO_STALE false + enum store_item_type { NOT_STORED=0, STORED, EXISTS, NOT_FOUND, TOO_LARGE, NO_MEMORY }; @@ -599,7 +596,7 @@ // TODO: If we eventually want user loaded modules, we can't use an enum :( enum crawler_run_type { - CRAWLER_AUTOEXPIRE=0, CRAWLER_EXPIRED, CRAWLER_METADUMP + CRAWLER_AUTOEXPIRE=0, CRAWLER_EXPIRED, CRAWLER_METADUMP, CRAWLER_MGDUMP }; typedef struct { @@ -669,21 +666,12 @@ typedef struct io_queue_s io_queue_t; typedef void (*io_queue_stack_cb)(io_queue_t *q); typedef void (*io_queue_cb)(io_pending_t *pending); -// this structure's ownership gets passed between threads: -// - owned normally by the worker thread. -// - multiple queues can be submitted at the same time. -// - each queue can be sent to different background threads. -// - each submitted queue needs to know when to return to the worker. -// - the worker needs to know when all queues have returned so it can process. -// -// io_queue_t's count field is owned by worker until submitted. Then owned by -// side thread until returned. -// conn->io_queues_submitted is always owned by the worker thread. it is -// incremented as the worker submits queues, and decremented as it gets pinged -// for returned threads. -// -// All of this is to avoid having to hit a mutex owned by the connection -// thread that gets pinged for each thread (or an equivalent atomic). +// This structure used to be passed between threads, but is now owned entirely +// by the worker threads. +// IO pending objects are created and stacked into this structure. They are +// then sent off to remote threads. +// The objects are returned one at a time to the worker threads, and this +// structure is then consulted to see when to resume the worker. 
struct io_queue_s { void *ctx; // duplicated from io_queue_cb_t void *stack_ctx; // module-specific context to be batch-submitted @@ -694,9 +682,6 @@ typedef struct io_queue_cb_s { void *ctx; // untouched ptr for specific context io_queue_stack_cb submit_cb; // callback given a full stack of pending IO's at once. - io_queue_stack_cb complete_cb; - io_queue_cb return_cb; // called on worker thread. - io_queue_cb finalize_cb; // called back on the worker thread. int type; } io_queue_cb_t; @@ -711,6 +696,8 @@ int notify_receive_fd; /* receiving end of notify pipe */ int notify_send_fd; /* sending end of notify pipe */ #endif + int cur_sfd; /* client fd for logging commands */ + int thread_baseid; /* which "number" thread this is for data offsets */ struct thread_stats stats; /* Stats generated by this thread */ io_queue_cb_t io_queues[IO_QUEUE_COUNT]; struct conn_queue *ev_queue; /* Worker/conn event queue */ @@ -731,6 +718,7 @@ void *proxy_hooks; void *proxy_user_stats; void *proxy_int_stats; + void *proxy_event_thread; // worker threads can also be proxy IO threads uint32_t proxy_rng[4]; // fast per-thread rng for lua. // TODO: add ctx object so we can attach to queue. #endif @@ -785,6 +773,8 @@ LIBEVENT_THREAD *thread; conn *c; mc_resp *resp; // associated response object + io_queue_cb return_cb; // called on worker thread. + io_queue_cb finalize_cb; // called back on the worker thread. 
char data[120]; }; @@ -914,13 +904,13 @@ * Functions */ void do_accept_new_conns(const bool do_accept); -enum delta_result_type do_add_delta(conn *c, const char *key, +enum delta_result_type do_add_delta(LIBEVENT_THREAD *t, const char *key, const size_t nkey, const bool incr, const int64_t delta, char *buf, uint64_t *cas, const uint32_t hv, item **it_ret); -enum store_item_type do_store_item(item *item, int comm, conn* c, const uint32_t hv); -void thread_io_queue_add(LIBEVENT_THREAD *t, int type, void *ctx, io_queue_stack_cb cb, io_queue_stack_cb com_cb, io_queue_cb ret_cb, io_queue_cb fin_cb); +enum store_item_type do_store_item(item *item, int comm, LIBEVENT_THREAD *t, const uint32_t hv, uint64_t *cas, bool cas_stale); +void thread_io_queue_add(LIBEVENT_THREAD *t, int type, void *ctx, io_queue_stack_cb cb); void conn_io_queue_setup(conn *c); io_queue_t *conn_io_queue_get(conn *c, int type); io_queue_cb_t *thread_io_queue_get(LIBEVENT_THREAD *t, int type); @@ -961,19 +951,19 @@ void sidethread_conn_close(conn *c); /* Lock wrappers for cache functions that are called from main loop. 
*/ -enum delta_result_type add_delta(conn *c, const char *key, +enum delta_result_type add_delta(LIBEVENT_THREAD *t, const char *key, const size_t nkey, bool incr, const int64_t delta, char *buf, uint64_t *cas); void accept_new_conns(const bool do_accept); void conn_close_idle(conn *c); void conn_close_all(void); -item *item_alloc(char *key, size_t nkey, int flags, rel_time_t exptime, int nbytes); +item *item_alloc(const char *key, size_t nkey, int flags, rel_time_t exptime, int nbytes); #define DO_UPDATE true #define DONT_UPDATE false -item *item_get(const char *key, const size_t nkey, conn *c, const bool do_update); -item *item_get_locked(const char *key, const size_t nkey, conn *c, const bool do_update, uint32_t *hv); -item *item_touch(const char *key, const size_t nkey, uint32_t exptime, conn *c); +item *item_get(const char *key, const size_t nkey, LIBEVENT_THREAD *t, const bool do_update); +item *item_get_locked(const char *key, const size_t nkey, LIBEVENT_THREAD *t, const bool do_update, uint32_t *hv); +item *item_touch(const char *key, const size_t nkey, uint32_t exptime, LIBEVENT_THREAD *t); int item_link(item *it); void item_remove(item *it); int item_replace(item *it, item *new_it, const uint32_t hv); @@ -990,8 +980,8 @@ #define refcount_decr(it) --(it->refcount) void STATS_LOCK(void); void STATS_UNLOCK(void); -#define THR_STATS_LOCK(c) pthread_mutex_lock(&c->thread->stats.mutex) -#define THR_STATS_UNLOCK(c) pthread_mutex_unlock(&c->thread->stats.mutex) +#define THR_STATS_LOCK(t) pthread_mutex_lock(&t->stats.mutex) +#define THR_STATS_UNLOCK(t) pthread_mutex_unlock(&t->stats.mutex) void threadlocal_stats_reset(void); void threadlocal_stats_aggregate(struct thread_stats *stats); void slab_stats_aggregate(struct thread_stats *stats, struct slab_stats *out); @@ -1002,7 +992,7 @@ void append_stat(const char *name, ADD_STAT add_stats, conn *c, const char *fmt, ...); -enum store_item_type store_item(item *item, int comm, conn *c); +enum store_item_type 
store_item(item *item, int comm, LIBEVENT_THREAD *t, uint64_t *cas, bool cas_stale); /* Protocol related code */ void out_string(conn *c, const char *str); @@ -1013,14 +1003,16 @@ #define EXPTIME_TO_POSITIVE_TIME(exptime) (exptime < 0) ? \ REALTIME_MAXDELTA + 1 : exptime rel_time_t realtime(const time_t exptime); -item* limited_get(char *key, size_t nkey, conn *c, uint32_t exptime, bool should_touch, bool do_update, bool *overflow); -item* limited_get_locked(char *key, size_t nkey, conn *c, bool do_update, uint32_t *hv, bool *overflow); +item* limited_get(const char *key, size_t nkey, LIBEVENT_THREAD *t, uint32_t exptime, bool should_touch, bool do_update, bool *overflow); +item* limited_get_locked(const char *key, size_t nkey, LIBEVENT_THREAD *t, bool do_update, uint32_t *hv, bool *overflow); // Read/Response object handlers. void resp_reset(mc_resp *resp); void resp_add_iov(mc_resp *resp, const void *buf, int len); void resp_add_chunked_iov(mc_resp *resp, const void *buf, int len); bool resp_start(conn *c); +mc_resp *resp_start_unlinked(conn *c); mc_resp* resp_finish(conn *c, mc_resp *resp); +void resp_free(LIBEVENT_THREAD *th, mc_resp *resp); bool resp_has_stack(conn *c); bool rbuf_switch_to_malloc(conn *c); void conn_release_items(conn *c); diff -Nru memcached-1.6.18/memcached.spec memcached-1.6.19/memcached.spec --- memcached-1.6.18/memcached.spec 2023-01-11 06:17:57.000000000 +0000 +++ memcached-1.6.19/memcached.spec 2023-03-08 21:42:11.000000000 +0000 @@ -27,7 +27,7 @@ %endif Name: memcached -Version: 1.6.18 +Version: 1.6.19 Release: 1%{?dist} Summary: High Performance, Distributed Memory Object Cache @@ -137,7 +137,7 @@ /bin/systemctl stop %{name}.service > /dev/null 2>&1 || : /bin/systemctl stop %{name}@\*.service > /dev/null 2>&1 || : %else - /sbin/service %{name} stop > /dev/null 2&>1 || : + /sbin/service %{name} stop > /dev/null 2>&1 || : /sbin/chkconfig --del %{name} %endif fi diff -Nru memcached-1.6.18/proto_bin.c memcached-1.6.19/proto_bin.c --- 
memcached-1.6.18/proto_bin.c 2023-01-11 05:58:39.000000000 +0000 +++ memcached-1.6.19/proto_bin.c 2023-03-08 21:34:27.000000000 +0000 @@ -294,7 +294,7 @@ if (c->binary_header.request.cas != 0) { cas = c->binary_header.request.cas; } - switch(add_delta(c, key, nkey, c->cmd == PROTOCOL_BINARY_CMD_INCREMENT, + switch(add_delta(c->thread, key, nkey, c->cmd == PROTOCOL_BINARY_CMD_INCREMENT, req->message.body.delta, tmpbuf, &cas)) { case OK: @@ -323,11 +323,13 @@ res + 2); if (it != NULL) { + uint64_t cas = 0; memcpy(ITEM_data(it), tmpbuf, res); memcpy(ITEM_data(it) + res, "\r\n", 2); + c->thread->cur_sfd = c->sfd; // for store_item logging. - if (store_item(it, NREAD_ADD, c)) { - c->cas = ITEM_get_cas(it); + if (store_item(it, NREAD_ADD, c->thread, &cas, CAS_NO_STALE)) { + c->cas = cas; write_bin_response(c, &rsp->message.body, 0, 0, sizeof(rsp->message.body.value)); } else { write_bin_error(c, PROTOCOL_BINARY_RESPONSE_NOT_STORED, @@ -382,10 +384,12 @@ ch->used += 2; } - ret = store_item(it, c->cmd, c); + uint64_t cas = 0; + c->thread->cur_sfd = c->sfd; // for store_item logging. + ret = store_item(it, c->cmd, c->thread, &cas, CAS_NO_STALE); + c->cas = cas; #ifdef ENABLE_DTRACE - uint64_t cas = ITEM_get_cas(it); switch (c->cmd) { case NREAD_ADD: MEMCACHED_COMMAND_ADD(c->sfd, ITEM_key(it), it->nkey, @@ -476,9 +480,9 @@ protocol_binary_request_touch *t = (void *)extbuf; time_t exptime = ntohl(t->message.body.expiration); - it = item_touch(key, nkey, realtime(exptime), c); + it = item_touch(key, nkey, realtime(exptime), c->thread); } else { - it = item_get(key, nkey, c, DO_UPDATE); + it = item_get(key, nkey, c->thread, DO_UPDATE); } if (it) { @@ -888,6 +892,7 @@ uint8_t extlen = c->binary_header.request.extlen; uint16_t keylen = c->binary_header.request.keylen; uint32_t bodylen = c->binary_header.request.bodylen; + c->thread->cur_sfd = c->sfd; // cuddle sfd for logging. 
if (keylen > bodylen || keylen + extlen > bodylen) { write_bin_error(c, PROTOCOL_BINARY_RESPONSE_UNKNOWN_COMMAND, NULL, 0); @@ -1136,7 +1141,7 @@ /* Avoid stale data persisting in cache because we failed alloc. * Unacceptable for SET. Anywhere else too? */ if (c->cmd == PROTOCOL_BINARY_CMD_SET) { - it = item_get(key, nkey, c, DONT_UPDATE); + it = item_get(key, nkey, c->thread, DONT_UPDATE); if (it) { item_unlink(it); STORAGE_delete(c->thread->storage, it); @@ -1303,7 +1308,7 @@ stats_prefix_record_delete(key, nkey); } - it = item_get_locked(key, nkey, c, DONT_UPDATE, &hv); + it = item_get_locked(key, nkey, c->thread, DONT_UPDATE, &hv); if (it) { uint64_t cas = c->binary_header.request.cas; if (cas == 0 || cas == ITEM_get_cas(it)) { diff -Nru memcached-1.6.18/proto_proxy.c memcached-1.6.19/proto_proxy.c --- memcached-1.6.18/proto_proxy.c 2023-01-11 06:10:10.000000000 +0000 +++ memcached-1.6.19/proto_proxy.c 2023-03-08 21:34:27.000000000 +0000 @@ -14,7 +14,6 @@ #define PROCESS_NORMAL false static void proxy_process_command(conn *c, char *command, size_t cmdlen, bool multiget); static void mcp_queue_io(conn *c, mc_resp *resp, int coro_ref, lua_State *Lc); -static void proxy_out_errstring(mc_resp *resp, const char *str); /******** EXTERNAL FUNCTIONS ******/ // functions starting with _ are breakouts for the public functions. @@ -130,65 +129,22 @@ // NOTE: might need to differentiate the libs yes? proxy_register_libs(ctx, NULL, L); - // Create/start the backend threads, which we need before servers + // Create/start the IO thread, which we need before servers // start getting created. - // Supporting N event threads should be possible, but it will be a - // low number of N to avoid too many wakeup syscalls. - // For now we hardcode to 1. 
- proxy_event_thread_t *threads = calloc(1, sizeof(proxy_event_thread_t)); - ctx->proxy_threads = threads; - for (int i = 0; i < 1; i++) { - proxy_event_thread_t *t = &threads[i]; - t->ctx = ctx; -#ifdef USE_EVENTFD - t->event_fd = eventfd(0, EFD_NONBLOCK); - if (t->event_fd == -1) { - perror("failed to create backend notify eventfd"); - exit(1); - } - t->be_event_fd = eventfd(0, EFD_NONBLOCK); - if (t->be_event_fd == -1) { - perror("failed to create backend notify eventfd"); - exit(1); - } -#else - int fds[2]; - if (pipe(fds)) { - perror("can't create proxy backend notify pipe"); - exit(1); - } - - t->notify_receive_fd = fds[0]; - t->notify_send_fd = fds[1]; - - if (pipe(fds)) { - perror("can't create proxy backend connection notify pipe"); - exit(1); - } - t->be_notify_receive_fd = fds[0]; - t->be_notify_send_fd = fds[1]; -#endif - proxy_init_evthread_events(t); - - // incoming request queue. - STAILQ_INIT(&t->io_head_in); - STAILQ_INIT(&t->beconn_head_in); - pthread_mutex_init(&t->mutex, NULL); - pthread_cond_init(&t->cond, NULL); - - memcpy(&t->tunables, &ctx->tunables, sizeof(t->tunables)); + proxy_event_thread_t *t = calloc(1, sizeof(proxy_event_thread_t)); + ctx->proxy_io_thread = t; + proxy_init_event_thread(t, ctx, NULL); #ifdef HAVE_LIBURING - if (t->use_uring) { - pthread_create(&t->thread_id, NULL, proxy_event_thread_ur, t); - } else { - pthread_create(&t->thread_id, NULL, proxy_event_thread, t); - } -#else + if (t->use_uring) { + pthread_create(&t->thread_id, NULL, proxy_event_thread_ur, t); + } else { pthread_create(&t->thread_id, NULL, proxy_event_thread, t); -#endif // HAVE_LIBURING - thread_setname(t->thread_id, "mc-prx-io"); } +#else + pthread_create(&t->thread_id, NULL, proxy_event_thread, t); +#endif // HAVE_LIBURING + thread_setname(t->thread_id, "mc-prx-io"); _start_proxy_config_threads(ctx); return ctx; @@ -221,18 +177,20 @@ thr->proxy_rng[x] = rand(); } - // kick off the configuration. 
- if (proxy_thread_loadconf(ctx, thr) != 0) { - exit(EXIT_FAILURE); - } + // Create a proxy event thread structure to piggyback on the worker. + proxy_event_thread_t *t = calloc(1, sizeof(proxy_event_thread_t)); + thr->proxy_event_thread = t; + proxy_init_event_thread(t, ctx, thr->base); } // ctx_stack is a stack of io_pending_proxy_t's. void proxy_submit_cb(io_queue_t *q) { - proxy_event_thread_t *e = ((proxy_ctx_t *)q->ctx)->proxy_threads; + proxy_event_thread_t *e = ((proxy_ctx_t *)q->ctx)->proxy_io_thread; io_pending_proxy_t *p = q->stack_ctx; io_head_t head; + be_head_t w_head; // worker local stack. STAILQ_INIT(&head); + STAILQ_INIT(&w_head); // NOTE: responses get returned in the correct order no matter what, since // mc_resp's are linked. @@ -246,11 +204,21 @@ // So for now we build the secondary list with an STAILQ, which // can be transplanted/etc. while (p) { - // insert into tail so head is oldest request. - STAILQ_INSERT_TAIL(&head, p, io_next); + mcp_backend_t *be; + P_DEBUG("%s: queueing req for backend: %p\n", __func__, (void *)p); if (p->is_await) { // need to not count await objects multiple times. - if (p->await_first) { + if (p->await_background) { + P_DEBUG("%s: fast-returning await_background object: %p\n", __func__, (void *)p); + // intercept await backgrounds + // this call cannot recurse if we're on the worker thread, + // since the worker thread has to finish executing this + // function in order to pick up the returned IO. + q->count++; + return_io_pending((io_pending_t *)p); + p = p->next; + continue; + } else if (p->await_first) { q->count++; } // funny workaround: awaiting IOP's don't count toward @@ -259,6 +227,24 @@ } else { q->count++; } + be = p->backend; + + if (be->use_io_thread) { + // insert into tail so head is oldest request. 
+ STAILQ_INSERT_TAIL(&head, p, io_next); + } else { + // emulate some of handler_dequeue() + STAILQ_INSERT_TAIL(&be->io_head, p, io_next); + if (be->io_next == NULL) { + be->io_next = p; + } + be->depth++; + if (!be->stacked) { + be->stacked = true; + be->be_next.stqe_next = NULL; // paranoia + STAILQ_INSERT_TAIL(&w_head, be, be_next); + } + } p = p->next; } @@ -266,33 +252,37 @@ // clear out the submit queue so we can re-queue new IO's inline. q->stack_ctx = NULL; - // Transfer request stack to event thread. - pthread_mutex_lock(&e->mutex); - STAILQ_CONCAT(&e->io_head_in, &head); - // No point in holding the lock since we're not doing a cond signal. - pthread_mutex_unlock(&e->mutex); + if (!STAILQ_EMPTY(&head)) { + P_DEBUG("%s: submitting queue to IO thread\n", __func__); + // Transfer request stack to event thread. + pthread_mutex_lock(&e->mutex); + STAILQ_CONCAT(&e->io_head_in, &head); + // No point in holding the lock since we're not doing a cond signal. + pthread_mutex_unlock(&e->mutex); - // Signal to check queue. + // Signal to check queue. #ifdef USE_EVENTFD - uint64_t u = 1; - // TODO (v2): check result? is it ever possible to get a short write/failure - // for an eventfd? - if (write(e->event_fd, &u, sizeof(uint64_t)) != sizeof(uint64_t)) { - assert(1 == 0); - } + uint64_t u = 1; + // TODO (v2): check result? is it ever possible to get a short write/failure + // for an eventfd? + if (write(e->event_fd, &u, sizeof(uint64_t)) != sizeof(uint64_t)) { + assert(1 == 0); + } #else - if (write(e->notify_send_fd, "w", 1) <= 0) { - assert(1 == 0); - } + if (write(e->notify_send_fd, "w", 1) <= 0) { + assert(1 == 0); + } #endif + } + if (!STAILQ_EMPTY(&w_head)) { + P_DEBUG("%s: running inline worker queue\n", __func__); + // emulating proxy_event_handler + proxy_run_backend_queue(&w_head); + } return; } -void proxy_complete_cb(io_queue_t *q) { - // empty/unused. -} - // called from worker thread after an individual IO has been returned back to // the worker thread. 
Do post-IO run and cleanup work. void proxy_return_cb(io_pending_t *pending) { @@ -300,14 +290,8 @@ if (p->is_await) { mcplib_await_return(p); } else { - struct timeval end; lua_State *Lc = p->coro; - // stamp the elapsed time into the response object. - gettimeofday(&end, NULL); - p->client_resp->elapsed = (end.tv_sec - p->client_resp->start.tv_sec) * 1000000 + - (end.tv_usec - p->client_resp->start.tv_usec); - // in order to resume we need to remove the objects that were // originally returned // what's currently on the top of the stack is what we want to keep. @@ -342,6 +326,14 @@ // Note: lua registry is the same for main thread or a coroutine. luaL_unref(p->coro, LUA_REGISTRYINDEX, p->coro_ref); } + + if (p->io_type == IO_PENDING_TYPE_EXTSTORE && p->hdr_it) { + // TODO: lock once, worst case this hashes/locks twice. + if (p->miss) { + item_unlink(p->hdr_it); + } + item_remove(p->hdr_it); + } return; } @@ -407,7 +399,7 @@ lua_State *L = thr->L; luaL_unref(L, LUA_REGISTRYINDEX, c->proxy_coro_ref); c->proxy_coro_ref = 0; - WSTAT_DECR(c, proxy_req_active, 1); + WSTAT_DECR(thr, proxy_req_active, 1); } // we buffered a SET of some kind. @@ -473,7 +465,7 @@ } // Need a custom function so we can prefix lua strings easily. -static void proxy_out_errstring(mc_resp *resp, const char *str) { +void proxy_out_errstring(mc_resp *resp, const char *str) { size_t len; const static char error_prefix[] = "SERVER_ERROR "; const static int error_prefix_len = sizeof(error_prefix) - 1; @@ -550,12 +542,43 @@ size_t rlen = 0; if (cores == LUA_OK) { - WSTAT_DECR(c, proxy_req_active, 1); + WSTAT_DECR(c->thread, proxy_req_active, 1); int type = lua_type(Lc, 1); + P_DEBUG("%s: coroutine completed. 
return type: %d\n", __func__, type); if (type == LUA_TUSERDATA) { mcp_resp_t *r = luaL_checkudata(Lc, 1, "mcp.response"); _set_noreply_mode(resp, r); - if (r->buf) { + if (r->status != MCMC_OK) { + proxy_out_errstring(resp, "backend failure"); + } else if (r->cresp) { + mc_resp *tresp = r->cresp; + // The internal cache handler has created a resp we want to swap in + // here. It would be fastest to swap *resp's position in the + // link but if the set is deep this would instead be slow, so + // we copy over details from this temporary resp instead. + assert(c != NULL); + + // So far all we fill is the wbuf and some iov's? so just copy + // that + the UDP info? + memcpy(resp->wbuf, tresp->wbuf, tresp->iov[0].iov_len); + for (int x = 0; x < tresp->iovcnt; x++) { + resp->iov[x] = tresp->iov[x]; + } + resp->iovcnt = tresp->iovcnt; + resp->chunked_total = tresp->chunked_total; + resp->chunked_data_iov = tresp->chunked_data_iov; + // copy UDP headers... + resp->request_id = tresp->request_id; + resp->udp_sequence = tresp->udp_sequence; + resp->udp_total = tresp->udp_total; + resp->request_addr = tresp->request_addr; + resp->request_addr_size = tresp->request_addr_size; + resp->item = tresp->item; // will be populated if not extstore fetch + resp->skip = tresp->skip; + + // we let the mcp_resp gc handler free up tresp and any + // associated io_pending's of its own later. + } else if (r->buf) { // response set from C. // FIXME (v2): write_and_free() ? it's a bit wrong for here. resp->write_and_free = r->buf; @@ -569,11 +592,10 @@ memcpy(resp->wbuf, s, l); resp_add_iov(resp, resp->wbuf, l); lua_pop(Lc, 1); - } else if (r->status != MCMC_OK) { - proxy_out_errstring(resp, "backend failure"); } else { // Empty response: used for ascii multiget emulation. } + } else if (type == LUA_TSTRING) { // response is a raw string from lua. 
const char *s = lua_tolstring(Lc, 1, &rlen); @@ -584,54 +606,75 @@ } else { proxy_out_errstring(resp, "bad response"); } + } else if (cores == LUA_YIELD) { - if (nresults == 1) { - // TODO (v2): try harder to validate; but we have so few yield cases - // that I'm going to shortcut this here. A single yielded result - // means it's probably an await(), so attempt to process this. - if (p != NULL) { - int coro_ref = p->coro_ref; - mc_resp *resp = p->resp; - assert((void *)p == (void *)resp->io_pending); - resp->io_pending = NULL; - c = p->c; - do_cache_free(c->thread->io_cache, p); - mcplib_await_run(c, resp, Lc, coro_ref); - } else { - // coroutine object sitting on the _main_ VM right now, so we grab - // the reference from there, which also pops it. - int coro_ref = luaL_ref(c->thread->L, LUA_REGISTRYINDEX); - mcplib_await_run(c, c->resp, Lc, coro_ref); - } + int coro_ref = 0; + int yield_type = lua_tointeger(Lc, -1); + P_DEBUG("%s: coroutine yielded. return type: %d\n", __func__, yield_type); + assert(yield_type != 0); + lua_pop(Lc, 1); + + // need to remove and free the io_pending, since c->resp owns it. + // so we call mcp_queue_io() again and let it override the + // mc_resp's io_pending object. + // + // p is not null only when being called from proxy_return_cb(), + // a pending IO is returning to resume. + if (p != NULL) { + coro_ref = p->coro_ref; + assert((void *)p == (void *)resp->io_pending); + resp->io_pending = NULL; + c = p->c; + // *p is now dead. + do_cache_free(c->thread->io_cache, p); } else { - // need to remove and free the io_pending, since c->resp owns it. - // so we call mcp_queue_io() again and let it override the - // mc_resp's io_pending object. - - int coro_ref = 0; - mc_resp *resp; - if (p != NULL) { - coro_ref = p->coro_ref; - resp = p->resp; - c = p->c; - do_cache_free(p->c->thread->io_cache, p); - // *p is now dead. 
- } else { - // yielding from a top level call to the coroutine, - // so we need to grab a reference to the coroutine thread. - // TODO (v2): make this more explicit? - // we only need to get the reference here, and error conditions - // should instead drop it, but now it's not obvious to users that - // we're reaching back into the main thread's stack. - assert(c != NULL); - coro_ref = luaL_ref(c->thread->L, LUA_REGISTRYINDEX); - resp = c->resp; - } - // TODO (v2): c only used for cache alloc? push the above into the func? - mcp_queue_io(c, resp, coro_ref, Lc); + // coroutine object sitting on the _main_ VM right now, so we grab + // the reference from there, which also pops it. + assert(c != NULL); + coro_ref = luaL_ref(c->thread->L, LUA_REGISTRYINDEX); + } + + int res = 0; + switch (yield_type) { + case MCP_YIELD_AWAIT: + mcplib_await_run(c, resp, Lc, coro_ref); + break; + case MCP_YIELD_POOL: + // TODO (v2): c only used for cache alloc? + mcp_queue_io(c, resp, coro_ref, Lc); + break; + case MCP_YIELD_LOCAL: + // stack should be: rq, res + res = mcplib_internal_run(Lc, c, resp, coro_ref); + if (res == 0) { + // stack should still be: rq, res + // TODO: turn this function into a for loop that re-runs on + // certain status codes, to avoid recursive depth here. + // + // FIXME: this dance with the coroutine reference is + // annoying. In this case we immediately resume, so no *io + // was generated, so we won't do the above coro_ref swap, so + // we'll try to take the coro_ref again and fail. + // The ref is only actually used in proxy_await + // It should instead be stashed on the top mc_resp object + // (ideally removing c->proxy_coro_ref at the same time) + // and unref'ed when the resp is cleaned up. + lua_rawgeti(c->thread->L, LUA_REGISTRYINDEX, coro_ref); + luaL_unref(c->thread->L, LUA_REGISTRYINDEX, coro_ref); + proxy_run_coroutine(Lc, resp, NULL, c); + } else if (res > 0) { + // internal run queued for extstore. 
+ } else { + assert(res < 0); + proxy_out_errstring(resp, "bad request"); + } + break; + default: + abort(); } + } else { - WSTAT_DECR(c, proxy_req_active, 1); + WSTAT_DECR(c->thread, proxy_req_active, 1); P_DEBUG("%s: Failed to run coroutine: %s\n", __func__, lua_tostring(Lc, -1)); LOGGER_LOG(NULL, LOG_PROXYEVENTS, LOGGER_PROXY_ERROR, NULL, lua_tostring(Lc, -1)); proxy_out_errstring(resp, "lua failure"); @@ -654,7 +697,7 @@ // permanent solution. int ret = process_request(&pr, command, cmdlen); if (ret != 0) { - WSTAT_INCR(c, proxy_conn_errors, 1); + WSTAT_INCR(c->thread, proxy_conn_errors, 1); if (!resp_start(c)) { conn_set_state(c, conn_closing); return; @@ -761,7 +804,7 @@ // We test the command length all the way down here because multigets can // be very long, and they're chopped up by now. if (cmdlen >= MCP_REQUEST_MAXLEN) { - WSTAT_INCR(c, proxy_conn_errors, 1); + WSTAT_INCR(c->thread, proxy_conn_errors, 1); if (!resp_start(c)) { conn_set_state(c, conn_closing); return; @@ -809,7 +852,7 @@ if (c->item == NULL) { lua_settop(L, 0); proxy_out_errstring(c->resp, "out of memory"); - WSTAT_DECR(c, proxy_req_active, 1); + WSTAT_DECR(c->thread, proxy_req_active, 1); return; } c->item_malloced = true; @@ -876,7 +919,7 @@ io_pending_proxy_t *p = do_cache_alloc(c->thread->io_cache); if (p == NULL) { - WSTAT_INCR(c, proxy_conn_oom, 1); + WSTAT_INCR(c->thread, proxy_conn_oom, 1); proxy_lua_error(Lc, "out of memory allocating from IO cache"); return; } @@ -892,6 +935,8 @@ p->client_resp = r; p->flushed = false; p->ascii_multiget = rq->ascii_multiget; + p->return_cb = proxy_return_cb; + p->finalize_cb = proxy_finalize_cb; resp->io_pending = (io_pending_t *)p; // top of the main thread should be our coroutine. 
diff -Nru memcached-1.6.18/proto_proxy.h memcached-1.6.19/proto_proxy.h --- memcached-1.6.18/proto_proxy.h 2023-01-11 06:10:10.000000000 +0000 +++ memcached-1.6.19/proto_proxy.h 2023-03-08 21:34:27.000000000 +0000 @@ -13,6 +13,7 @@ // TODO: need better names or a better interface for these. can be confusing // to reason about the order. void proxy_start_reload(void *arg); +int proxy_first_confload(void *arg); int proxy_load_config(void *arg); void proxy_worker_reload(void *arg, LIBEVENT_THREAD *thr); diff -Nru memcached-1.6.18/proto_text.c memcached-1.6.19/proto_text.c --- memcached-1.6.18/proto_text.c 2023-01-11 06:10:10.000000000 +0000 +++ memcached-1.6.19/proto_text.c 2023-03-08 21:34:27.000000000 +0000 @@ -168,10 +168,11 @@ } out_string(c, "CLIENT_ERROR bad data chunk"); } else { - ret = store_item(it, comm, c); + uint64_t cas = 0; + c->thread->cur_sfd = c->sfd; // cuddle sfd for logging. + ret = store_item(it, comm, c->thread, &cas, c->set_stale); #ifdef ENABLE_DTRACE - uint64_t cas = ITEM_get_cas(it); switch (c->cmd) { case NREAD_ADD: MEMCACHED_COMMAND_ADD(c->sfd, ITEM_key(it), it->nkey, @@ -201,6 +202,7 @@ #endif if (c->mset_res) { + c->cas = cas; _finalize_mset(c, ret); } else { switch (ret) { @@ -565,7 +567,7 @@ goto stop; } - it = limited_get(key, nkey, c, exptime, should_touch, DO_UPDATE, &overflow); + it = limited_get(key, nkey, c->thread, exptime, should_touch, DO_UPDATE, &overflow); if (settings.detail_enabled) { stats_prefix_record_get(key, nkey, NULL != it); } @@ -844,7 +846,7 @@ } bool overflow; // not used here. - item *it = limited_get(key, nkey, c, 0, false, DONT_UPDATE, &overflow); + item *it = limited_get(key, nkey, c->thread, 0, false, DONT_UPDATE, &overflow); if (it) { mc_resp *resp = c->resp; size_t total = 0; @@ -1093,10 +1095,10 @@ // I think we do, since an overflow shouldn't trigger an alloc/replace. 
bool overflow = false; if (!of.locked) { - it = limited_get(key, nkey, c, 0, false, !of.no_update, &overflow); + it = limited_get(key, nkey, c->thread, 0, false, !of.no_update, &overflow); } else { // If we had to lock the item, we're doing our own bump later. - it = limited_get_locked(key, nkey, c, DONT_UPDATE, &hv, &overflow); + it = limited_get_locked(key, nkey, c->thread, DONT_UPDATE, &hv, &overflow); } // Since we're a new protocol, we can actually inform users that refcount @@ -1294,7 +1296,7 @@ if (of.locked) { // Delayed bump so we could get fetched/last access time pre-update. if (!of.no_update && it != NULL) { - do_item_bump(c, it, hv); + do_item_bump(c->thread, it, hv); } item_unlock(hv); } @@ -1513,7 +1515,7 @@ /* Avoid stale data persisting in cache because we failed alloc. */ // NOTE: only if SET mode? - it = item_get_locked(key, nkey, c, DONT_UPDATE, &hv); + it = item_get_locked(key, nkey, c->thread, DONT_UPDATE, &hv); if (it) { do_item_unlink(it, hv); STORAGE_delete(c->thread->storage, it); @@ -1620,7 +1622,7 @@ } } - it = item_get_locked(key, nkey, c, DONT_UPDATE, &hv); + it = item_get_locked(key, nkey, c->thread, DONT_UPDATE, &hv); if (it) { MEMCACHED_COMMAND_DELETE(c->sfd, ITEM_key(it), it->nkey); @@ -1759,7 +1761,7 @@ // return a referenced item if it exists, so we can modify it here, rather // than adding even more parameters to do_add_delta. bool item_created = false; - switch(do_add_delta(c, key, nkey, incr, of.delta, tmpbuf, &of.req_cas_id, hv, &it)) { + switch(do_add_delta(c->thread, key, nkey, incr, of.delta, tmpbuf, &of.req_cas_id, hv, &it)) { case OK: if (c->noreply) resp->skip = true; @@ -1782,7 +1784,7 @@ if (it != NULL) { memcpy(ITEM_data(it), tmpbuf, vlen); memcpy(ITEM_data(it) + vlen, "\r\n", 2); - if (do_store_item(it, NREAD_ADD, c, hv)) { + if (do_store_item(it, NREAD_ADD, c->thread, hv, NULL, CAS_NO_STALE)) { item_created = true; } else { // Not sure how we can get here if we're holding the lock. 
@@ -1987,7 +1989,7 @@ /* Avoid stale data persisting in cache because we failed alloc. * Unacceptable for SET. Anywhere else too? */ if (comm == NREAD_SET) { - it = item_get(key, nkey, c, DONT_UPDATE); + it = item_get(key, nkey, c->thread, DONT_UPDATE); if (it) { item_unlink(it); STORAGE_delete(c->thread->storage, it); @@ -2039,7 +2041,7 @@ } exptime = realtime(EXPTIME_TO_POSITIVE_TIME(exptime_int)); - it = item_touch(key, nkey, exptime, c); + it = item_touch(key, nkey, exptime, c->thread); if (it) { pthread_mutex_lock(&c->thread->stats.mutex); c->thread->stats.touch_cmds++; @@ -2081,7 +2083,7 @@ return; } - switch(add_delta(c, key, nkey, incr, delta, temp, NULL)) { + switch(add_delta(c->thread, key, nkey, incr, delta, temp, NULL)) { case OK: out_string(c, temp); break; @@ -2141,7 +2143,7 @@ stats_prefix_record_delete(key, nkey); } - it = item_get_locked(key, nkey, c, DONT_UPDATE, &hv); + it = item_get_locked(key, nkey, c->thread, DONT_UPDATE, &hv); if (it) { MEMCACHED_COMMAND_DELETE(c->sfd, ITEM_key(it), it->nkey); @@ -2646,6 +2648,41 @@ break; } return; + } else if (ntokens == 4 && strcmp(tokens[COMMAND_TOKEN + 1].value, "mgdump") == 0) { + if (settings.lru_crawler == false) { + out_string(c, "CLIENT_ERROR lru crawler disabled"); + return; + } + if (!settings.dump_enabled) { + out_string(c, "ERROR key dump not allowed"); + return; + } + if (resp_has_stack(c)) { + out_string(c, "ERROR cannot pipeline other commands before mgdump"); + return; + } + + int rv = lru_crawler_crawl(tokens[2].value, CRAWLER_MGDUMP, + c, c->sfd, LRU_CRAWLER_CAP_REMAINING); + switch(rv) { + case CRAWLER_OK: + conn_set_state(c, conn_watch); + event_del(&c->event); + break; + case CRAWLER_RUNNING: + out_string(c, "BUSY currently processing crawler request"); + break; + case CRAWLER_BADCLASS: + out_string(c, "BADCLASS invalid class id"); + break; + case CRAWLER_NOTSTARTED: + out_string(c, "NOTSTARTED no items to crawl"); + break; + case CRAWLER_ERROR: + out_string(c, "ERROR an unknown error 
happened"); + break; + } + return; } else if (ntokens == 4 && strcmp(tokens[COMMAND_TOKEN + 1].value, "tocrawl") == 0) { uint32_t tocrawl; if (!safe_strtoul(tokens[2].value, &tocrawl)) { @@ -2734,6 +2771,7 @@ return; } + c->thread->cur_sfd = c->sfd; // cuddle sfd for logging. ntokens = tokenize_command(command, tokens, MAX_TOKENS); // All commands need a minimum of two tokens: cmd and NULL finalizer // There are also no valid commands shorter than two bytes. diff -Nru memcached-1.6.18/proxy_await.c memcached-1.6.19/proxy_await.c --- memcached-1.6.18/proxy_await.c 2023-01-11 06:10:10.000000000 +0000 +++ memcached-1.6.19/proxy_await.c 2023-03-08 21:34:27.000000000 +0000 @@ -9,8 +9,10 @@ int argtable_ref; // need to hold refs to any potential hash selectors int restable_ref; // table of result objects int coro_ref; // reference to parent coroutine + int detail_ref; // reference to detail string. enum mcp_await_e type; bool completed; // have we completed the parent coroutine or not + bool logerr; // create log_req entries for error responses mcp_request_t *rq; mc_resp *resp; // the top level mc_resp to fill in (as if we were an iop) } mcp_await_t; @@ -23,12 +25,13 @@ // local restable = mcp.await(request, pools, num_wait) // NOTE: need to hold onto the pool objects since those hold backend // references. Here we just keep a reference to the argument table. -int mcplib_await(lua_State *L) { +static int _mcplib_await(lua_State *L, bool logerr) { mcp_request_t *rq = luaL_checkudata(L, 1, "mcp.request"); luaL_checktype(L, 2, LUA_TTABLE); int n = 0; // length of table of pools int wait_for = 0; // 0 means wait for all responses enum mcp_await_e type = AWAIT_GOOD; + int detail_ref = 0; lua_pushnil(L); // init table key while (lua_next(L, 2) != 0) { @@ -41,6 +44,11 @@ proxy_lua_error(L, "mcp.await arguments must have at least one pool"); } + if (lua_isstring(L, 5)) { + // pops the detail string. 
+ detail_ref = luaL_ref(L, LUA_REGISTRYINDEX); + } + if (lua_isnumber(L, 4)) { type = lua_tointeger(L, 4); lua_pop(L, 1); @@ -86,10 +94,22 @@ aw->argtable_ref = argtable_ref; aw->rq = rq; aw->req_ref = req_ref; + aw->detail_ref = detail_ref; aw->type = type; + aw->logerr = logerr; P_DEBUG("%s: about to yield [len: %d]\n", __func__, n); - return lua_yield(L, 1); + lua_pushinteger(L, MCP_YIELD_AWAIT); + return lua_yield(L, 2); +} + +// default await, no logging. +int mcplib_await(lua_State *L) { + return _mcplib_await(L, false); +} + +int mcplib_await_logerrors(lua_State *L) { + return _mcplib_await(L, true); } static void mcp_queue_await_io(conn *c, lua_State *Lc, mcp_request_t *rq, int await_ref, bool await_first) { @@ -131,7 +151,7 @@ io_pending_proxy_t *p = do_cache_alloc(c->thread->io_cache); if (p == NULL) { - WSTAT_INCR(c, proxy_conn_oom, 1); + WSTAT_INCR(c->thread, proxy_conn_oom, 1); proxy_lua_error(Lc, "out of memory allocating from IO cache"); return; } @@ -147,6 +167,8 @@ p->client_resp = r; p->flushed = false; p->ascii_multiget = rq->ascii_multiget; + p->return_cb = proxy_return_cb; + p->finalize_cb = proxy_finalize_cb; // io_p needs to hold onto its own response reference, because we may or // may not include it in the final await() result. @@ -183,7 +205,7 @@ io_pending_proxy_t *p = do_cache_alloc(c->thread->io_cache); if (p == NULL) { - WSTAT_INCR(c, proxy_conn_oom, 1); + WSTAT_INCR(c->thread, proxy_conn_oom, 1); proxy_lua_error(Lc, "out of memory allocating from IO cache"); return; } @@ -201,6 +223,8 @@ p->is_await = true; p->await_ref = await_ref; p->await_background = true; + p->return_cb = proxy_return_cb; + p->finalize_cb = proxy_finalize_cb; // Dummy IO has no backend, and no request attached. @@ -218,7 +242,7 @@ // places. Else these errors currently crash the daemon. 
int mcplib_await_run(conn *c, mc_resp *resp, lua_State *L, int coro_ref) { P_DEBUG("%s: start\n", __func__); - WSTAT_INCR(c, proxy_await_active, 1); + WSTAT_INCR(c->thread, proxy_await_active, 1); mcp_await_t *aw = lua_touserdata(L, -1); int await_ref = luaL_ref(L, LUA_REGISTRYINDEX); // await is popped. assert(aw != NULL); @@ -235,7 +259,11 @@ const char *key = MCP_PARSER_KEY(rq->pr); size_t len = rq->pr.klen; int n = 0; - bool await_first = true; + // TODO (v3) await_first is used as a marker for upping the "wait for + // IO's" queue count, which means we need to force it off if we're in + // background mode, else we would accidentally wait for a response anyway. + // This note is for finding a less convoluted method for this. + bool await_first = (aw->type == AWAIT_BACKGROUND) ? false : true; // loop arg table and run each hash selector lua_pushnil(L); // -> 3 while (lua_next(L, 1) != 0) { @@ -245,11 +273,10 @@ if (pp == NULL) { proxy_lua_error(L, "mcp.await must be supplied with a pool"); } - mcp_pool_t *p = pp->main; // NOTE: rq->be is only held to help pass the backend into the IOP in // mcp_queue call. Could be a local variable and an argument too. - rq->be = mcplib_pool_proxy_call_helper(L, p, key, len); + rq->be = mcplib_pool_proxy_call_helper(L, pp, key, len); mcp_queue_await_io(c, L, rq, await_ref, await_first); await_first = false; @@ -362,7 +389,7 @@ } // note that post-completion, we stop gathering responses into the - // resposne table... because it's already been returned. + // response table... because it's already been returned. // So "valid" can only be true if also !completed if (aw->pending == 0) { if (!aw->completed) { @@ -393,6 +420,30 @@ p->client_resp->elapsed = (end.tv_sec - p->client_resp->start.tv_sec) * 1000000 + (end.tv_usec - p->client_resp->start.tv_usec); + // instructed to generate log_req entries for each failed request, + // this is useful to do here as these can be asynchronous. + // NOTE: this may be a temporary feature. 
+ if (aw->logerr && p->client_resp->status != MCMC_OK && aw->completed) { + size_t dlen = 0; + const char *detail = NULL; + logger *l = p->thread->l; + // only process logs if someone is listening. + if (l->eflags & LOG_PROXYREQS) { + lua_rawgeti(L, LUA_REGISTRYINDEX, aw->req_ref); + mcp_request_t *rq = lua_touserdata(L, -1); + lua_pop(L, 1); // references still held, just clearing stack. + mcp_resp_t *rs = p->client_resp; + + if (aw->detail_ref) { + lua_rawgeti(L, LUA_REGISTRYINDEX, aw->detail_ref); + detail = luaL_tolstring(L, -1, &dlen); + lua_pop(L, 1); + } + + logger_log(l, LOGGER_PROXY_REQ, NULL, rq->pr.request, rq->pr.reqlen, rs->elapsed, rs->resp.type, rs->resp.code, rs->status, detail, dlen, rs->be_name, rs->be_port); + } + } + luaL_unref(L, LUA_REGISTRYINDEX, p->mcpres_ref); } // our await_ref is shared, so we don't need to release it. @@ -425,7 +476,10 @@ luaL_unref(L, LUA_REGISTRYINDEX, aw->argtable_ref); luaL_unref(L, LUA_REGISTRYINDEX, aw->req_ref); luaL_unref(L, LUA_REGISTRYINDEX, p->await_ref); - WSTAT_DECR(p->c, proxy_await_active, 1); + if (aw->detail_ref) { + luaL_unref(L, LUA_REGISTRYINDEX, aw->detail_ref); + } + WSTAT_DECR(p->thread, proxy_await_active, 1); } // Just remove anything we could have left on the primary VM stack diff -Nru memcached-1.6.18/proxy_config.c memcached-1.6.19/proxy_config.c --- memcached-1.6.18/proxy_config.c 2023-01-11 06:10:10.000000000 +0000 +++ memcached-1.6.19/proxy_config.c 2023-03-08 21:34:27.000000000 +0000 @@ -41,11 +41,40 @@ void proxy_start_reload(void *arg) { proxy_ctx_t *ctx = arg; if (pthread_mutex_trylock(&ctx->config_lock) == 0) { + ctx->loading = true; pthread_cond_signal(&ctx->config_cond); pthread_mutex_unlock(&ctx->config_lock); } } +int proxy_first_confload(void *arg) { + proxy_ctx_t *ctx = arg; + pthread_mutex_lock(&ctx->config_lock); + ctx->loading = true; + pthread_cond_signal(&ctx->config_cond); + pthread_mutex_unlock(&ctx->config_lock); + + while (1) { + bool stop = false; + 
pthread_mutex_lock(&ctx->config_lock); + if (!ctx->loading) { + stop = true; + } + pthread_mutex_unlock(&ctx->config_lock); + if (stop) + break; + } + int fails = 0; + STAT_L(ctx); + fails = ctx->global_stats.config_reload_fails; + STAT_UL(ctx); + if (fails) { + return -1; + } + + return 0; +} + // Manages a queue of inbound objects destined to be deallocated. static void *_proxy_manager_thread(void *arg) { proxy_ctx_t *ctx = arg; @@ -108,6 +137,7 @@ logger_create(); pthread_mutex_lock(&ctx->config_lock); while (1) { + ctx->loading = false; pthread_cond_wait(&ctx->config_cond, &ctx->config_lock); LOGGER_LOG(NULL, LOG_PROXYEVENTS, LOGGER_PROXY_CONFIG, NULL, "start"); STAT_INCR(ctx, config_reloads, 1); @@ -233,7 +263,7 @@ return 0; } -static int _copy_pool(lua_State *from, lua_State *to) { +static int _copy_pool(lua_State *from, lua_State *to, LIBEVENT_THREAD *thr) { // from, -3 should have he userdata. mcp_pool_t *p = luaL_checkudata(from, -3, "mcp.pool"); size_t size = sizeof(mcp_pool_proxy_t); @@ -241,16 +271,22 @@ luaL_setmetatable(to, "mcp.pool_proxy"); pp->main = p; + if (p->use_iothread) { + pp->pool = p->pool; + } else { + // allow 0 indexing for backends when unique to each worker thread + pp->pool = &p->pool[thr->thread_baseid * p->pool_size]; + } pthread_mutex_lock(&p->lock); p->refcount++; pthread_mutex_unlock(&p->lock); return 0; } -static void _copy_config_table(lua_State *from, lua_State *to); +static void _copy_config_table(lua_State *from, lua_State *to, LIBEVENT_THREAD *thr); // (from, -1) is the source value // should end with (to, -1) being the new value. 
-static void _copy_config_table(lua_State *from, lua_State *to) { +static void _copy_config_table(lua_State *from, lua_State *to, LIBEVENT_THREAD *thr) { int type = lua_type(from, -1); bool found = false; luaL_checkstack(from, 4, "configuration error: table recursion too deep"); @@ -266,7 +302,7 @@ if (lua_rawget(from, -2) != LUA_TNIL) { const char *name = lua_tostring(from, -1); if (strcmp(name, "mcp.pool") == 0) { - _copy_pool(from, to); + _copy_pool(from, to, thr); found = true; } } @@ -323,7 +359,7 @@ // lua_settable(to, n) - n being the table // takes -2 key -1 value, pops both. // use lua_absindex(L, -1) and so to convert easier? - _copy_config_table(from, to); // push next value. + _copy_config_table(from, to, thr); // push next value. lua_settable(to, nt); lua_pop(from, 1); // drop value, keep key. } @@ -385,7 +421,7 @@ // If the setjump/longjump combos are compatible a pcall for from and // atpanic for to might work best, since the config VM is/should be long // running and worker VM's should be rotated. - _copy_config_table(ctx->proxy_state, L); + _copy_config_table(ctx->proxy_state, L, thr); // copied value is in front of route function, now call it. 
if (lua_pcall(L, 1, 1, 0) != LUA_OK) { diff -Nru memcached-1.6.18/proxy.h memcached-1.6.19/proxy.h --- memcached-1.6.18/proxy.h 2023-01-11 06:10:10.000000000 +0000 +++ memcached-1.6.19/proxy.h 2023-03-08 21:34:27.000000000 +0000 @@ -2,6 +2,7 @@ #define PROXY_H #include "memcached.h" +#include "extstore.h" #include #include #include @@ -42,15 +43,15 @@ #define WSTAT_L(t) pthread_mutex_lock(&t->stats.mutex); #define WSTAT_UL(t) pthread_mutex_unlock(&t->stats.mutex); -#define WSTAT_INCR(c, stat, amount) { \ - pthread_mutex_lock(&c->thread->stats.mutex); \ - c->thread->stats.stat += amount; \ - pthread_mutex_unlock(&c->thread->stats.mutex); \ +#define WSTAT_INCR(t, stat, amount) { \ + pthread_mutex_lock(&t->stats.mutex); \ + t->stats.stat += amount; \ + pthread_mutex_unlock(&t->stats.mutex); \ } -#define WSTAT_DECR(c, stat, amount) { \ - pthread_mutex_lock(&c->thread->stats.mutex); \ - c->thread->stats.stat -= amount; \ - pthread_mutex_unlock(&c->thread->stats.mutex); \ +#define WSTAT_DECR(t, stat, amount) { \ + pthread_mutex_lock(&t->stats.mutex); \ + t->stats.stat -= amount; \ + pthread_mutex_unlock(&t->stats.mutex); \ } #define STAT_L(ctx) pthread_mutex_lock(&ctx->stats_lock); #define STAT_UL(ctx) pthread_mutex_unlock(&ctx->stats_lock); @@ -80,6 +81,10 @@ #define MCP_BACKEND_UPVALUE 3 #define MCP_CONTEXT_UPVALUE 4 +#define MCP_YIELD_POOL 1 +#define MCP_YIELD_AWAIT 2 +#define MCP_YIELD_LOCAL 3 + // all possible commands. #define CMD_FIELDS \ X(CMD_MG) \ @@ -192,7 +197,7 @@ typedef struct { lua_State *proxy_state; void *proxy_code; - proxy_event_thread_t *proxy_threads; + proxy_event_thread_t *proxy_io_thread; pthread_mutex_t config_lock; pthread_cond_t config_cond; pthread_t config_tid; @@ -205,6 +210,7 @@ bool worker_done; // signal variable for the worker lock/cond system. bool worker_failed; // covered by worker_lock as well. bool use_uring; // use IO_URING for backend connections. + bool loading; // bool indicating an active config load. 
struct proxy_global_stats global_stats; struct proxy_user_stats user_stats; struct proxy_tunables tunables; // NOTE: updates covered by stats_lock @@ -304,6 +310,7 @@ char port[MAX_PORTLEN+1]; char label[MAX_LABELLEN+1]; size_t llen; // cache label length for small speedup in pool creation. + struct proxy_tunables tunables; }; // lua object wrapper meant to own a malloc'ed conn structure @@ -318,7 +325,8 @@ // FIXME: inline the mcmc client data. // TODO: event_thread -> something? union of owner type? struct mcp_backend_s { - int depth; + int depth; // total number of requests in queue + int pending_read; // number of requests written to socket, pending read. int failed_count; // number of fails (timeouts) in a row proxy_event_thread_t *event_thread; // event thread owning this backend. void *client; // mcmc client @@ -328,7 +336,10 @@ io_pending_proxy_t *io_next; // next request to write. char *rbuf; // statically allocated read buffer. size_t rbufused; // currently active bytes in the buffer - struct event event; // libevent + struct event main_event; // libevent: changes role, mostly for main read events + struct event write_event; // libevent: only used when socket wbuf full + struct event timeout_event; // libevent: alarm for pending reads + struct proxy_tunables tunables; #ifdef HAVE_LIBURING proxy_event_t ur_rd_ev; // liburing. proxy_event_t ur_wr_ev; // need a separate event/cb for writing/polling @@ -342,6 +353,7 @@ bool can_write; // recently got a WANT_WRITE or are connecting. bool stacked; // if backend already queued for syscalls. bool bad; // timed out, marked as bad. + bool use_io_thread; // note if this backend is worker-local or not. struct iovec write_iovs[BE_IOV_MAX]; // iovs to stage batched writes char name[MAX_NAMELEN+1]; char port[MAX_PORTLEN+1]; @@ -353,13 +365,11 @@ pthread_t thread_id; struct event_base *base; struct event notify_event; // listen event for the notify pipe/eventfd. 
- struct event clock_event; // timer for updating event thread data. struct event beconn_event; // listener for backends in connect state #ifdef HAVE_LIBURING struct io_uring ring; proxy_event_t ur_notify_event; // listen on eventfd. proxy_event_t ur_benotify_event; // listen on eventfd for backend connections. - proxy_event_t ur_clock_event; // timer for updating event thread data. eventfd_t event_counter; eventfd_t beevent_counter; bool use_uring; @@ -379,7 +389,6 @@ int be_notify_send_fd; #endif proxy_ctx_t *ctx; // main context. - struct proxy_tunables tunables; // periodically copied from main ctx }; enum mcp_resp_mode { @@ -392,6 +401,8 @@ typedef struct { mcmc_resp_t resp; char *buf; // response line + potentially value. + mc_resp *cresp; // client mc_resp object during extstore fetches. + LIBEVENT_THREAD *thread; // cresp's owner thread needed for extstore cleanup. size_t blen; // total size of the value to read. struct timeval start; // time this object was created. long elapsed; // time elapsed once handled. @@ -405,28 +416,50 @@ // re-cast an io_pending_t into this more descriptive structure. // the first few items _must_ match the original struct. +#define IO_PENDING_TYPE_PROXY 0 +#define IO_PENDING_TYPE_EXTSTORE 1 struct _io_pending_proxy_t { int io_queue_type; LIBEVENT_THREAD *thread; conn *c; - mc_resp *resp; // original struct ends here + mc_resp *resp; + io_queue_cb return_cb; // called on worker thread. + io_queue_cb finalize_cb; // called back on the worker thread. 
+ // original struct ends here - struct _io_pending_proxy_t *next; // stack for IO submission - STAILQ_ENTRY(_io_pending_proxy_t) io_next; // stack for backends + int io_type; // extstore IO or backend IO int coro_ref; // lua registry reference to the coroutine - int mcpres_ref; // mcp.res reference used for await() lua_State *coro; // pointer directly to the coroutine - mcp_backend_t *backend; // backend server to request from - struct iovec iov[2]; // request string + tail buffer - int iovcnt; // 1 or 2... - unsigned int iovbytes; // total bytes in the iovec - int await_ref; // lua reference if we were an await object - mcp_resp_t *client_resp; // reference (currently pointing to a lua object) - bool flushed; // whether we've fully written this request to a backend. - bool ascii_multiget; // passed on from mcp_r_t - bool is_await; // are we an await object? - bool await_first; // are we the main route for an await object? - bool await_background; // dummy IO for backgrounded awaits + union { + // extstore IO. + struct { + obj_io eio; + item *hdr_it; + mc_resp *tresp; // temporary mc_resp for storage to fill. + int gettype; + int iovec_data; + bool miss; + bool badcrc; + bool active; + }; + // backend request IO + struct { + struct _io_pending_proxy_t *next; // stack for IO submission + STAILQ_ENTRY(_io_pending_proxy_t) io_next; // stack for backends + int mcpres_ref; // mcp.res reference used for await() + mcp_backend_t *backend; // backend server to request from + struct iovec iov[2]; // request string + tail buffer + int iovcnt; // 1 or 2... + unsigned int iovbytes; // total bytes in the iovec + int await_ref; // lua reference if we were an await object + mcp_resp_t *client_resp; // reference (currently pointing to a lua object) + bool flushed; // whether we've fully written this request to a backend. + bool ascii_multiget; // passed on from mcp_r_t + bool is_await; // are we an await object? + bool await_first; // are we the main route for an await object? 
+ bool await_background; // dummy IO for backgrounded awaits + }; + }; }; // Note: does *be have to be a sub-struct? how stable are userdata pointers? @@ -447,21 +480,25 @@ proxy_ctx_t *ctx; // main context. STAILQ_ENTRY(mcp_pool_s) next; // stack for deallocator. char key_filter_conf[KEY_HASH_FILTER_MAX+1]; + char beprefix[MAX_LABELLEN+1]; // TODO: should probably be shorter. uint64_t hash_seed; // calculated from a string. int refcount; int phc_ref; int self_ref; // TODO (v2): double check that this is needed. int pool_size; + bool use_iothread; mcp_pool_be_t pool[]; }; typedef struct { mcp_pool_t *main; // ptr to original + mcp_pool_be_t *pool; // ptr to main->pool starting offset for owner thread. } mcp_pool_proxy_t; // networking interface -void proxy_init_evthread_events(proxy_event_thread_t *t); +void proxy_init_event_thread(proxy_event_thread_t *t, proxy_ctx_t *ctx, struct event_base *base); void *proxy_event_thread(void *arg); +void proxy_run_backend_queue(be_head_t *head); // await interface enum mcp_await_e { @@ -473,9 +510,14 @@ AWAIT_BACKGROUND, // returns as soon as background jobs are dispatched }; int mcplib_await(lua_State *L); +int mcplib_await_logerrors(lua_State *L); int mcplib_await_run(conn *c, mc_resp *resp, lua_State *L, int coro_ref); int mcplib_await_return(io_pending_proxy_t *p); +// internal request interface +int mcplib_internal(lua_State *L); +int mcplib_internal_run(lua_State *L, conn *c, mc_resp *top_resp, int coro_ref); + // user stats interface int mcplib_add_stat(lua_State *L); int mcplib_stat(lua_State *L); @@ -499,11 +541,12 @@ int mcplib_open_dist_ring_hash(lua_State *L); int proxy_run_coroutine(lua_State *Lc, mc_resp *resp, io_pending_proxy_t *p, conn *c); -mcp_backend_t *mcplib_pool_proxy_call_helper(lua_State *L, mcp_pool_t *p, const char *key, size_t len); +mcp_backend_t *mcplib_pool_proxy_call_helper(lua_State *L, mcp_pool_proxy_t *pp, const char *key, size_t len); void mcp_request_attach(lua_State *L, mcp_request_t *rq, 
io_pending_proxy_t *p); int mcp_request_render(mcp_request_t *rq, int idx, const char *tok, size_t len); void proxy_lua_error(lua_State *L, const char *s); void proxy_lua_ferror(lua_State *L, const char *fmt, ...); +void proxy_out_errstring(mc_resp *resp, const char *str); int _start_proxy_config_threads(proxy_ctx_t *ctx); int proxy_thread_loadconf(proxy_ctx_t *ctx, LIBEVENT_THREAD *thr); diff -Nru memcached-1.6.18/proxy_internal.c memcached-1.6.19/proxy_internal.c --- memcached-1.6.18/proxy_internal.c 1970-01-01 00:00:00.000000000 +0000 +++ memcached-1.6.19/proxy_internal.c 2023-03-08 21:34:27.000000000 +0000 @@ -0,0 +1,1698 @@ +/* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// Functions related to local command execution. + +#include "proxy.h" +#include "storage.h" + +#define PROXY_STORAGE_GET 0 +#define PROXY_STORAGE_MG 1 +#define _DO_CAS true +#define _NO_CAS false +#define _DO_TOUCH true +#define _NO_TOUCH false + +static int _store_item_copy_from_buf(item *d_it, char *buf, const int len) { + if (d_it->it_flags & ITEM_CHUNKED) { + item_chunk *dch = (item_chunk *) ITEM_schunk(d_it); + int done = 0; + // Fill dch's via a flat data buffer + while (len > done && dch) { + int todo = (dch->size - dch->used < len - done) + ? dch->size - dch->used : len - done; + memcpy(dch->data + dch->used, buf + done, todo); + done += todo; + dch->used += todo; + assert(dch->used <= dch->size); + + if (dch->size == dch->used) { + item_chunk *tch = do_item_alloc_chunk(dch, len - done); + if (tch) { + dch = tch; + } else { + return -1; + } + } + } + assert(len == done); + } else { + memcpy(ITEM_data(d_it), buf, len); + } + + return 0; +} + +// TODO (v2): out_string() needs to change to just take a *resp, but I don't +// want to do the huge refactor in this change series. So for now we have a +// custom out_string(). 
+static void pout_string(mc_resp *resp, const char *str) { + size_t len; + bool skip = resp->skip; + assert(resp != NULL); + + // if response was original filled with something, but we're now writing + // out an error or similar, have to reset the object first. + resp_reset(resp); + + // We blank the response "just in case", but if we're not intending on + // sending it lets not rewrite it. + if (skip) { + resp->skip = true; + return; + } + + // Fill response object with static string. + + len = strlen(str); + if ((len + 2) > WRITE_BUFFER_SIZE) { + /* ought to be always enough. just fail for simplicity */ + str = "SERVER_ERROR output line too long"; + len = strlen(str); + } + + memcpy(resp->wbuf, str, len); + memcpy(resp->wbuf + len, "\r\n", 2); + resp_add_iov(resp, resp->wbuf, len + 2); + + return; +} + +// For meta commands error strings override the quiet flag. +static void pout_errstring(mc_resp *resp, const char *str) { + resp->skip = false; + pout_string(resp, str); +} + +#ifdef EXTSTORE +static void _storage_get_item_cb(void *e, obj_io *eio, int ret) { + io_pending_proxy_t *io = (io_pending_proxy_t *)eio->data; + assert(io->active == true); + mc_resp *resp = io->tresp; + item *read_it = (item *)eio->buf; + bool miss = false; + + if (ret < 1) { + miss = true; + } else { + uint32_t crc2; + uint32_t crc = (uint32_t) read_it->exptime; + crc2 = crc32c(0, (char *)read_it+STORE_OFFSET, eio->len-STORE_OFFSET); + + if (crc != crc2) { + miss = true; + io->badcrc = true; + } + } + + if (miss && !resp->skip) { + resp->iovcnt = 1; + if (io->gettype == PROXY_STORAGE_GET) { + resp->iov[0].iov_len = 5; + resp->iov[0].iov_base = "END\r\n"; + resp->tosend = 5; + } else if (io->gettype == PROXY_STORAGE_MG) { + resp->iov[0].iov_len = 4; + resp->iov[0].iov_base = "EN\r\n"; + resp->tosend = 5; + } else { + assert(1 == 0); + } + } + + if (!miss) { + resp->iov[io->iovec_data].iov_base = ITEM_data(read_it); + } + io->miss = miss; + io->active = false; + + // in proxy mode we tend to 
return IO's as they happen so we can keep + // latency down more. + return_io_pending((io_pending_t *)io); +} + +// TODO (v2): if the item is smaller than resp->wbuf[] shouldn't we just read +// directly into there? item only necessary for recache. +static int proxy_storage_get(LIBEVENT_THREAD *t, item *it, mc_resp *resp, + int type) { +#ifdef NEED_ALIGN + item_hdr hdr; + memcpy(&hdr, ITEM_data(it), sizeof(hdr)); +#else + item_hdr *hdr = (item_hdr *)ITEM_data(it); +#endif + size_t ntotal = ITEM_ntotal(it); + + io_pending_proxy_t *io = do_cache_alloc(t->io_cache); + // this is a re-cast structure, so assert that we never outsize it. + assert(sizeof(io_pending_t) >= sizeof(io_pending_proxy_t)); + memset(io, 0, sizeof(io_pending_proxy_t)); + io->active = true; + // io_pending owns the reference for this object now. + io->hdr_it = it; + io->tresp = resp; // our mc_resp is a temporary object. + io->io_queue_type = IO_QUEUE_EXTSTORE; + io->io_type = IO_PENDING_TYPE_EXTSTORE; // proxy specific sub-type. + io->gettype = type; + io->thread = t; + io->return_cb = proxy_return_cb; + io->finalize_cb = proxy_finalize_cb; + obj_io *eio = &io->eio; + + eio->buf = malloc(ntotal); + if (eio->buf == NULL) { + do_cache_free(t->io_cache, io); + return -1; + } + + io->iovec_data = resp->iovcnt; + resp_add_iov(resp, "", it->nbytes); + + // We can't bail out anymore, so mc_resp owns the IO from here. + resp->io_pending = (io_pending_t *)io; + + // reference ourselves for the callback. + eio->data = (void *)io; + + // Now, fill in io->io based on what was in our header. 
+#ifdef NEED_ALIGN + eio->page_version = hdr.page_version; + eio->page_id = hdr.page_id; + eio->offset = hdr.offset; +#else + eio->page_version = hdr->page_version; + eio->page_id = hdr->page_id; + eio->offset = hdr->offset; +#endif + eio->len = ntotal; + eio->mode = OBJ_IO_READ; + eio->cb = _storage_get_item_cb; + + pthread_mutex_lock(&t->stats.mutex); + t->stats.get_extstore++; + pthread_mutex_unlock(&t->stats.mutex); + + return 0; +} +#endif // EXTSTORE + +/* client flags == 0 means use no storage for client flags */ +static inline int make_ascii_get_suffix(char *suffix, item *it, bool return_cas, int nbytes) { + char *p = suffix; + *p = ' '; + p++; + if (FLAGS_SIZE(it) == 0) { + *p = '0'; + p++; + } else { + p = itoa_u32(*((uint32_t *) ITEM_suffix(it)), p); + } + *p = ' '; + p = itoa_u32(nbytes-2, p+1); + + if (return_cas) { + *p = ' '; + p = itoa_u64(ITEM_get_cas(it), p+1); + } + + *p = '\r'; + *(p+1) = '\n'; + *(p+2) = '\0'; + return (p - suffix) + 2; +} + +static void process_get_cmd(LIBEVENT_THREAD *t, mcp_parser_t *pr, mc_resp *resp, bool return_cas, bool should_touch) { + const char *key = &pr->request[pr->tokens[pr->keytoken]]; + int nkey = pr->klen; + rel_time_t exptime = 0; + bool overflow = false; // unused. 
+ + if (nkey > KEY_MAX_LENGTH) { + pout_string(resp, "CLIENT_ERROR bad command line format"); + return; + } + + item *it = limited_get(key, nkey, t, exptime, should_touch, DO_UPDATE, &overflow); + if (it) { + int nbytes = it->nbytes;; + nbytes = it->nbytes; + char *p = resp->wbuf; + memcpy(p, "VALUE ", 6); + p += 6; + memcpy(p, ITEM_key(it), it->nkey); + p += it->nkey; + p += make_ascii_get_suffix(p, it, return_cas, nbytes); + resp_add_iov(resp, resp->wbuf, p - resp->wbuf); + +#ifdef EXTSTORE + if (it->it_flags & ITEM_HDR) { + if (proxy_storage_get(t, it, resp, PROXY_STORAGE_GET) != 0) { + pthread_mutex_lock(&t->stats.mutex); + t->stats.get_oom_extstore++; + pthread_mutex_unlock(&t->stats.mutex); + + item_remove(it); + proxy_out_errstring(resp, "out of memory writing get response"); + return; + } + } else if ((it->it_flags & ITEM_CHUNKED) == 0) { + resp_add_iov(resp, ITEM_data(it), it->nbytes); + } else { + resp_add_chunked_iov(resp, it, it->nbytes); + } +#else + if ((it->it_flags & ITEM_CHUNKED) == 0) { + resp_add_iov(resp, ITEM_data(it), it->nbytes); + } else { + resp_add_chunked_iov(resp, it, it->nbytes); + } +#endif + + /* item_get() has incremented it->refcount for us */ + pthread_mutex_lock(&t->stats.mutex); + if (should_touch) { + t->stats.touch_cmds++; + t->stats.slab_stats[ITEM_clsid(it)].touch_hits++; + } else { + t->stats.lru_hits[it->slabs_clsid]++; + t->stats.get_cmds++; + } + pthread_mutex_unlock(&t->stats.mutex); +#ifdef EXTSTORE + /* If ITEM_HDR, an io_wrap owns the reference. 
*/ + if ((it->it_flags & ITEM_HDR) == 0) { + resp->item = it; + } +#else + resp->item = it; +#endif + } else { + pthread_mutex_lock(&t->stats.mutex); + if (should_touch) { + t->stats.touch_cmds++; + t->stats.touch_misses++; + } else { + t->stats.get_misses++; + t->stats.get_cmds++; + } + pthread_mutex_unlock(&t->stats.mutex); + } + + resp_add_iov(resp, "END\r\n", 5); + return; +} + +static void process_update_cmd(LIBEVENT_THREAD *t, mcp_parser_t *pr, mc_resp *resp, int comm, bool handle_cas) { + const char *key = &pr->request[pr->tokens[pr->keytoken]]; + size_t nkey = pr->klen; + unsigned int flags; + int32_t exptime_int = 0; + rel_time_t exptime = 0; + uint64_t req_cas_id = 0; + item *it; + + assert(resp != NULL); + + if (nkey > KEY_MAX_LENGTH) { + pout_string(resp, "CLIENT_ERROR bad command line format"); + return; + } + + // TODO (v2): these safe_str* functions operate on C _strings_, but these + // tokens simply end with a space or carriage return/newline, so we either + // need custom functions or validate harder that these calls won't bite us + // later. + if (! (safe_strtoul(&pr->request[pr->tokens[2]], (uint32_t *)&flags) + && safe_strtol(&pr->request[pr->tokens[3]], &exptime_int))) { + pout_string(resp, "CLIENT_ERROR bad command line format"); + return; + } + + exptime = realtime(EXPTIME_TO_POSITIVE_TIME(exptime_int)); + + // does cas value exist? + if (handle_cas) { + if (!safe_strtoull(&pr->request[pr->tokens[5]], &req_cas_id)) { + pout_string(resp, "CLIENT_ERROR bad command line format"); + return; + } + } + + // vlen is validated from the main parser. + + if (settings.detail_enabled) { + stats_prefix_record_set(key, nkey); + } + + it = item_alloc(key, nkey, flags, exptime, pr->vlen); + + if (it == 0) { + //enum store_item_type status; + if (! 
item_size_ok(nkey, flags, pr->vlen)) { + pout_string(resp, "SERVER_ERROR object too large for cache"); + //status = TOO_LARGE; + pthread_mutex_lock(&t->stats.mutex); + t->stats.store_too_large++; + pthread_mutex_unlock(&t->stats.mutex); + } else { + pout_string(resp, "SERVER_ERROR out of memory storing object"); + //status = NO_MEMORY; + pthread_mutex_lock(&t->stats.mutex); + t->stats.store_no_memory++; + pthread_mutex_unlock(&t->stats.mutex); + } + //LOGGER_LOG(c->thread->l, LOG_MUTATIONS, LOGGER_ITEM_STORE, + // NULL, status, comm, key, nkey, 0, 0, c->sfd); + + /* Avoid stale data persisting in cache because we failed alloc. + * Unacceptable for SET. Anywhere else too? */ + if (comm == NREAD_SET) { + it = item_get(key, nkey, t, DONT_UPDATE); + if (it) { + item_unlink(it); + STORAGE_delete(t->storage, it); + item_remove(it); + } + } + + return; + } + ITEM_set_cas(it, req_cas_id); + + pthread_mutex_lock(&t->stats.mutex); + t->stats.slab_stats[ITEM_clsid(it)].set_cmds++; + pthread_mutex_unlock(&t->stats.mutex); + + // complete_nread_proxy() does the data chunk check so all we need to do + // is copy the data. + if (_store_item_copy_from_buf(it, pr->vbuf, it->nbytes) != 0) { + pout_string(resp, "SERVER_ERROR out of memory storing object"); + item_remove(it); + return; + } + + int ret = store_item(it, comm, t, NULL, CAS_NO_STALE); + switch (ret) { + case STORED: + pout_string(resp, "STORED"); + break; + case EXISTS: + pout_string(resp, "EXISTS"); + break; + case NOT_FOUND: + pout_string(resp, "NOT_FOUND"); + break; + case NOT_STORED: + pout_string(resp, "NOT_STORED"); + break; + default: + pout_string(resp, "SERVER_ERROR Unhandled storage type."); + } + + // We don't need to hold a reference since the item was fully read. 
+ item_remove(it); +} + +static void process_arithmetic_cmd(LIBEVENT_THREAD *t, mcp_parser_t *pr, mc_resp *resp, const bool incr) { + char temp[INCR_MAX_STORAGE_LEN]; + uint64_t delta; + const char *key = &pr->request[pr->tokens[pr->keytoken]]; + size_t nkey = pr->klen; + + assert(t != NULL); + + if (nkey > KEY_MAX_LENGTH) { + pout_string(resp, "CLIENT_ERROR bad command line format"); + return; + } + + if (!safe_strtoull(&pr->request[pr->tokens[2]], &delta)) { + pout_string(resp, "CLIENT_ERROR invalid numeric delta argument"); + return; + } + + switch(add_delta(t, key, nkey, incr, delta, temp, NULL)) { + case OK: + pout_string(resp, temp); + break; + case NON_NUMERIC: + pout_string(resp, "CLIENT_ERROR cannot increment or decrement non-numeric value"); + break; + case EOM: + pout_string(resp, "SERVER_ERROR out of memory"); + break; + case DELTA_ITEM_NOT_FOUND: + pthread_mutex_lock(&t->stats.mutex); + if (incr) { + t->stats.incr_misses++; + } else { + t->stats.decr_misses++; + } + pthread_mutex_unlock(&t->stats.mutex); + + pout_string(resp, "NOT_FOUND"); + break; + case DELTA_ITEM_CAS_MISMATCH: + break; /* Should never get here */ + } +} + +static void process_delete_cmd(LIBEVENT_THREAD *t, mcp_parser_t *pr, mc_resp *resp) { + const char *key = &pr->request[pr->tokens[pr->keytoken]]; + size_t nkey = pr->klen; + item *it; + uint32_t hv; + + assert(t != NULL); + + // NOTE: removed a compatibility bodge from a decade ago. + // delete used to take a "delay" argument, which was removed, but some + // ancient php clients always sent a 0 argument, which would then fail. + // It's been long enough that I don't want to carry this forward into the + // new parser. 
+ + if (nkey > KEY_MAX_LENGTH) { + pout_string(resp, "CLIENT_ERROR bad command line format"); + return; + } + + it = item_get_locked(key, nkey, t, DONT_UPDATE, &hv); + if (it) { + //MEMCACHED_COMMAND_DELETE(c->sfd, ITEM_key(it), it->nkey); + + pthread_mutex_lock(&t->stats.mutex); + t->stats.slab_stats[ITEM_clsid(it)].delete_hits++; + pthread_mutex_unlock(&t->stats.mutex); + + do_item_unlink(it, hv); + STORAGE_delete(t->storage, it); + do_item_remove(it); /* release our reference */ + pout_string(resp, "DELETED"); + } else { + pthread_mutex_lock(&t->stats.mutex); + t->stats.delete_misses++; + pthread_mutex_unlock(&t->stats.mutex); + + pout_string(resp, "NOT_FOUND"); + } + item_unlock(hv); +} + +static void process_touch_cmd(LIBEVENT_THREAD *t, mcp_parser_t *pr, mc_resp *resp) { + const char *key = &pr->request[pr->tokens[pr->keytoken]]; + size_t nkey = pr->klen; + int32_t exptime_int = 0; + rel_time_t exptime = 0; + item *it; + + assert(t != NULL); + + if (nkey > KEY_MAX_LENGTH) { + pout_string(resp, "CLIENT_ERROR bad command line format"); + return; + } + + if (!safe_strtol(&pr->request[pr->tokens[2]], &exptime_int)) { + pout_string(resp, "CLIENT_ERROR invalid exptime argument"); + return; + } + + exptime = realtime(EXPTIME_TO_POSITIVE_TIME(exptime_int)); + it = item_touch(key, nkey, exptime, t); + if (it) { + pthread_mutex_lock(&t->stats.mutex); + t->stats.touch_cmds++; + t->stats.slab_stats[ITEM_clsid(it)].touch_hits++; + pthread_mutex_unlock(&t->stats.mutex); + + pout_string(resp, "TOUCHED"); + item_remove(it); + } else { + pthread_mutex_lock(&t->stats.mutex); + t->stats.touch_cmds++; + t->stats.touch_misses++; + pthread_mutex_unlock(&t->stats.mutex); + + pout_string(resp, "NOT_FOUND"); + } +} + +/*** meta command handlers ***/ + +// FIXME: macro or public interface, this is copypasted. 
+static int _process_token_len(mcp_parser_t *pr, size_t token) { + const char *s = pr->request + pr->tokens[token]; + const char *e = pr->request + pr->tokens[token+1]; + // start of next token is after any space delimiters, so back those out. + while (*(e-1) == ' ') { + e--; + } + return e - s; +} + +#define META_SPACE(p) { \ + *p = ' '; \ + p++; \ +} + +#define META_CHAR(p, c) { \ + *p = ' '; \ + *(p+1) = c; \ + p += 2; \ +} + +// FIXME: binary key support. +#define META_KEY(p, key, nkey, bin) { \ + META_CHAR(p, 'k'); \ + memcpy(p, key, nkey); \ + p += nkey; \ +} + +#define MFLAG_MAX_OPT_LENGTH 20 +#define MFLAG_MAX_OPAQUE_LENGTH 32 + +struct _meta_flags { + unsigned int has_error :1; // flipped if we found an error during parsing. + unsigned int no_update :1; + unsigned int locked :1; + unsigned int vivify :1; + unsigned int la :1; + unsigned int hit :1; + unsigned int value :1; + unsigned int set_stale :1; + unsigned int no_reply :1; + unsigned int has_cas :1; + unsigned int new_ttl :1; + unsigned int key_binary:1; + char mode; // single character mode switch, common to ms/ma + rel_time_t exptime; + rel_time_t autoviv_exptime; + rel_time_t recache_time; + uint32_t client_flags; + uint64_t req_cas_id; + uint64_t delta; // ma + uint64_t initial; // ma +}; + +static int _meta_flag_preparse(mcp_parser_t *pr, const size_t start, + struct _meta_flags *of, char **errstr) { + unsigned int i; + //size_t ret; + int32_t tmp_int; + uint8_t seen[127] = {0}; + // Start just past the key token. Look at first character of each token. + for (i = start; i < pr->ntokens; i++) { + uint8_t o = (uint8_t)pr->request[pr->tokens[i]]; + // zero out repeat flags so we don't over-parse for return data. + if (o >= 127 || seen[o] != 0) { + *errstr = "CLIENT_ERROR duplicate flag"; + return -1; + } + seen[o] = 1; + switch (o) { + // base64 decode the key in-place, as the binary should always be + // shorter and the conversion code buffers bytes. 
+ // TODO: we need temporary space for the binary key decode since + // request should be const. + /*case 'b': + ret = base64_decode((unsigned char *)tokens[KEY_TOKEN].value, tokens[KEY_TOKEN].length, + (unsigned char *)tokens[KEY_TOKEN].value, tokens[KEY_TOKEN].length); + if (ret == 0) { + // Failed to decode + *errstr = "CLIENT_ERROR error decoding key"; + of->has_error = 1; + } + tokens[KEY_TOKEN].length = ret; + of->key_binary = 1; + break;*/ + /* Negative exptimes can underflow and end up immortal. realtime() will + immediately expire values that are greater than REALTIME_MAXDELTA, but less + than process_started, so lets aim for that. */ + case 'N': + of->locked = 1; + of->vivify = 1; + if (!safe_strtol(&pr->request[pr->tokens[i]+1], &tmp_int)) { + *errstr = "CLIENT_ERROR bad token in command line format"; + of->has_error = 1; + } else { + of->autoviv_exptime = realtime(EXPTIME_TO_POSITIVE_TIME(tmp_int)); + } + break; + case 'T': + of->locked = 1; + if (!safe_strtol(&pr->request[pr->tokens[i]+1], &tmp_int)) { + *errstr = "CLIENT_ERROR bad token in command line format"; + of->has_error = 1; + } else { + of->exptime = realtime(EXPTIME_TO_POSITIVE_TIME(tmp_int)); + of->new_ttl = true; + } + break; + case 'R': + of->locked = 1; + if (!safe_strtol(&pr->request[pr->tokens[i]+1], &tmp_int)) { + *errstr = "CLIENT_ERROR bad token in command line format"; + of->has_error = 1; + } else { + of->recache_time = realtime(EXPTIME_TO_POSITIVE_TIME(tmp_int)); + } + break; + case 'l': + of->la = 1; + of->locked = 1; // need locked to delay LRU bump + break; + case 'O': + case 'P': + case 'L': + break; + case 'k': // known but no special handling + case 's': + case 't': + case 'c': + case 'f': + break; + case 'v': + of->value = 1; + break; + case 'h': + of->locked = 1; // need locked to delay LRU bump + break; + case 'u': + of->no_update = 1; + break; + case 'q': + of->no_reply = 1; + break; + // mset-related. 
+ case 'F': + if (!safe_strtoul(&pr->request[pr->tokens[i]+1], &of->client_flags)) { + of->has_error = true; + } + break; + case 'C': // mset, mdelete, marithmetic + if (!safe_strtoull(&pr->request[pr->tokens[i]+1], &of->req_cas_id)) { + *errstr = "CLIENT_ERROR bad token in command line format"; + of->has_error = true; + } else { + of->has_cas = true; + } + break; + case 'M': // mset and marithmetic mode switch + // FIXME: this used to error if the token isn't a single byte. + // It probably should still? + of->mode = pr->request[pr->tokens[i]]; + break; + case 'J': // marithmetic initial value + if (!safe_strtoull(&pr->request[pr->tokens[i]+1], &of->initial)) { + *errstr = "CLIENT_ERROR invalid numeric initial value"; + of->has_error = 1; + } + break; + case 'D': // marithmetic delta value + if (!safe_strtoull(&pr->request[pr->tokens[i]+1], &of->delta)) { + *errstr = "CLIENT_ERROR invalid numeric delta value"; + of->has_error = 1; + } + break; + case 'I': + of->set_stale = 1; + break; + default: // unknown flag, bail. + *errstr = "CLIENT_ERROR invalid flag"; + return -1; + } + } + + return of->has_error ? -1 : 0; +} + +static void process_mget_cmd(LIBEVENT_THREAD *t, mcp_parser_t *pr, mc_resp *resp) { + const char *key = &pr->request[pr->tokens[pr->keytoken]]; + size_t nkey = pr->klen; + item *it; + unsigned int i = 0; + struct _meta_flags of = {0}; // option bitflags. + uint32_t hv; // cached hash value for unlocking an item. + bool failed = false; + bool item_created = false; + bool won_token = false; + bool ttl_set = false; + char *errstr = "CLIENT_ERROR bad command line format"; + assert(t != NULL); + char *p = resp->wbuf; + int tlen = 0; + + // FIXME: still needed? 
+ //WANT_TOKENS_MIN(ntokens, 3); + + if (nkey > KEY_MAX_LENGTH) { + pout_string(resp, "CLIENT_ERROR bad command line format"); + return; + } + + if (pr->ntokens > MFLAG_MAX_OPT_LENGTH) { + // TODO: ensure the command tokenizer gives us at least this many + pout_errstring(resp, "CLIENT_ERROR options flags are too long"); + return; + } + + // scrubs duplicated options and sets flags for how to load the item. + // we pass in the first token that should be a flag. + if (_meta_flag_preparse(pr, 2, &of, &errstr) != 0) { + pout_errstring(resp, errstr); + return; + } + + bool overflow = false; + if (!of.locked) { + it = limited_get(key, nkey, t, 0, false, !of.no_update, &overflow); + } else { + // If we had to lock the item, we're doing our own bump later. + it = limited_get_locked(key, nkey, t, DONT_UPDATE, &hv, &overflow); + } + + // Since we're a new protocol, we can actually inform users that refcount + // overflow is happening by straight up throwing an error. + // We definitely don't want to re-autovivify by accident. + if (overflow) { + assert(it == NULL); + pout_errstring(resp, "SERVER_ERROR refcount overflow during fetch"); + return; + } + + if (it == NULL && of.vivify) { + // Fill in the exptime during parsing later. + it = item_alloc(key, nkey, 0, realtime(0), 2); + // We don't actually need any of do_store_item's logic: + // - already fetched and missed an existing item. + // - lock is still held. + // - not append/prepend/replace + // - not testing CAS + if (it != NULL) { + // I look forward to the day I get rid of this :) + memcpy(ITEM_data(it), "\r\n", 2); + // NOTE: This initializes the CAS value. + do_item_link(it, hv); + item_created = true; + } + } + + // don't have to check result of add_iov() since the iov size defaults are + // enough. 
+ if (it) { + if (of.value) { + memcpy(p, "VA ", 3); + p = itoa_u32(it->nbytes-2, p+3); + } else { + memcpy(p, "HD", 2); + p += 2; + } + + for (i = pr->keytoken+1; i < pr->ntokens; i++) { + switch (pr->request[pr->tokens[i]]) { + case 'T': + ttl_set = true; + it->exptime = of.exptime; + break; + case 'N': + if (item_created) { + it->exptime = of.autoviv_exptime; + won_token = true; + } + break; + case 'R': + // If we haven't autovivified and supplied token is less + // than current TTL, mark a win. + if ((it->it_flags & ITEM_TOKEN_SENT) == 0 + && !item_created + && it->exptime != 0 + && it->exptime < of.recache_time) { + won_token = true; + } + break; + case 's': + META_CHAR(p, 's'); + p = itoa_u32(it->nbytes-2, p); + break; + case 't': + // TTL remaining as of this request. + // needs to be relative because server clocks may not be in sync. + META_CHAR(p, 't'); + if (it->exptime == 0) { + *p = '-'; + *(p+1) = '1'; + p += 2; + } else { + p = itoa_u32(it->exptime - current_time, p); + } + break; + case 'c': + META_CHAR(p, 'c'); + p = itoa_u64(ITEM_get_cas(it), p); + break; + case 'f': + META_CHAR(p, 'f'); + if (FLAGS_SIZE(it) == 0) { + *p = '0'; + p++; + } else { + p = itoa_u32(*((uint32_t *) ITEM_suffix(it)), p); + } + break; + case 'l': + META_CHAR(p, 'l'); + p = itoa_u32(current_time - it->time, p); + break; + case 'h': + META_CHAR(p, 'h'); + if (it->it_flags & ITEM_FETCHED) { + *p = '1'; + } else { + *p = '0'; + } + p++; + break; + case 'O': + tlen = _process_token_len(pr, i); + if (tlen > MFLAG_MAX_OPAQUE_LENGTH) { + errstr = "CLIENT_ERROR opaque token too long"; + goto error; + } + META_SPACE(p); + memcpy(p, &pr->request[pr->tokens[i]], tlen); + p += tlen; + break; + case 'k': + META_KEY(p, ITEM_key(it), it->nkey, (it->it_flags & ITEM_KEY_BINARY)); + break; + } + } + + // Has this item already sent a token? + // Important to do this here so we don't send W with Z. + // Isn't critical, but easier for client authors to understand. 
+ if (it->it_flags & ITEM_TOKEN_SENT) { + META_CHAR(p, 'Z'); + } + if (it->it_flags & ITEM_STALE) { + META_CHAR(p, 'X'); + // FIXME: think hard about this. is this a default, or a flag? + if ((it->it_flags & ITEM_TOKEN_SENT) == 0) { + // If we're stale but no token already sent, now send one. + won_token = true; + } + } + + if (won_token) { + // Mark a win into the flag buffer. + META_CHAR(p, 'W'); + it->it_flags |= ITEM_TOKEN_SENT; + } + + *p = '\r'; + *(p+1) = '\n'; + *(p+2) = '\0'; + p += 2; + // finally, chain in the buffer. + resp_add_iov(resp, resp->wbuf, p - resp->wbuf); + + if (of.value) { +#ifdef EXTSTORE + if (it->it_flags & ITEM_HDR) { + if (proxy_storage_get(t, it, resp, PROXY_STORAGE_MG) != 0) { + pthread_mutex_lock(&t->stats.mutex); + t->stats.get_oom_extstore++; + pthread_mutex_unlock(&t->stats.mutex); + + failed = true; + } + } else if ((it->it_flags & ITEM_CHUNKED) == 0) { + resp_add_iov(resp, ITEM_data(it), it->nbytes); + } else { + resp_add_chunked_iov(resp, it, it->nbytes); + } +#else + if ((it->it_flags & ITEM_CHUNKED) == 0) { + resp_add_iov(resp, ITEM_data(it), it->nbytes); + } else { + resp_add_chunked_iov(resp, it, it->nbytes); + } +#endif + } + + // need to hold the ref at least because of the key above. +#ifdef EXTSTORE + if (!failed) { + if ((it->it_flags & ITEM_HDR) != 0 && of.value) { + // Only have extstore clean if header and returning value. + resp->item = NULL; + } else { + resp->item = it; + } + } else { + // Failed to set up extstore fetch. + if (of.locked) { + do_item_remove(it); + } else { + item_remove(it); + } + } +#else + resp->item = it; +#endif + } else { + failed = true; + } + + if (of.locked) { + // Delayed bump so we could get fetched/last access time pre-update. + if (!of.no_update && it != NULL) { + do_item_bump(t, it, hv); + } + item_unlock(hv); + } + + // we count this command as a normal one if we've gotten this far. + // TODO: for autovivify case, miss never happens. Is this okay? 
+    if (!failed) {
+        pthread_mutex_lock(&t->stats.mutex);
+        if (ttl_set) {
+            t->stats.touch_cmds++;
+            t->stats.slab_stats[ITEM_clsid(it)].touch_hits++;
+        } else {
+            t->stats.lru_hits[it->slabs_clsid]++;
+            t->stats.get_cmds++;
+        }
+        pthread_mutex_unlock(&t->stats.mutex);
+    } else {
+        pthread_mutex_lock(&t->stats.mutex);
+        if (ttl_set) {
+            t->stats.touch_cmds++;
+            t->stats.touch_misses++;
+        } else {
+            t->stats.get_misses++;
+            t->stats.get_cmds++;
+        }
+        pthread_mutex_unlock(&t->stats.mutex);
+
+        // This gets elided in noreply mode.
+        if (of.no_reply)
+            resp->skip = true;
+        memcpy(p, "EN", 2);
+        p += 2;
+        for (i = pr->keytoken+1; i < pr->ntokens; i++) {
+            switch (pr->request[pr->tokens[i]]) {
+                // TODO: macro perhaps?
+                case 'O':
+                    tlen = _process_token_len(pr, i);
+                    if (tlen > MFLAG_MAX_OPAQUE_LENGTH) {
+                        errstr = "CLIENT_ERROR opaque token too long";
+                        goto error;
+                    }
+                    META_SPACE(p);
+                    memcpy(p, &pr->request[pr->tokens[i]], tlen);
+                    p += tlen;
+                    break;
+                case 'k':
+                    META_KEY(p, key, nkey, of.key_binary);
+                    break;
+            }
+        }
+        resp->wbytes = p - resp->wbuf;
+        memcpy(resp->wbuf + resp->wbytes, "\r\n", 2);
+        resp->wbytes += 2;
+        resp_add_iov(resp, resp->wbuf, resp->wbytes);
+    }
+    return;
+error:
+    if (it) {
+        do_item_remove(it);
+        if (of.locked) {
+            item_unlock(hv);
+        }
+    }
+    pout_errstring(resp, errstr);
+}
+
+// Meta-set ("ms") handler for the internal proxy path.
+// Parses the request flags, maps the mode flag onto an NREAD_* store command
+// (upgrading SET/REPLACE to CAS when a CAS value was supplied), allocates an
+// item, copies the value out of pr->vbuf, and stores it. The status code
+// (HD/EX/NF/NS) plus any requested O/k/c return flags are written to
+// resp->wbuf; failures are reported through pout_string/pout_errstring.
+static void process_mset_cmd(LIBEVENT_THREAD *t, mcp_parser_t *pr, mc_resp *resp) {
+    const char *key = &pr->request[pr->tokens[pr->keytoken]];
+    size_t nkey = pr->klen;
+
+    item *it;
+    int i;
+    short comm = NREAD_SET;
+    struct _meta_flags of = {0}; // option bitflags.
+    char *errstr = "CLIENT_ERROR bad command line format";
+    uint32_t hv; // cached hash value.
+    int vlen = pr->vlen; // value from data line.
+    assert(t != NULL);
+    char *p = resp->wbuf;
+    int tlen = 0;
+
+    //WANT_TOKENS_MIN(ntokens, 3);
+
+    if (nkey > KEY_MAX_LENGTH) {
+        pout_string(resp, "CLIENT_ERROR bad command line format");
+        return;
+    }
+
+    if (pr->ntokens > MFLAG_MAX_OPT_LENGTH) {
+        // TODO: ensure the command tokenizer gives us at least this many
+        pout_errstring(resp, "CLIENT_ERROR options flags are too long");
+        return;
+    }
+
+    if (pr->ntokens == 3) {
+        pout_errstring(resp, "CLIENT_ERROR bad command line format");
+        return;
+    }
+
+    // We need to at least try to get the size to properly slurp bad bytes
+    // after an error.
+    // we pass in the first token that should be a flag.
+    if (_meta_flag_preparse(pr, 3, &of, &errstr) != 0) {
+        goto error;
+    }
+
+    // "mode switch" to alternative commands
+    switch (of.mode) {
+        case 0:
+            break; // no mode supplied.
+        case 'E': // Add...
+            comm = NREAD_ADD;
+            break;
+        case 'A': // Append.
+            comm = NREAD_APPEND;
+            break;
+        case 'P': // Prepend.
+            comm = NREAD_PREPEND;
+            break;
+        case 'R': // Replace.
+            comm = NREAD_REPLACE;
+            break;
+        case 'S': // Set. Default.
+            comm = NREAD_SET;
+            break;
+        default:
+            errstr = "CLIENT_ERROR invalid mode for ms M token";
+            goto error;
+    }
+
+    // The item storage function doesn't exactly map to mset.
+    // If a CAS value is supplied, upgrade default SET mode to CAS mode.
+    // Also allows REPLACE to work, as REPLACE + CAS works the same as CAS.
+    // add-with-cas works the same as add; but could only LRU bump if match..
+    // APPEND/PREPEND allow a simplified CAS check.
+    if (of.has_cas && (comm == NREAD_SET || comm == NREAD_REPLACE)) {
+        comm = NREAD_CAS;
+    }
+
+    it = item_alloc(key, nkey, of.client_flags, of.exptime, vlen);
+
+    if (it == 0) {
+        if (! item_size_ok(nkey, of.client_flags, vlen)) {
+            errstr = "SERVER_ERROR object too large for cache";
+            pthread_mutex_lock(&t->stats.mutex);
+            t->stats.store_too_large++;
+            pthread_mutex_unlock(&t->stats.mutex);
+        } else {
+            errstr = "SERVER_ERROR out of memory storing object";
+            pthread_mutex_lock(&t->stats.mutex);
+            t->stats.store_no_memory++;
+            pthread_mutex_unlock(&t->stats.mutex);
+        }
+
+        /* Avoid stale data persisting in cache because we failed alloc. */
+        // NOTE: only if SET mode?
+        it = item_get_locked(key, nkey, t, DONT_UPDATE, &hv);
+        if (it) {
+            do_item_unlink(it, hv);
+            STORAGE_delete(t->storage, it);
+            do_item_remove(it);
+        }
+        item_unlock(hv);
+
+        goto error;
+    }
+    ITEM_set_cas(it, of.req_cas_id);
+
+    // data should already be read into the request.
+
+    // Prevent printing back the key in meta commands as garbage.
+    if (of.key_binary) {
+        it->it_flags |= ITEM_KEY_BINARY;
+    }
+
+    bool set_stale = CAS_NO_STALE;
+    if (of.set_stale && comm == NREAD_CAS) {
+        set_stale = CAS_ALLOW_STALE;
+    }
+    resp->wbytes = p - resp->wbuf;
+
+    pthread_mutex_lock(&t->stats.mutex);
+    t->stats.slab_stats[ITEM_clsid(it)].set_cmds++;
+    pthread_mutex_unlock(&t->stats.mutex);
+
+    // complete_nread_proxy() does the data chunk check so all we need to do
+    // is copy the data.
+    if (_store_item_copy_from_buf(it, pr->vbuf, it->nbytes) != 0) {
+        pout_string(resp, "SERVER_ERROR out of memory storing object");
+        item_remove(it);
+        return;
+    }
+
+    uint64_t cas = 0;
+    int ret = store_item(it, comm, t, &cas, set_stale);
+    switch (ret) {
+        case STORED:
+            memcpy(p, "HD", 2);
+            // Only place noreply is used for meta cmds is a nominal response.
+            if (of.no_reply) {
+                resp->skip = true;
+            }
+            break;
+        case EXISTS:
+            memcpy(p, "EX", 2);
+            break;
+        case NOT_FOUND:
+            memcpy(p, "NF", 2);
+            break;
+        case NOT_STORED:
+            memcpy(p, "NS", 2);
+            break;
+        default:
+            pout_errstring(resp, "SERVER_ERROR Unhandled storage type.");
+            // We still hold the reference taken by item_alloc(); drop it so
+            // the item is not leaked on this early exit.
+            item_remove(it);
+            return;
+
+    }
+    p += 2;
+
+    for (i = pr->keytoken+1; i < pr->ntokens; i++) {
+        switch (pr->request[pr->tokens[i]]) {
+            case 'O':
+                tlen = _process_token_len(pr, i);
+                if (tlen > MFLAG_MAX_OPAQUE_LENGTH) {
+                    errstr = "CLIENT_ERROR opaque token too long";
+                    // The store already ran; release our reference before
+                    // bailing out, the error label does not do it for us.
+                    item_remove(it);
+                    goto error;
+                }
+                META_SPACE(p);
+                memcpy(p, &pr->request[pr->tokens[i]], tlen);
+                p += tlen;
+                break;
+            case 'k':
+                META_KEY(p, ITEM_key(it), it->nkey, (it->it_flags & ITEM_KEY_BINARY));
+                break;
+            case 'c':
+                META_CHAR(p, 'c');
+                p = itoa_u64(cas, p);
+                break;
+        }
+    }
+
+    // We don't need to free pr->vbuf as that is owned by *rq
+    // either way, there's no c->item or resp->item reference right now.
+
+    memcpy(p, "\r\n", 2);
+    p += 2;
+    // we're offset into wbuf, but good convention to track wbytes.
+    resp->wbytes = p - resp->wbuf;
+    resp_add_iov(resp, resp->wbuf, resp->wbytes);
+
+    item_remove(it);
+
+    return;
+error:
+    // Note: this label never touches `it`. Any path that jumps here after the
+    // item was successfully allocated must release its own reference first.
+    // So we're just looking at dumping error codes and returning.
+    pout_errstring(resp, errstr);
+}
+
+static void process_mdelete_cmd(LIBEVENT_THREAD *t, mcp_parser_t *pr, mc_resp *resp) {
+    const char *key = &pr->request[pr->tokens[pr->keytoken]];
+    size_t nkey = pr->klen;
+    item *it = NULL;
+    int i;
+    uint32_t hv;
+    struct _meta_flags of = {0}; // option bitflags.
+ char *errstr = "CLIENT_ERROR bad command line format"; + assert(t != NULL); + // reserve bytes for status code + char *p = resp->wbuf + 2; + int tlen = 0; + + //WANT_TOKENS_MIN(ntokens, 3); + + if (nkey > KEY_MAX_LENGTH) { + pout_string(resp, "CLIENT_ERROR bad command line format"); + return; + } + + if (pr->ntokens > MFLAG_MAX_OPT_LENGTH) { + // TODO: ensure the command tokenizer gives us at least this many + pout_errstring(resp, "CLIENT_ERROR options flags are too long"); + return; + } + + // scrubs duplicated options and sets flags for how to load the item. + // we pass in the first token that should be a flag. + // FIXME: not using the preparse errstr? + if (_meta_flag_preparse(pr, 2, &of, &errstr) != 0) { + pout_errstring(resp, "CLIENT_ERROR invalid or duplicate flag"); + return; + } + + for (i = pr->keytoken+1; i < pr->ntokens; i++) { + switch (pr->request[pr->tokens[i]]) { + // TODO: macro perhaps? + case 'O': + tlen = _process_token_len(pr, i); + if (tlen > MFLAG_MAX_OPAQUE_LENGTH) { + errstr = "CLIENT_ERROR opaque token too long"; + goto error; + } + META_SPACE(p); + memcpy(p, &pr->request[pr->tokens[i]], tlen); + p += tlen; + break; + case 'k': + META_KEY(p, key, nkey, of.key_binary); + break; + } + } + + it = item_get_locked(key, nkey, t, DONT_UPDATE, &hv); + if (it) { + // allow only deleting/marking if a CAS value matches. + if (of.has_cas && ITEM_get_cas(it) != of.req_cas_id) { + pthread_mutex_lock(&t->stats.mutex); + t->stats.delete_misses++; + pthread_mutex_unlock(&t->stats.mutex); + + memcpy(resp->wbuf, "EX", 2); + goto cleanup; + } + + // If we're to set this item as stale, we don't actually want to + // delete it. We mark the stale bit, bump CAS, and update exptime if + // we were supplied a new TTL. + if (of.set_stale) { + if (of.new_ttl) { + it->exptime = of.exptime; + } + it->it_flags |= ITEM_STALE; + // Also need to remove TOKEN_SENT, so next client can win. + it->it_flags &= ~ITEM_TOKEN_SENT; + + ITEM_set_cas(it, (settings.use_cas) ? 
get_cas_id() : 0); + + // Clients can noreply nominal responses. + if (of.no_reply) + resp->skip = true; + memcpy(resp->wbuf, "HD", 2); + } else { + pthread_mutex_lock(&t->stats.mutex); + t->stats.slab_stats[ITEM_clsid(it)].delete_hits++; + pthread_mutex_unlock(&t->stats.mutex); + + do_item_unlink(it, hv); + STORAGE_delete(t->storage, it); + if (of.no_reply) + resp->skip = true; + memcpy(resp->wbuf, "HD", 2); + } + goto cleanup; + } else { + pthread_mutex_lock(&t->stats.mutex); + t->stats.delete_misses++; + pthread_mutex_unlock(&t->stats.mutex); + + memcpy(resp->wbuf, "NF", 2); + goto cleanup; + } +cleanup: + if (it) { + do_item_remove(it); + } + // Item is always returned locked, even if missing. + item_unlock(hv); + resp->wbytes = p - resp->wbuf; + memcpy(resp->wbuf + resp->wbytes, "\r\n", 2); + resp->wbytes += 2; + resp_add_iov(resp, resp->wbuf, resp->wbytes); + //conn_set_state(c, conn_new_cmd); + return; +error: + pout_errstring(resp, errstr); +} + +static void process_marithmetic_cmd(LIBEVENT_THREAD *t, mcp_parser_t *pr, mc_resp *resp) { + const char *key = &pr->request[pr->tokens[pr->keytoken]]; + size_t nkey = pr->klen; + int i; + struct _meta_flags of = {0}; // option bitflags. + char *errstr = "CLIENT_ERROR bad command line format"; + assert(t != NULL); + // no reservation (like del/set) since we post-process the status line. + char *p = resp->wbuf; + int tlen = 0; + + // If no argument supplied, incr or decr by one. + of.delta = 1; + of.initial = 0; // redundant, for clarity. + bool incr = true; // default mode is to increment. + bool locked = false; + uint32_t hv = 0; + item *it = NULL; // item returned by do_add_delta. 
+ + //WANT_TOKENS_MIN(ntokens, 3); + + if (nkey > KEY_MAX_LENGTH) { + pout_string(resp, "CLIENT_ERROR bad command line format"); + return; + } + + if (pr->ntokens > MFLAG_MAX_OPT_LENGTH) { + // TODO: ensure the command tokenizer gives us at least this many + pout_errstring(resp, "CLIENT_ERROR options flags are too long"); + return; + } + + // scrubs duplicated options and sets flags for how to load the item. + // we pass in the first token that should be a flag. + if (_meta_flag_preparse(pr, 2, &of, &errstr) != 0) { + pout_errstring(resp, "CLIENT_ERROR invalid or duplicate flag"); + return; + } + //c->noreply = of.no_reply; + + // "mode switch" to alternative commands + switch (of.mode) { + case 0: // no switch supplied. + break; + case 'I': // Incr (default) + case '+': + incr = true; + break; + case 'D': // Decr. + case '-': + incr = false; + break; + default: + errstr = "CLIENT_ERROR invalid mode for ma M token"; + goto error; + break; + } + + // take hash value and manually lock item... hold lock during store phase + // on miss and avoid recalculating the hash multiple times. + hv = hash(key, nkey); + item_lock(hv); + locked = true; + char tmpbuf[INCR_MAX_STORAGE_LEN]; + + // return a referenced item if it exists, so we can modify it here, rather + // than adding even more parameters to do_add_delta. + bool item_created = false; + uint64_t cas = 0; + switch(do_add_delta(t, key, nkey, incr, of.delta, tmpbuf, &of.req_cas_id, hv, &it)) { + case OK: + //if (c->noreply) + // resp->skip = true; + // *it was filled, set the status below. 
+ cas = ITEM_get_cas(it); + break; + case NON_NUMERIC: + errstr = "CLIENT_ERROR cannot increment or decrement non-numeric value"; + goto error; + break; + case EOM: + errstr = "SERVER_ERROR out of memory"; + goto error; + break; + case DELTA_ITEM_NOT_FOUND: + if (of.vivify) { + itoa_u64(of.initial, tmpbuf); + int vlen = strlen(tmpbuf); + + it = item_alloc(key, nkey, 0, 0, vlen+2); + if (it != NULL) { + memcpy(ITEM_data(it), tmpbuf, vlen); + memcpy(ITEM_data(it) + vlen, "\r\n", 2); + if (do_store_item(it, NREAD_ADD, t, hv, &cas, CAS_NO_STALE)) { + item_created = true; + } else { + // Not sure how we can get here if we're holding the lock. + memcpy(resp->wbuf, "NS", 2); + } + } else { + errstr = "SERVER_ERROR Out of memory allocating new item"; + goto error; + } + } else { + pthread_mutex_lock(&t->stats.mutex); + if (incr) { + t->stats.incr_misses++; + } else { + t->stats.decr_misses++; + } + pthread_mutex_unlock(&t->stats.mutex); + // won't have a valid it here. + memcpy(p, "NF", 2); + p += 2; + } + break; + case DELTA_ITEM_CAS_MISMATCH: + // also returns without a valid it. + memcpy(p, "EX", 2); + p += 2; + break; + } + + // final loop + // allows building the response with information after vivifying from a + // miss, or returning a new CAS value after add_delta(). 
+    if (it) {
+        size_t vlen = strlen(tmpbuf);
+        if (of.value) {
+            memcpy(p, "VA ", 3);
+            p = itoa_u32(vlen, p+3);
+        } else {
+            memcpy(p, "HD", 2);
+            p += 2;
+        }
+
+        for (i = pr->keytoken+1; i < pr->ntokens; i++) {
+            switch (pr->request[pr->tokens[i]]) {
+                case 'c':
+                    META_CHAR(p, 'c');
+                    p = itoa_u64(cas, p);
+                    break;
+                case 't':
+                    META_CHAR(p, 't');
+                    if (it->exptime == 0) {
+                        *p = '-';
+                        *(p+1) = '1';
+                        p += 2;
+                    } else {
+                        p = itoa_u32(it->exptime - current_time, p);
+                    }
+                    break;
+                case 'T':
+                    it->exptime = of.exptime;
+                    break;
+                case 'N':
+                    if (item_created) {
+                        it->exptime = of.autoviv_exptime;
+                    }
+                    break;
+                case 'O':
+                    tlen = _process_token_len(pr, i);
+                    if (tlen > MFLAG_MAX_OPAQUE_LENGTH) {
+                        errstr = "CLIENT_ERROR opaque token too long";
+                        goto error;
+                    }
+                    META_SPACE(p);
+                    memcpy(p, &pr->request[pr->tokens[i]], tlen);
+                    // Advance past the copied opaque token; without this the
+                    // next flag (or the trailing "\r\n") overwrites it.
+                    p += tlen;
+                    break;
+                case 'k':
+                    META_KEY(p, key, nkey, of.key_binary);
+                    break;
+            }
+        }
+
+        if (of.value) {
+            *p = '\r';
+            *(p+1) = '\n';
+            p += 2;
+            memcpy(p, tmpbuf, vlen);
+            p += vlen;
+        }
+
+        do_item_remove(it);
+    } else {
+        // No item to handle. still need to return opaque/key tokens
+        for (i = pr->keytoken+1; i < pr->ntokens; i++) {
+            switch (pr->request[pr->tokens[i]]) {
+                case 'O':
+                    tlen = _process_token_len(pr, i);
+                    if (tlen > MFLAG_MAX_OPAQUE_LENGTH) {
+                        errstr = "CLIENT_ERROR opaque token too long";
+                        goto error;
+                    }
+                    META_SPACE(p);
+                    memcpy(p, &pr->request[pr->tokens[i]], tlen);
+                    // Same as the hit path: step over the token just copied.
+                    p += tlen;
+                    break;
+                case 'k':
+                    META_KEY(p, key, nkey, of.key_binary);
+                    break;
+            }
+        }
+    }
+
+    item_unlock(hv);
+
+    resp->wbytes = p - resp->wbuf;
+    memcpy(resp->wbuf + resp->wbytes, "\r\n", 2);
+    resp->wbytes += 2;
+    resp_add_iov(resp, resp->wbuf, resp->wbytes);
+    return;
+error:
+    if (it != NULL)
+        do_item_remove(it);
+    if (locked)
+        item_unlock(hv);
+    pout_errstring(resp, errstr);
+}
+
+/*** Lua and internal handler ***/
+
+// Lua entry point: checks that argument 1 is an mcp.request, creates a zeroed
+// mcp.response userdata, then yields it together with MCP_YIELD_LOCAL.
+int mcplib_internal(lua_State *L) {
+    luaL_checkudata(L, 1, "mcp.request");
+    mcp_resp_t *r = lua_newuserdatauv(L, sizeof(mcp_resp_t), 0);
+    memset(r, 0, sizeof(mcp_resp_t));
+    luaL_getmetatable(L, "mcp.response");
+    lua_setmetatable(L, -2);
+
+    lua_pushinteger(L, MCP_YIELD_LOCAL);
+    return lua_yield(L, 2);
+}
+
+// we're pretending to be p_c_ascii(), but reusing our already tokenized code.
+// the text parser should eventually move to the new tokenizer and we can
+// merge all of this code together.
+int mcplib_internal_run(lua_State *L, conn *c, mc_resp *top_resp, int coro_ref) {
+    mcp_request_t *rq = luaL_checkudata(L, 1, "mcp.request");
+    mcp_resp_t *r = luaL_checkudata(L, 2, "mcp.response");
+    mc_resp *resp = resp_start_unlinked(c);
+    LIBEVENT_THREAD *t = c->thread;
+    mcp_parser_t *pr = &rq->pr;
+    if (resp == NULL) {
+        return -1;
+    }
+
+    // TODO: meta no-op isn't handled here. haven't decided how yet.
+ switch (rq->pr.command) { + case CMD_MG: + process_mget_cmd(t, pr, resp); + break; + case CMD_MS: + process_mset_cmd(t, pr, resp); + break; + case CMD_MD: + process_mdelete_cmd(t, pr, resp); + break; + case CMD_MA: + process_marithmetic_cmd(t, pr, resp); + break; + case CMD_GET: + process_get_cmd(t, pr, resp, _NO_CAS, _NO_TOUCH); + break; + case CMD_GETS: + process_get_cmd(t, pr, resp, _DO_CAS, _NO_TOUCH); + break; + case CMD_GAT: + process_get_cmd(t, pr, resp, _NO_CAS, _DO_TOUCH); + break; + case CMD_GATS: + process_get_cmd(t, pr, resp, _DO_CAS, _DO_TOUCH); + break; + case CMD_SET: + process_update_cmd(t, pr, resp, NREAD_SET, _NO_CAS); + break; + case CMD_ADD: + process_update_cmd(t, pr, resp, NREAD_ADD, _NO_CAS); + break; + case CMD_APPEND: + process_update_cmd(t, pr, resp, NREAD_APPEND, _NO_CAS); + break; + case CMD_PREPEND: + process_update_cmd(t, pr, resp, NREAD_PREPEND, _NO_CAS); + break; + case CMD_CAS: + process_update_cmd(t, pr, resp, NREAD_CAS, _DO_CAS); + break; + case CMD_REPLACE: + process_update_cmd(t, pr, resp, NREAD_REPLACE, _DO_CAS); + break; + case CMD_INCR: + process_arithmetic_cmd(t, pr, resp, true); + break; + case CMD_DECR: + process_arithmetic_cmd(t, pr, resp, false); + break; + case CMD_DELETE: + process_delete_cmd(t, pr, resp); + break; + case CMD_TOUCH: + process_touch_cmd(t, pr, resp); + break; + default: + resp_free(t, resp); + return -1; + } + + // TODO: I'd like to shortcut the parsing here, but if we want the resp + // object to have full support (ie: resp:line()/etc) it might be necessary + // to still do a full parsing. It might be possible to + // wrap the main commands with something that decorates r->resp directly + // instead of going through a parser to save some CPU. + // Either way this is a lot less code. + mcmc_bare_parse_buf(resp->iov[0].iov_base, resp->iov[0].iov_len, &r->resp); + + // in case someone logs this response it should make sense. 
+ memcpy(r->be_name, "internal", strlen("internal")); + memcpy(r->be_port, "0", 1); + + // TODO: r-> will need status/code/mode copied from resp. + r->cresp = resp; + r->thread = c->thread; + r->cmd = rq->pr.command; + // Always return OK from here as this is signalling an internal error. + r->status = MCMC_OK; + + if (resp->io_pending) { + // TODO (v2): here we move the IO from the temporary resp to the top + // resp, but this feels kludgy so I'm leaving an explicit note to find + // a better way to do this. + top_resp->io_pending = resp->io_pending; + resp->io_pending = NULL; + + // Add io object to extstore submission queue. + io_queue_t *q = conn_io_queue_get(c, IO_QUEUE_EXTSTORE); + io_pending_proxy_t *io = (io_pending_proxy_t *)top_resp->io_pending; + + io->eio.next = q->stack_ctx; + q->stack_ctx = &io->eio; + assert(q->count >= 0); + q->count++; + + io->coro_ref = coro_ref; + io->coro = L; + io->c = c; + // we need to associate the top level mc_resp here so the run routine + // can fill it in later. + io->resp = top_resp; + // mark the buffer into the mcp_resp for freeing later. + r->buf = io->eio.buf; + return 1; + } + return 0; +} diff -Nru memcached-1.6.18/proxy_lua.c memcached-1.6.19/proxy_lua.c --- memcached-1.6.18/proxy_lua.c 2023-01-11 06:10:10.000000000 +0000 +++ memcached-1.6.19/proxy_lua.c 2023-03-08 21:34:27.000000000 +0000 @@ -2,6 +2,10 @@ #include "proxy.h" +// sad, I had to look this up... +#define NANOSECONDS(x) ((x) * 1E9 + 0.5) +#define MICROSECONDS(x) ((x) * 1E6 + 0.5) + // func prototype example: // static int fname (lua_State *L) // normal library open: @@ -86,6 +90,13 @@ free(r->buf); } + // release our temporary mc_resp sub-object. + if (r->cresp != NULL) { + mc_resp *cresp = r->cresp; + assert(r->thread != NULL); + resp_free(r->thread, cresp); + } + return 0; } @@ -109,7 +120,7 @@ // Since we're running in the config thread it could just busy poll // until the connection was picked up. 
assert(be->transferred); - proxy_event_thread_t *e = ctx->proxy_threads; + proxy_event_thread_t *e = be->event_thread; pthread_mutex_lock(&e->mutex); STAILQ_INSERT_TAIL(&e->beconn_head_in, be, beconn_next); pthread_mutex_unlock(&e->mutex); @@ -140,13 +151,112 @@ // backend label object; given to pools which then find or create backend // objects as necessary. +// allow optionally passing a table of arguments for extended options: +// { label = "etc", host = "127.0.0.1", port = "11211", +// readtimeout = 0.5, connecttimeout = 1, retrytimeout = 3, +// failurelimit = 3, tcpkeepalive = false } static int mcplib_backend(lua_State *L) { size_t llen = 0; size_t nlen = 0; size_t plen = 0; - const char *label = luaL_checklstring(L, 1, &llen); - const char *name = luaL_checklstring(L, 2, &nlen); - const char *port = luaL_checklstring(L, 3, &plen); + proxy_ctx_t *ctx = settings.proxy_ctx; + mcp_backend_label_t *be = lua_newuserdatauv(L, sizeof(mcp_backend_label_t), 0); + memset(be, 0, sizeof(*be)); + const char *label; + const char *name; + const char *port; + // copy global defaults for tunables. + memcpy(&be->tunables, &ctx->tunables, sizeof(be->tunables)); + + if (lua_istable(L, 1)) { + + // We don't pop the label/host/port strings so lua won't change them + // until after the function call. + if (lua_getfield(L, 1, "label") != LUA_TNIL) { + label = luaL_checklstring(L, -1, &llen); + } else { + proxy_lua_error(L, "backend must have a label argument"); + return 0; + } + + if (lua_getfield(L, 1, "host") != LUA_TNIL) { + name = luaL_checklstring(L, -1, &nlen); + } else { + proxy_lua_error(L, "backend must have a host argument"); + return 0; + } + + // TODO: allow a default port.
+ if (lua_getfield(L, 1, "port") != LUA_TNIL) { + port = luaL_checklstring(L, -1, &plen); + } else { + proxy_lua_error(L, "backend must have a port argument"); + return 0; + } + + if (lua_getfield(L, 1, "tcpkeepalive") != LUA_TNIL) { + be->tunables.tcp_keepalive = lua_toboolean(L, -1); + } + lua_pop(L, 1); + + if (lua_getfield(L, 1, "failurelimit") != LUA_TNIL) { + int limit = luaL_checkinteger(L, -1); + if (limit < 0) { + proxy_lua_error(L, "failure_limit must be >= 0"); + return 0; + } + + be->tunables.backend_failure_limit = limit; + } + lua_pop(L, 1); + + if (lua_getfield(L, 1, "connecttimeout") != LUA_TNIL) { + lua_Number secondsf = luaL_checknumber(L, -1); + lua_Integer secondsi = (lua_Integer) secondsf; + lua_Number subseconds = secondsf - secondsi; + + be->tunables.connect.tv_sec = secondsi; + be->tunables.connect.tv_usec = MICROSECONDS(subseconds); +#ifdef HAVE_LIBURING + be->tunables.connect_ur.tv_sec = secondsi; + be->tunables.connect_ur.tv_nsec = NANOSECONDS(subseconds); +#endif + } + lua_pop(L, 1); + + if (lua_getfield(L, 1, "retrytimeout") != LUA_TNIL) { + lua_Number secondsf = luaL_checknumber(L, -1); + lua_Integer secondsi = (lua_Integer) secondsf; + lua_Number subseconds = secondsf - secondsi; + + be->tunables.retry.tv_sec = secondsi; + be->tunables.retry.tv_usec = MICROSECONDS(subseconds); +#ifdef HAVE_LIBURING + be->tunables.retry_ur.tv_sec = secondsi; + be->tunables.retry_ur.tv_nsec = NANOSECONDS(subseconds); +#endif + } + lua_pop(L, 1); + + if (lua_getfield(L, 1, "readtimeout") != LUA_TNIL) { + lua_Number secondsf = luaL_checknumber(L, -1); + lua_Integer secondsi = (lua_Integer) secondsf; + lua_Number subseconds = secondsf - secondsi; + + be->tunables.read.tv_sec = secondsi; + be->tunables.read.tv_usec = MICROSECONDS(subseconds); +#ifdef HAVE_LIBURING + be->tunables.read_ur.tv_sec = secondsi; + be->tunables.read_ur.tv_nsec = NANOSECONDS(subseconds); +#endif + } + lua_pop(L, 1); + + } else { + label = luaL_checklstring(L, 1, &llen); + name = 
luaL_checklstring(L, 2, &nlen); + port = luaL_checklstring(L, 3, &plen); + } if (llen > MAX_LABELLEN-1) { proxy_lua_error(L, "backend label too long"); @@ -163,8 +273,6 @@ return 0; } - mcp_backend_label_t *be = lua_newuserdatauv(L, sizeof(mcp_backend_label_t), 0); - memset(be, 0, sizeof(*be)); memcpy(be->label, label, llen); be->label[llen] = '\0'; memcpy(be->name, name, nlen); @@ -172,22 +280,26 @@ memcpy(be->port, port, plen); be->port[plen] = '\0'; be->llen = llen; + if (lua_istable(L, 1)) { + lua_pop(L, 3); // drop label, name, port. + } luaL_getmetatable(L, "mcp.backend"); lua_setmetatable(L, -2); // set metatable to userdata. return 1; // return be object. } +// Called with the cache label at top of the stack. static mcp_backend_wrap_t *_mcplib_backend_checkcache(lua_State *L, mcp_backend_label_t *bel) { // first check our reference table to compare. // Note: The upvalue won't be found unless we're running from a function with it // set as an upvalue. - lua_pushlstring(L, bel->label, bel->llen); int ret = lua_gettable(L, lua_upvalueindex(MCP_BACKEND_UPVALUE)); if (ret != LUA_TNIL) { mcp_backend_wrap_t *be_orig = luaL_checkudata(L, -1, "mcp.backendwrap"); if (strncmp(be_orig->be->name, bel->name, MAX_NAMELEN) == 0 - && strncmp(be_orig->be->port, bel->port, MAX_PORTLEN) == 0) { + && strncmp(be_orig->be->port, bel->port, MAX_PORTLEN) == 0 + && memcmp(&be_orig->be->tunables, &bel->tunables, sizeof(bel->tunables)) == 0) { // backend is the same, return it. return be_orig; } else { @@ -201,7 +313,8 @@ return NULL; } -static mcp_backend_wrap_t *_mcplib_make_backendconn(lua_State *L, mcp_backend_label_t *bel) { +static mcp_backend_wrap_t *_mcplib_make_backendconn(lua_State *L, mcp_backend_label_t *bel, + proxy_event_thread_t *e) { // FIXME: remove global. 
proxy_ctx_t *ctx = settings.proxy_ctx; @@ -218,6 +331,7 @@ strncpy(be->name, bel->name, MAX_NAMELEN+1); strncpy(be->port, bel->port, MAX_PORTLEN+1); + memcpy(&be->tunables, &bel->tunables, sizeof(bel->tunables)); STAILQ_INIT(&be->io_head); be->state = mcp_backend_read; @@ -231,7 +345,9 @@ } // initialize libevent. - memset(&be->event, 0, sizeof(be->event)); + memset(&be->main_event, 0, sizeof(be->main_event)); + memset(&be->write_event, 0, sizeof(be->write_event)); + memset(&be->timeout_event, 0, sizeof(be->timeout_event)); // initialize the client be->client = malloc(mcmc_size(MCMC_OPTION_BLANK)); @@ -253,7 +369,7 @@ STAT_UL(ctx); be->connect_flags = flags; - proxy_event_thread_t *e = ctx->proxy_threads; + be->event_thread = e; pthread_mutex_lock(&e->mutex); STAILQ_INSERT_TAIL(&e->beconn_head_in, be, beconn_next); pthread_mutex_unlock(&e->mutex); @@ -272,8 +388,8 @@ } #endif + lua_pushvalue(L, 4); // push the label string back to the top. // Add this new backend connection to the object cache. - lua_pushlstring(L, bel->label, bel->llen); // put the label at the top for settable. lua_pushvalue(L, -2); // copy the backend reference to the top. // set our new backend wrapper object into the reference table. lua_settable(L, lua_upvalueindex(MCP_BACKEND_UPVALUE)); @@ -406,43 +522,43 @@ // UD now popped from stack. 
} -// p = mcp.pool(backends, { dist = f, hashfilter = f, seed = "a", hash = f }) -static int mcplib_pool(lua_State *L) { - int argc = lua_gettop(L); - luaL_checktype(L, 1, LUA_TTABLE); - int n = luaL_len(L, 1); // get length of array table - - size_t plen = sizeof(mcp_pool_t) + sizeof(mcp_pool_be_t) * n; - mcp_pool_t *p = lua_newuserdatauv(L, plen, 0); - // Zero the memory before use, so we can realibly use __gc to clean up - memset(p, 0, plen); - p->pool_size = n; - // TODO (v2): Nicer if this is fetched from mcp.default_key_hash - p->key_hasher = XXH3_64bits_withSeed; - pthread_mutex_init(&p->lock, NULL); - p->ctx = lua_touserdata(L, lua_upvalueindex(MCP_CONTEXT_UPVALUE)); - - luaL_setmetatable(L, "mcp.pool"); - - lua_pushvalue(L, -1); // dupe self for reference. - p->self_ref = luaL_ref(L, LUA_REGISTRYINDEX); - - // TODO (v2): move to after function check so we can find the right - // backend label to look up. +// in the proxy object, we can alias a ptr to the pool to where it needs to be +// based on worker number or io_thread right? +static void _mcplib_pool_make_be_loop(lua_State *L, mcp_pool_t *p, int offset, proxy_event_thread_t *t) { // remember lua arrays are 1 indexed. - for (int x = 1; x <= n; x++) { - mcp_pool_be_t *s = &p->pool[x-1]; + for (int x = 1; x <= p->pool_size; x++) { + mcp_pool_be_t *s = &p->pool[x-1 + (offset * p->pool_size)]; lua_geti(L, 1, x); // get next server into the stack. // If we bail here, the pool _gc() should handle releasing any backend // references we made so far. 
mcp_backend_label_t *bel = luaL_checkudata(L, -1, "mcp.backend"); // check label for pre-existing backend conn/wrapper + // TODO (v2): there're native ways of "from C make lua strings" + int toconcat = 1; + if (p->beprefix[0] != '\0') { + lua_pushstring(L, p->beprefix); + toconcat++; + } + if (p->use_iothread) { + lua_pushstring(L, ":io:"); + toconcat++; + } else { + lua_pushstring(L, ":w"); + lua_pushinteger(L, offset); + lua_pushstring(L, ":"); + toconcat += 3; + } + lua_pushlstring(L, bel->label, bel->llen); + lua_concat(L, toconcat); + + lua_pushvalue(L, -1); // copy the label string for the create method. mcp_backend_wrap_t *bew = _mcplib_backend_checkcache(L, bel); if (bew == NULL) { - bew = _mcplib_make_backendconn(L, bel); + bew = _mcplib_make_backendconn(L, bel, t); } s->be = bew->be; // unwrap the backend connection for direct ref. + bew->be->use_io_thread = p->use_iothread; // If found from cache or made above, the backend wrapper is on the // top of the stack, so we can now take its reference. @@ -451,11 +567,51 @@ s->ref = luaL_ref(L, LUA_REGISTRYINDEX); // references and pops object. lua_pop(L, 1); // pop the mcp.backend label object. + lua_pop(L, 1); // drop extra label copy. + } +} + +// call with table of backends in 1 +static void _mcplib_pool_make_be(lua_State *L, mcp_pool_t *p) { + if (p->use_iothread) { + proxy_ctx_t *ctx = settings.proxy_ctx; + _mcplib_pool_make_be_loop(L, p, 0, ctx->proxy_io_thread); + } else { + // TODO (v3) globals. + for (int n = 0; n < settings.num_threads; n++) { + LIBEVENT_THREAD *t = get_worker_thread(n); + _mcplib_pool_make_be_loop(L, p, t->thread_baseid, t->proxy_event_thread); + } } +} + +// p = mcp.pool(backends, { dist = f, hashfilter = f, seed = "a", hash = f }) +static int mcplib_pool(lua_State *L) { + int argc = lua_gettop(L); + luaL_checktype(L, 1, LUA_TTABLE); + int n = luaL_len(L, 1); // get length of array table + int workers = settings.num_threads; // TODO (v3): globals usage. 
+ + size_t plen = sizeof(mcp_pool_t) + (sizeof(mcp_pool_be_t) * n * workers); + mcp_pool_t *p = lua_newuserdatauv(L, plen, 0); + // Zero the memory before use, so we can realibly use __gc to clean up + memset(p, 0, plen); + p->pool_size = n; + p->use_iothread = true; + // TODO (v2): Nicer if this is fetched from mcp.default_key_hash + p->key_hasher = XXH3_64bits_withSeed; + pthread_mutex_init(&p->lock, NULL); + p->ctx = lua_touserdata(L, lua_upvalueindex(MCP_CONTEXT_UPVALUE)); + + luaL_setmetatable(L, "mcp.pool"); + + lua_pushvalue(L, -1); // dupe self for reference. + p->self_ref = luaL_ref(L, LUA_REGISTRYINDEX); // Allow passing an ignored nil as a second argument. Makes the lua easier int type = lua_type(L, 2); if (argc == 1 || type == LUA_TNIL) { + _mcplib_pool_make_be(L, p); lua_getglobal(L, "mcp"); // TODO (v2): decide on a mcp.default_dist and use that instead if (lua_getfield(L, -1, "dist_jump_hash") != LUA_TNIL) { @@ -472,6 +628,31 @@ // pool, then pass it along to the a constructor if necessary. luaL_checktype(L, 2, LUA_TTABLE); + if (lua_getfield(L, 2, "iothread") != LUA_TNIL) { + luaL_checktype(L, -1, LUA_TBOOLEAN); + int use_iothread = lua_toboolean(L, -1); + if (use_iothread) { + p->use_iothread = true; + } else { + p->use_iothread = false; + } + lua_pop(L, 1); // remove value. + } else { + lua_pop(L, 1); // pop the nil. + } + + if (lua_getfield(L, 2, "beprefix") != LUA_TNIL) { + luaL_checktype(L, -1, LUA_TSTRING); + size_t len = 0; + const char *bepfx = lua_tolstring(L, -1, &len); + memcpy(p->beprefix, bepfx, len); + p->beprefix[len+1] = '\0'; + lua_pop(L, 1); // pop beprefix string. + } else { + lua_pop(L, 1); // pop the nil. + } + _mcplib_pool_make_be(L, p); + // stack: backends, options, mcp.pool if (lua_getfield(L, 2, "dist") != LUA_TNIL) { // overriding the distribution function. @@ -479,6 +660,17 @@ lua_pop(L, 1); // remove the dist table from stack. } else { lua_pop(L, 1); // pop the nil. 
+ + // use the default dist if not specified with an override table. + lua_getglobal(L, "mcp"); + // TODO (v2): decide on a mcp.default_dist and use that instead + if (lua_getfield(L, -1, "dist_jump_hash") != LUA_TNIL) { + _mcplib_pool_dist(L, p); + lua_pop(L, 1); // pop "dist_jump_hash" value. + } else { + lua_pop(L, 1); + } + lua_pop(L, 1); // pop "mcp" } if (lua_getfield(L, 2, "filter") != LUA_TNIL) { @@ -558,7 +750,8 @@ return 0; } -mcp_backend_t *mcplib_pool_proxy_call_helper(lua_State *L, mcp_pool_t *p, const char *key, size_t len) { +mcp_backend_t *mcplib_pool_proxy_call_helper(lua_State *L, mcp_pool_proxy_t *pp, const char *key, size_t len) { + mcp_pool_t *p = pp->main; if (p->key_filter) { key = p->key_filter(p->key_filter_conf, key, len, &len); P_DEBUG("%s: filtered key for hashing (%.*s)\n", __func__, (int)len, key); @@ -574,7 +767,7 @@ proxy_lua_error(L, "key dist hasher tried to use out of bounds index"); } - return p->pool[lookup].be; + return pp->pool[lookup].be; } // hashfunc(request) -> backend(request) @@ -582,7 +775,6 @@ static int mcplib_pool_proxy_call(lua_State *L) { // internal args are the hash selector (self) mcp_pool_proxy_t *pp = luaL_checkudata(L, -2, "mcp.pool_proxy"); - mcp_pool_t *p = pp->main; // then request object. mcp_request_t *rq = luaL_checkudata(L, -1, "mcp.request"); @@ -593,10 +785,11 @@ } const char *key = MCP_PARSER_KEY(rq->pr); size_t len = rq->pr.klen; - rq->be = mcplib_pool_proxy_call_helper(L, p, key, len); + rq->be = mcplib_pool_proxy_call_helper(L, pp, key, len); // now yield request, pool up. - return lua_yield(L, 2); + lua_pushinteger(L, MCP_YIELD_POOL); + return lua_yield(L, 3); } static int mcplib_tcp_keepalive(lua_State *L) { @@ -627,10 +820,6 @@ return 0; } -// sad, I had to look this up... 
-#define NANOSECONDS(x) ((x) * 1E9 + 0.5) -#define MICROSECONDS(x) ((x) * 1E6 + 0.5) - static int mcplib_backend_connect_timeout(lua_State *L) { lua_Number secondsf = luaL_checknumber(L, -1); lua_Integer secondsi = (lua_Integer) secondsf; @@ -723,6 +912,11 @@ for (int x = loop_start; x < loop_end; x++) { struct proxy_hook *h = &hooks[x]; + if (x == CMD_MN) { + // disallow overriding MN so client pipeline flushes work. + // need to add flush support before allowing override + continue; + } lua_pushvalue(L, 2); // duplicate the function for the ref. if (tag) { @@ -1001,6 +1195,7 @@ }; const struct luaL_Reg mcplib_f [] = { + {"internal", mcplib_internal}, {"pool", mcplib_pool}, {"backend", mcplib_backend}, {"request", mcplib_request}, @@ -1008,6 +1203,7 @@ {"add_stat", mcplib_add_stat}, {"stat", mcplib_stat}, {"await", mcplib_await}, + {"await_logerrors", mcplib_await_logerrors}, {"log", mcplib_log}, {"log_req", mcplib_log_req}, {"log_reqsample", mcplib_log_reqsample}, diff -Nru memcached-1.6.18/proxy_network.c memcached-1.6.19/proxy_network.c --- memcached-1.6.18/proxy_network.c 2023-01-11 06:10:10.000000000 +0000 +++ memcached-1.6.19/proxy_network.c 2023-03-08 21:34:27.000000000 +0000 @@ -40,14 +40,18 @@ static void proxy_beconn_handler(const int fd, const short which, void *arg); static void proxy_event_handler(evutil_socket_t fd, short which, void *arg); static void proxy_event_beconn(evutil_socket_t fd, short which, void *arg); -static void proxy_event_updater(evutil_socket_t fd, short which, void *arg); static int _prep_pending_write(mcp_backend_t *be); -static bool _post_pending_write(mcp_backend_t *be, ssize_t sent); +static void _post_pending_write(mcp_backend_t *be, ssize_t sent); static int _flush_pending_write(mcp_backend_t *be); static void _cleanup_backend(mcp_backend_t *be); static int _reset_bad_backend(mcp_backend_t *be, enum proxy_be_failures err); static void _backend_failed(mcp_backend_t *be); -static void _set_event(mcp_backend_t *be, struct 
event_base *base, int flags, struct timeval t, event_callback_fn callback); +static void _set_main_event(mcp_backend_t *be, struct event_base *base, int flags, struct timeval *t, event_callback_fn callback); +static void _stop_main_event(mcp_backend_t *be); +static void _start_write_event(mcp_backend_t *be); +static void _stop_write_event(mcp_backend_t *be); +static void _start_timeout_event(mcp_backend_t *be); +static void _stop_timeout_event(mcp_backend_t *be); static int proxy_backend_drive_machine(mcp_backend_t *be); /* Helper routines common to io_uring and libevent modes */ @@ -129,17 +133,8 @@ // paranoia about moving items between lists. io->io_next.stqe_next = NULL; - // Need to check on await's before looking at backends, in case it - // doesn't have one. - // Here we're letting an await resume without waiting on the network. - if (io->await_background) { - return_io_pending((io_pending_t *)io); - continue; - } - mcp_backend_t *be = io->backend; // So the backend can retrieve its event base. - be->event_thread = t; if (be->bad) { P_DEBUG("%s: fast failing request to bad backend\n", __func__); io->client_resp->status = MCMC_ERR; @@ -147,9 +142,6 @@ continue; } STAILQ_INSERT_TAIL(&be->io_head, io, io_next); - if (be->io_next == NULL) { - be->io_next = io; // set write flush starting point. 
- } be->depth++; io_count++; if (!be->stacked) { @@ -174,10 +166,7 @@ static void _proxy_evthr_evset_be_readvalidate(mcp_backend_t *be, char *buf, size_t len, struct __kernel_timespec *ts); static void _proxy_evthr_evset_notifier(proxy_event_thread_t *t); static void _proxy_evthr_evset_benotifier(proxy_event_thread_t *t); -static void _proxy_evthr_evset_clock(proxy_event_thread_t *t); -static void proxy_event_updater_ur(void *udata, struct io_uring_cqe *cqe); static void _backend_failed_ur(mcp_backend_t *be); -struct __kernel_timespec updater_ts = {.tv_sec = 3, .tv_nsec = 0}; static void _flush_pending_write_ur(mcp_backend_t *be) { // Allow us to be called with an empty stack to prevent dev errors. @@ -188,7 +177,7 @@ int iovcnt = _prep_pending_write(be); // TODO: write timeout. - _proxy_evthr_evset_be_writev(be, iovcnt, &be->event_thread->tunables.read_ur); + _proxy_evthr_evset_be_writev(be, iovcnt, &be->tunables.read_ur); } // TODO: we shouldn't handle reads if a write is pending, so postwrite should @@ -203,7 +192,7 @@ // FIXME: sent == 0 is disconnected? I keep forgetting. if (sent == -EAGAIN || sent == -EWOULDBLOCK) { // didn't do any writing, wait for a writeable socket. - _proxy_evthr_evset_be_wrpoll(be, &be->event_thread->tunables.read_ur); + _proxy_evthr_evset_be_wrpoll(be, &be->tunables.read_ur); } else { _reset_bad_backend(be, P_BE_FAIL_WRITING); _backend_failed_ur(be); @@ -212,31 +201,19 @@ if (_post_pending_write(be, sent)) { // commands were flushed, set read handler. - _proxy_evthr_evset_be_read(be, be->rbuf+be->rbufused, READ_BUFFER_SIZE-be->rbufused, &be->event_thread->tunables.read_ur); + _proxy_evthr_evset_be_read(be, be->rbuf+be->rbufused, READ_BUFFER_SIZE-be->rbufused, &be->tunables.read_ur); } if (be->io_next) { // still have unflushed commands, re-run write command. // writev can't "block if EAGAIN" in io_uring so far as I can tell, so // we have to switch to polling mode here. 
- _proxy_evthr_evset_be_wrpoll(be, &be->event_thread->tunables.read_ur); + _proxy_evthr_evset_be_wrpoll(be, &be->tunables.read_ur); } // TODO: if rbufused != 0, push through drive machine? } -static void proxy_event_updater_ur(void *udata, struct io_uring_cqe *cqe) { - proxy_event_thread_t *t = udata; - proxy_ctx_t *ctx = t->ctx; - - _proxy_evthr_evset_clock(t); - - // we reuse the "global stats" lock since it's hardly ever used. - STAT_L(ctx); - memcpy(&t->tunables, &ctx->tunables, sizeof(t->tunables)); - STAT_UL(ctx); -} - // No-op at the moment. when the linked timeout fires uring returns the // linked request (read/write/poll/etc) with an interrupted/timeout/cancelled // error. So we don't need to explicitly handle timeouts. @@ -248,7 +225,7 @@ static void proxy_backend_retry_handler_ur(void *udata, struct io_uring_cqe *cqe) { mcp_backend_t *be = udata; - _proxy_evthr_evset_be_conn(be, &be->event_thread->tunables.connect_ur); + _proxy_evthr_evset_be_conn(be, &be->tunables.connect_ur); } static void _proxy_evthr_evset_be_retry(mcp_backend_t *be) { @@ -262,19 +239,19 @@ sqe = io_uring_get_sqe(&be->event_thread->ring); // TODO (v2): NULL? 
- io_uring_prep_timeout(sqe, &be->event_thread->tunables.retry_ur, 0, 0); + io_uring_prep_timeout(sqe, &be->tunables.retry_ur, 0, 0); io_uring_sqe_set_data(sqe, &be->ur_te_ev); be->ur_te_ev.set = true; } static void _backend_failed_ur(mcp_backend_t *be) { - if (++be->failed_count > be->event_thread->tunables.backend_failure_limit) { + if (++be->failed_count > be->tunables.backend_failure_limit) { P_DEBUG("%s: marking backend as bad\n", __func__); be->bad = true; _proxy_evthr_evset_be_retry(be); STAT_INCR(be->event_thread->ctx, backend_marked_bad, 1); } else { - _proxy_evthr_evset_be_conn(be, &be->event_thread->tunables.connect_ur); + _proxy_evthr_evset_be_conn(be, &be->tunables.connect_ur); STAT_INCR(be->event_thread->ctx, backend_failed, 1); } } @@ -301,7 +278,7 @@ // TODO (v2): when exactly do we need to reset the backend handler? if (!STAILQ_EMPTY(&be->io_head)) { - _proxy_evthr_evset_be_read(be, be->rbuf+be->rbufused, READ_BUFFER_SIZE-be->rbufused, &be->event_thread->tunables.read_ur); + _proxy_evthr_evset_be_read(be, be->rbuf+be->rbufused, READ_BUFFER_SIZE-be->rbufused, &be->tunables.read_ur); } } @@ -311,7 +288,7 @@ be->can_write = true; _flush_pending_write_ur(be); - _proxy_evthr_evset_be_read(be, be->rbuf+be->rbufused, READ_BUFFER_SIZE-be->rbufused, &be->event_thread->tunables.read_ur); + _proxy_evthr_evset_be_read(be, be->rbuf+be->rbufused, READ_BUFFER_SIZE-be->rbufused, &be->tunables.read_ur); } // a backend with an outstanding new connection has become writeable. @@ -355,7 +332,7 @@ // TODO: make validation optional. // set next handler on recv for validity check. 
- _proxy_evthr_evset_be_readvalidate(be, be->rbuf, READ_BUFFER_SIZE, &be->event_thread->tunables.read_ur); + _proxy_evthr_evset_be_readvalidate(be, be->rbuf, READ_BUFFER_SIZE, &be->tunables.read_ur); } // TODO: share more code with proxy_beconn_handler @@ -438,7 +415,6 @@ _cleanup_backend(be); } else { be->transferred = true; - be->event_thread = t; int status = mcmc_connect(be->client, be->name, be->port, be->connect_flags); if (status == MCMC_CONNECTING || status == MCMC_CONNECTED) { // if we're already connected for some reason, still push it @@ -446,7 +422,7 @@ // will auto-wake because the socket is writeable. be->connecting = true; be->can_write = false; - _proxy_evthr_evset_be_conn(be, &t->tunables.connect_ur); + _proxy_evthr_evset_be_conn(be, &be->tunables.connect_ur); } else { _reset_bad_backend(be, P_BE_FAIL_CONNECTING); _backend_failed_ur(be); @@ -657,17 +633,6 @@ io_uring_sqe_set_data(sqe, NULL); }*/ -static void _proxy_evthr_evset_clock(proxy_event_thread_t *t) { - struct io_uring_sqe *sqe; - - sqe = io_uring_get_sqe(&t->ring); - // FIXME (v2): NULL? - - io_uring_prep_timeout(sqe, &updater_ts, 0, 0); - io_uring_sqe_set_data(sqe, &t->ur_clock_event); - t->ur_clock_event.set = true; -} - static void _proxy_evthr_evset_benotifier(proxy_event_thread_t *t) { struct io_uring_sqe *sqe; P_DEBUG("%s: setting: %d\n", __func__, t->ur_benotify_event.set); @@ -748,28 +713,6 @@ } #endif // HAVE_LIBURING -// We need to get timeout/retry/etc updates to the event thread(s) -// occasionally. I'd like to have a better inteface around this where updates -// are shipped directly; but this is good enough to start with. -static void proxy_event_updater(evutil_socket_t fd, short which, void *arg) { - proxy_event_thread_t *t = arg; - proxy_ctx_t *ctx = t->ctx; - - // TODO (v2): double check how much of this boilerplate is still necessary? - // reschedule the clock event. 
- evtimer_del(&t->clock_event); - - evtimer_set(&t->clock_event, proxy_event_updater, t); - event_base_set(t->base, &t->clock_event); - struct timeval rate = {.tv_sec = 3, .tv_usec = 0}; - evtimer_add(&t->clock_event, &rate); - - // we reuse the "global stats" lock since it's hardly ever used. - STAT_L(ctx); - memcpy(&t->tunables, &ctx->tunables, sizeof(t->tunables)); - STAT_UL(ctx); -} - static void _cleanup_backend(mcp_backend_t *be) { #ifdef HAVE_LIBURING if (be->event_thread->use_uring) { @@ -777,12 +720,17 @@ } else { #endif // remove any pending events. - int pending = 0; - if (event_initialized(&be->event)) { - pending = event_pending(&be->event, EV_READ|EV_WRITE|EV_TIMEOUT, NULL); + int pending = event_pending(&be->main_event, EV_READ|EV_WRITE|EV_TIMEOUT, NULL); + if ((pending & (EV_READ|EV_WRITE|EV_TIMEOUT)) != 0) { + event_del(&be->main_event); // an error to call event_del() without event. } + pending = event_pending(&be->write_event, EV_READ|EV_WRITE|EV_TIMEOUT, NULL); if ((pending & (EV_READ|EV_WRITE|EV_TIMEOUT)) != 0) { - event_del(&be->event); // an error to call event_del() without event. + event_del(&be->write_event); // an error to call event_del() without event. + } + pending = event_pending(&be->timeout_event, EV_TIMEOUT, NULL); + if ((pending & (EV_TIMEOUT)) != 0) { + event_del(&be->timeout_event); // an error to call event_del() without event. } #ifdef HAVE_LIBURING } @@ -820,7 +768,6 @@ #endif beconn_head_t head; - struct timeval tmp_time = t->tunables.connect; STAILQ_INIT(&head); pthread_mutex_lock(&t->mutex); @@ -845,15 +792,22 @@ _cleanup_backend(be); } else { be->transferred = true; - be->event_thread = t; + // assign the initial events to the backend, so we don't have to + // constantly check if they were initialized yet elsewhere. + // note these events will not fire until event_add() is called. 
int status = mcmc_connect(be->client, be->name, be->port, be->connect_flags); + event_assign(&be->main_event, be->event_thread->base, mcmc_fd(be->client), EV_WRITE|EV_TIMEOUT, proxy_beconn_handler, be); + event_assign(&be->write_event, be->event_thread->base, mcmc_fd(be->client), EV_WRITE|EV_TIMEOUT, proxy_backend_handler, be); + event_assign(&be->timeout_event, be->event_thread->base, -1, EV_TIMEOUT, proxy_backend_handler, be); + if (status == MCMC_CONNECTING || status == MCMC_CONNECTED) { // if we're already connected for some reason, still push it // through the connection handler to keep the code unified. It // will auto-wake because the socket is writeable. be->connecting = true; be->can_write = false; - _set_event(be, t->base, EV_WRITE|EV_TIMEOUT, tmp_time, proxy_beconn_handler); + // kick off the event we intialized above. + event_add(&be->main_event, &be->tunables.connect); } else { _reset_bad_backend(be, P_BE_FAIL_CONNECTING); _backend_failed(be); @@ -862,6 +816,44 @@ } } +void proxy_run_backend_queue(be_head_t *head) { + mcp_backend_t *be; + STAILQ_FOREACH(be, head, be_next) { + be->stacked = false; + int flags = 0; + + if (be->bad) { + // flush queue if backend is still bad. 
+ // TODO: duplicated from _reset_bad_backend() + io_pending_proxy_t *io = NULL; + while (!STAILQ_EMPTY(&be->io_head)) { + io = STAILQ_FIRST(&be->io_head); + STAILQ_REMOVE_HEAD(&be->io_head, io_next); + io->client_resp->status = MCMC_ERR; + be->depth--; + return_io_pending((io_pending_t *)io); + } + } else if (be->connecting || be->validating) { + P_DEBUG("%s: deferring IO pending connecting (%s:%s)\n", __func__, be->name, be->port); + } else { + flags = _flush_pending_write(be); + + if (flags == -1) { + _reset_bad_backend(be, P_BE_FAIL_WRITING); + _backend_failed(be); + } else if (flags & EV_WRITE) { + // only get here because we need to kick off the write handler + _start_write_event(be); + } + + if (be->pending_read) { + _start_timeout_event(be); + } + + } + } +} + // event handler for executing backend requests static void proxy_event_handler(evutil_socket_t fd, short which, void *arg) { proxy_event_thread_t *t = arg; @@ -893,29 +885,7 @@ } // Re-walk each backend and check set event as required. - mcp_backend_t *be = NULL; - struct timeval tmp_time = t->tunables.read; - - // FIXME (v2): _set_event() is buggy, see notes on function. - STAILQ_FOREACH(be, &t->be_head, be_next) { - be->stacked = false; - int flags = 0; - - if (be->connecting || be->validating) { - P_DEBUG("%s: deferring IO pending connecting (%s:%s)\n", __func__, be->name, be->port); - } else { - flags = _flush_pending_write(be); - - if (flags == -1) { - _reset_bad_backend(be, P_BE_FAIL_WRITING); - _backend_failed(be); - } else { - flags = be->can_write ? EV_READ|EV_TIMEOUT : EV_READ|EV_WRITE|EV_TIMEOUT; - _set_event(be, t->base, flags, tmp_time, proxy_backend_handler); - } - } - } - + proxy_run_backend_queue(&t->be_head); } void *proxy_event_thread(void *arg) { @@ -930,28 +900,58 @@ return NULL; } -// FIXME (v2): if we use the newer API the various pending checks can be adjusted. 
-static void _set_event(mcp_backend_t *be, struct event_base *base, int flags, struct timeval t, event_callback_fn callback) { - // FIXME (v2): chicken and egg. - // can't check if pending if the structure is was calloc'ed (sigh) - // don't want to double test here. should be able to event_assign but - // not add anything during initialization, but need the owner thread's - // event base. - int pending = 0; - if (event_initialized(&be->event)) { - pending = event_pending(&be->event, EV_READ|EV_WRITE|EV_TIMEOUT, NULL); - } - if ((pending & (EV_READ|EV_WRITE|EV_TIMEOUT)) != 0) { - event_del(&be->event); // replace existing event. +static void _set_main_event(mcp_backend_t *be, struct event_base *base, int flags, struct timeval *t, event_callback_fn callback) { + int pending = event_pending(&be->main_event, EV_READ|EV_WRITE, NULL); + if ((pending & (EV_READ|EV_WRITE)) != 0) { + event_del(&be->main_event); // replace existing event. } - // if we can't write, we could be connecting. - // TODO (v2): always check for READ in case some commands were sent - // successfully? The flags could be tracked on *be and reset in the - // handler, perhaps? - event_assign(&be->event, base, mcmc_fd(be->client), + event_assign(&be->main_event, base, mcmc_fd(be->client), flags, callback, be); - event_add(&be->event, &t); + event_add(&be->main_event, t); +} + +static void _stop_main_event(mcp_backend_t *be) { + int pending = event_pending(&be->main_event, EV_READ|EV_WRITE, NULL); + if ((pending & (EV_READ|EV_WRITE|EV_TIMEOUT)) == 0) { + return; + } + event_del(&be->write_event); +} + +static void _start_write_event(mcp_backend_t *be) { + int pending = event_pending(&be->main_event, EV_WRITE, NULL); + if ((pending & (EV_WRITE|EV_TIMEOUT)) != 0) { + return; + } + // FIXME: wasn't there a write timeout? 
+ event_add(&be->write_event, &be->tunables.read); +} + +static void _stop_write_event(mcp_backend_t *be) { + int pending = event_pending(&be->main_event, EV_WRITE, NULL); + if ((pending & (EV_WRITE|EV_TIMEOUT)) == 0) { + return; + } + event_del(&be->write_event); +} + +// handle the read timeouts with a side event, so we can stick with a +// persistent listener (optimization + catch disconnects faster) +static void _start_timeout_event(mcp_backend_t *be) { + int pending = event_pending(&be->timeout_event, EV_TIMEOUT, NULL); + if ((pending & (EV_TIMEOUT)) != 0) { + return; + } + event_add(&be->timeout_event, &be->tunables.read); +} + +static void _stop_timeout_event(mcp_backend_t *be) { + int pending = event_pending(&be->timeout_event, EV_TIMEOUT, NULL); + if ((pending & (EV_TIMEOUT)) == 0) { + return; + } + event_del(&be->timeout_event); } // NOTES: @@ -966,6 +966,7 @@ static int proxy_backend_drive_machine(mcp_backend_t *be) { bool stop = false; io_pending_proxy_t *p = NULL; + struct timeval end; int flags = 0; p = STAILQ_FIRST(&be->io_head); @@ -1037,6 +1038,11 @@ if (p->ascii_multiget && r->resp.type == MCMC_RESP_END) { // Ascii multiget hack mode; consume END's + be->rbufused -= r->resp.reslen; + if (be->rbufused > 0) { + memmove(be->rbuf, be->rbuf+r->resp.reslen, be->rbufused); + } + be->state = mcp_backend_next; break; } @@ -1134,7 +1140,7 @@ memcpy(r->buf+r->bread, be->rbuf, tocopy); r->bread += tocopy; - if (r->bread >= r->resp.vlen) { + if (r->bread >= r->blen) { // all done copying data. if (r->resp.type == MCMC_RESP_GET) { be->state = mcp_backend_read_end; @@ -1160,6 +1166,13 @@ // set the head here. when we break the head will be correct. STAILQ_REMOVE_HEAD(&be->io_head, io_next); be->depth--; + be->pending_read--; + + // stamp the elapsed time into the response object. 
+ gettimeofday(&end, NULL); + p->client_resp->elapsed = (end.tv_sec - p->client_resp->start.tv_sec) * 1000000 + + (end.tv_usec - p->client_resp->start.tv_usec); + // have to do the q->count-- and == 0 and redispatch_conn() // stuff here. The moment we call return_io here we // don't own *p anymore. @@ -1204,37 +1217,59 @@ return flags; } +static void _backend_reconnect(mcp_backend_t *be) { + int status = mcmc_connect(be->client, be->name, be->port, be->connect_flags); + if (status == MCMC_CONNECTED) { + // TODO (v2): unexpected but lets let it be here. + be->connecting = false; + be->can_write = true; + } else if (status == MCMC_CONNECTING) { + be->connecting = true; + be->can_write = false; + } else { + // TODO (v2): failed to immediately re-establish the connection. + // need to put the BE into a bad/retry state. + // FIXME (v2): until we get an event to specifically handle connecting and + // bad server handling, attempt to force a reconnect here the next + // time a request comes through. + // The event thread will attempt to write to the backend, fail, then + // end up in this routine again. + be->connecting = false; + be->can_write = true; + } + // re-create the write handler for the new file descriptor. + // the main event will be re-assigned after this call. + event_assign(&be->write_event, be->event_thread->base, mcmc_fd(be->client), EV_WRITE|EV_TIMEOUT, proxy_backend_handler, be); + // do not need to re-assign the timer event because it's not tied to fd +} + // All we need to do here is schedule the backend to attempt to connect again. 
static void proxy_backend_retry_handler(const int fd, const short which, void *arg) { mcp_backend_t *be = arg; assert(which & EV_TIMEOUT); - struct timeval tmp_time = be->event_thread->tunables.retry; - _set_event(be, be->event_thread->base, EV_WRITE|EV_TIMEOUT, tmp_time, proxy_beconn_handler); + struct timeval tmp_time = be->tunables.retry; + _backend_reconnect(be); + _set_main_event(be, be->event_thread->base, EV_WRITE, &tmp_time, proxy_beconn_handler); } -// currently just for timeouts, but certain errors should consider a backend -// to be "bad" as well. // must be called after _reset_bad_backend(), so the backend is currently // clear. -// TODO (v2): currently only notes for "bad backends" in cases of timeouts or -// connect failures. We need a specific connect() handler that executes a -// "version" call to at least check that the backend isn't speaking garbage. -// In theory backends can fail such that responses are constantly garbage, -// but it's more likely an app is doing something bad and culling the backend -// may prevent any other clients from talking to that backend. In -// that case we need to track if clients are causing errors consistently and -// block them instead. 
That's more challenging so leaving a note instead -// of doing this now :) +// TODO (v2): extra counter for "backend connect tries" so it's still possible +// to see dead backends exist static void _backend_failed(mcp_backend_t *be) { - struct timeval tmp_time = be->event_thread->tunables.retry; - if (++be->failed_count > be->event_thread->tunables.backend_failure_limit) { - P_DEBUG("%s: marking backend as bad\n", __func__); + struct timeval tmp_time = be->tunables.retry; + if (++be->failed_count > be->tunables.backend_failure_limit) { + if (!be->bad) { + P_DEBUG("%s: marking backend as bad\n", __func__); + STAT_INCR(be->event_thread->ctx, backend_marked_bad, 1); + LOGGER_LOG(NULL, LOG_PROXYEVENTS, LOGGER_PROXY_BE_ERROR, NULL, "markedbad", be->name, be->port, 0, NULL, 0); + } be->bad = true; - _set_event(be, be->event_thread->base, EV_TIMEOUT, tmp_time, proxy_backend_retry_handler); - STAT_INCR(be->event_thread->ctx, backend_marked_bad, 1); + _set_main_event(be, be->event_thread->base, EV_TIMEOUT, &tmp_time, proxy_backend_retry_handler); } else { STAT_INCR(be->event_thread->ctx, backend_failed, 1); - _set_event(be, be->event_thread->base, EV_WRITE|EV_TIMEOUT, tmp_time, proxy_beconn_handler); + _backend_reconnect(be); + _set_main_event(be, be->event_thread->base, EV_WRITE, &tmp_time, proxy_beconn_handler); } } @@ -1250,6 +1285,7 @@ io_pending_proxy_t *io = NULL; // Can't use STAILQ_FOREACH() since return_io_pending() free's the current // io. STAILQ_FOREACH_SAFE maybe? + int depth = be->depth; while (!STAILQ_EMPTY(&be->io_head)) { io = STAILQ_FIRST(&be->io_head); STAILQ_REMOVE_HEAD(&be->io_head, io_next); @@ -1263,30 +1299,20 @@ STAILQ_INIT(&be->io_head); be->io_next = NULL; // also reset the write offset. + // Only log if we don't already know it's messed up. + if (!be->bad) { + LOGGER_LOG(NULL, LOG_PROXYEVENTS, LOGGER_PROXY_BE_ERROR, NULL, proxy_be_failure_text[err], be->name, be->port, depth, be->rbuf, be->rbufused); + } + // reset buffer to blank state. 
be->rbufused = 0; + be->pending_read = 0; + // allow the _backend_failed() routine to connect when ready. + _stop_write_event(be); + _stop_main_event(be); + _stop_timeout_event(be); mcmc_disconnect(be->client); - int status = mcmc_connect(be->client, be->name, be->port, be->connect_flags); - if (status == MCMC_CONNECTED) { - // TODO (v2): unexpected but lets let it be here. - be->connecting = false; - be->can_write = true; - } else if (status == MCMC_CONNECTING) { - be->connecting = true; - be->can_write = false; - } else { - // TODO (v2): failed to immediately re-establish the connection. - // need to put the BE into a bad/retry state. - // FIXME (v2): until we get an event to specifically handle connecting and - // bad server handling, attempt to force a reconnect here the next - // time a request comes through. - // The event thread will attempt to write to the backend, fail, then - // end up in this routine again. - be->connecting = false; - be->can_write = true; - } - - LOGGER_LOG(NULL, LOG_PROXYEVENTS, LOGGER_PROXY_BE_ERROR, NULL, proxy_be_failure_text[err], be->name, be->port); + // we leave the main event alone, because be_failed() always overwrites. return 0; } @@ -1297,9 +1323,10 @@ int iovused = 0; if (be->io_next == NULL) { // separate pointer for how far into the list we've flushed. - be->io_next = STAILQ_FIRST(&be->io_head); + io = STAILQ_FIRST(&be->io_head); + } else { + io = be->io_next; } - io = be->io_next; assert(io != NULL); for (; io; io = STAILQ_NEXT(io, io_next)) { // TODO (v2): paranoia for now, but this check should never fire @@ -1318,16 +1345,16 @@ } // returns true if any pending writes were fully flushed. 
-static bool _post_pending_write(mcp_backend_t *be, ssize_t sent) { +static void _post_pending_write(mcp_backend_t *be, ssize_t sent) { io_pending_proxy_t *io = be->io_next; - assert(io != NULL); + if (io == NULL) { + io = STAILQ_FIRST(&be->io_head); + } - bool did_flush = false; for (; io; io = STAILQ_NEXT(io, io_next)) { bool flushed = true; if (io->flushed) continue; - if (sent >= io->iovbytes) { // short circuit for common case. sent -= io->iovbytes; @@ -1348,11 +1375,10 @@ } } io->flushed = flushed; - if (flushed) { - did_flush = flushed; - be->io_next = STAILQ_NEXT(io, io_next); + be->pending_read++; } + if (sent <= 0) { // really shouldn't be negative, though. assert(sent >= 0); @@ -1360,7 +1386,12 @@ } } // for - return did_flush; + // resume the flush from this point. + if (io != NULL && !io->flushed) { + be->io_next = io; + } else { + be->io_next = NULL; + } } static int _flush_pending_write(mcp_backend_t *be) { @@ -1374,11 +1405,10 @@ ssize_t sent = writev(mcmc_fd(be->client), be->write_iovs, iovcnt); if (sent > 0) { - if (_post_pending_write(be, sent)) { - flags |= EV_READ; - } + _post_pending_write(be, sent); // still have unflushed pending IO's, check for write and re-loop. if (be->io_next) { + be->can_write = false; flags |= EV_WRITE; } } else if (sent == -1) { @@ -1398,7 +1428,7 @@ assert(arg != NULL); mcp_backend_t *be = arg; int flags = EV_TIMEOUT; - struct timeval tmp_time = be->event_thread->tunables.read; + struct timeval tmp_time = be->tunables.read; if (which & EV_TIMEOUT) { P_DEBUG("%s: backend timed out while connecting\n", __func__); @@ -1414,7 +1444,7 @@ if (_proxy_beconn_checkconnect(be) == -1) { return; } - _set_event(be, be->event_thread->base, EV_READ, tmp_time, proxy_beconn_handler); + _set_main_event(be, be->event_thread->base, EV_READ, &tmp_time, proxy_beconn_handler); } // TODO: currently never taken, until validation is made optional. @@ -1426,6 +1456,7 @@ return; } flags |= res; + // FIXME: set write event? 
} } @@ -1442,7 +1473,7 @@ // Needed more data for a version line, somehow. I feel like // this should set off some alarms, but it is possible. if (r.code == MCMC_WANT_READ) { - _set_event(be, be->event_thread->base, EV_READ, tmp_time, proxy_beconn_handler); + _set_main_event(be, be->event_thread->base, EV_READ, &tmp_time, proxy_beconn_handler); return; } @@ -1471,7 +1502,7 @@ _backend_failed(be); return; } - _set_event(be, be->event_thread->base, EV_READ, tmp_time, proxy_beconn_handler); + _set_main_event(be, be->event_thread->base, EV_READ, &tmp_time, proxy_beconn_handler); return; } @@ -1482,14 +1513,18 @@ _backend_failed(be); return; } - flags |= res; + if (flags & EV_WRITE) { + _start_write_event(be); + } + if (be->pending_read) { + _start_timeout_event(be); + } } - // Still pending requests to read or write. - if (!be->validating && !STAILQ_EMPTY(&be->io_head)) { - _set_event(be, be->event_thread->base, flags, tmp_time, proxy_backend_handler); + // switch to the primary persistent read event. + if (!be->validating) { + _set_main_event(be, be->event_thread->base, EV_READ|EV_PERSIST, NULL, proxy_backend_handler); } - } // The libevent backend callback handler. @@ -1497,8 +1532,6 @@ // state. static void proxy_backend_handler(const int fd, const short which, void *arg) { mcp_backend_t *be = arg; - int flags = EV_TIMEOUT; - struct timeval tmp_time = be->event_thread->tunables.read; if (which & EV_TIMEOUT) { P_DEBUG("%s: timeout received, killing backend queue\n", __func__); @@ -1515,10 +1548,14 @@ _backend_failed(be); return; } - flags |= res; + if (res & EV_WRITE) { + _start_write_event(be); + } } if (which & EV_READ) { + // got a read event, always kill the pending read timer. + _stop_timeout_event(be); // We do the syscall here before diving into the state machine to allow a // common code path for io_uring/epoll int read = recv(mcmc_fd(be->client), be->rbuf + be->rbufused, @@ -1552,17 +1589,60 @@ #endif } - // Still pending requests to read or write. 
- if (!STAILQ_EMPTY(&be->io_head)) { - flags |= EV_READ; // FIXME (v2): might not be necessary here, but ensures we get a disconnect event. - _set_event(be, be->event_thread->base, flags, tmp_time, proxy_backend_handler); + if (be->pending_read) { + _start_timeout_event(be); } } // TODO (v2): IORING_SETUP_ATTACH_WQ port from bench_event once we have multiple // event threads. -void proxy_init_evthread_events(proxy_event_thread_t *t) { +// TODO: this either needs a restructure or split into two funcs: +// 1) for the IO thread which creates its own ring/event base +// 2) for the worker thread which reuses the event base. +// io_uring will probably only work for the IO thread which makes further +// exceptions. +void proxy_init_event_thread(proxy_event_thread_t *t, proxy_ctx_t *ctx, struct event_base *base) { + t->ctx = ctx; +#ifdef USE_EVENTFD + t->event_fd = eventfd(0, EFD_NONBLOCK); + if (t->event_fd == -1) { + perror("failed to create backend notify eventfd"); + exit(1); + } + t->be_event_fd = eventfd(0, EFD_NONBLOCK); + if (t->be_event_fd == -1) { + perror("failed to create backend notify eventfd"); + exit(1); + } +#else + int fds[2]; + if (pipe(fds)) { + perror("can't create proxy backend notify pipe"); + exit(1); + } + + t->notify_receive_fd = fds[0]; + t->notify_send_fd = fds[1]; + + if (pipe(fds)) { + perror("can't create proxy backend connection notify pipe"); + exit(1); + } + t->be_notify_receive_fd = fds[0]; + t->be_notify_send_fd = fds[1]; +#endif + + // incoming request queue. + STAILQ_INIT(&t->io_head_in); + STAILQ_INIT(&t->beconn_head_in); + pthread_mutex_init(&t->mutex, NULL); + pthread_cond_init(&t->cond, NULL); + + // initialize the event system. + #ifdef HAVE_LIBURING + fprintf(stderr, "Sorry, io_uring not supported right now\n"); + abort(); bool use_uring = t->ctx->use_uring; struct io_uring_params p = {0}; assert(t->event_fd); // uring only exists where eventfd also does. 
@@ -1610,12 +1690,6 @@ t->ur_benotify_event.set = false; _proxy_evthr_evset_benotifier(t); - // periodic data updater for event thread - t->ur_clock_event.cb = proxy_event_updater_ur; - t->ur_clock_event.udata = t; - t->ur_clock_event.set = false; - _proxy_evthr_evset_clock(t); - t->use_uring = true; return; } else { @@ -1628,14 +1702,19 @@ } #endif - struct event_config *ev_config; - ev_config = event_config_new(); - event_config_set_flag(ev_config, EVENT_BASE_FLAG_NOLOCK); - t->base = event_base_new_with_config(ev_config); - event_config_free(ev_config); - if (! t->base) { - fprintf(stderr, "Can't allocate event base\n"); - exit(1); + if (base == NULL) { + struct event_config *ev_config; + ev_config = event_config_new(); + event_config_set_flag(ev_config, EVENT_BASE_FLAG_NOLOCK); + t->base = event_base_new_with_config(ev_config); + event_config_free(ev_config); + if (! t->base) { + fprintf(stderr, "Can't allocate event base\n"); + exit(1); + } + } else { + // reusing an event base from a worker thread. + t->base = base; } // listen for notifications. 
@@ -1653,11 +1732,6 @@ EV_READ | EV_PERSIST, proxy_event_beconn, t); #endif - evtimer_set(&t->clock_event, proxy_event_updater, t); - event_base_set(t->base, &t->clock_event); - struct timeval rate = {.tv_sec = 3, .tv_usec = 0}; - evtimer_add(&t->clock_event, &rate); - event_base_set(t->base, &t->notify_event); if (event_add(&t->notify_event, 0) == -1) { fprintf(stderr, "Can't monitor libevent notify pipe\n"); diff -Nru memcached-1.6.18/sizes.c memcached-1.6.19/sizes.c --- memcached-1.6.18/sizes.c 2022-02-21 18:58:33.000000000 +0000 +++ memcached-1.6.19/sizes.c 2023-03-08 21:34:27.000000000 +0000 @@ -11,7 +11,7 @@ display("Slab Stats", sizeof(struct slab_stats)); display("Thread stats", sizeof(struct thread_stats) - - (200 * sizeof(struct slab_stats))); + - (MAX_NUMBER_OF_SLAB_CLASSES * sizeof(struct slab_stats))); display("Global stats", sizeof(struct stats)); display("Settings", sizeof(struct settings)); display("Item (no cas)", sizeof(item)); diff -Nru memcached-1.6.18/storage.c memcached-1.6.19/storage.c --- memcached-1.6.18/storage.c 2023-01-11 06:10:10.000000000 +0000 +++ memcached-1.6.19/storage.c 2023-03-08 21:34:27.000000000 +0000 @@ -19,6 +19,8 @@ /* * API functions */ +static void storage_finalize_cb(io_pending_t *pending); +static void storage_return_cb(io_pending_t *pending); // re-cast an io_pending_t into this more descriptive structure. // the first few items _must_ match the original struct. @@ -26,7 +28,10 @@ int io_queue_type; LIBEVENT_THREAD *thread; conn *c; - mc_resp *resp; /* original struct ends here */ + mc_resp *resp; + io_queue_cb return_cb; // called on worker thread. + io_queue_cb finalize_cb; // called back on the worker thread. + /* original struct ends here */ item *hdr_it; /* original header item. */ obj_io io_ctx; /* embedded extstore IO header */ unsigned int iovec_data; /* specific index of data iovec */ @@ -119,12 +124,10 @@ } - -// FIXME: This runs in the IO thread. 
to get better IO performance this should -// simply mark the io wrapper with the return value and decrement wrapleft, if -// zero redispatching. Still a bit of work being done in the side thread but -// minimized at least. -// TODO: wrap -> p? +// This callback runs in the IO thread. +// TODO: Some or all of this should move to the +// io_pending's callback back in the worker thread. +// It might make sense to keep the crc32c check here though. static void _storage_get_item_cb(void *e, obj_io *io, int ret) { // FIXME: assumes success io_pending_storage_t *p = (io_pending_storage_t *)io->data; @@ -227,13 +230,7 @@ p->active = false; //assert(c->io_wrapleft >= 0); - // All IO's have returned, lets re-attach this connection to our original - // thread. - io_queue_t *q = conn_io_queue_get(p->c, p->io_queue_type); - q->count--; - if (q->count == 0) { - redispatch_conn(c); - } + return_io_pending((io_pending_t *)p); } int storage_get_item(conn *c, item *it, mc_resp *resp) { @@ -271,6 +268,9 @@ p->miss = false; p->badcrc = false; p->noreply = c->noreply; + p->thread = c->thread; + p->return_cb = storage_return_cb; + p->finalize_cb = storage_finalize_cb; // io_pending owns the reference for this object now. p->hdr_it = it; p->resp = resp; @@ -371,8 +371,12 @@ void storage_submit_cb(io_queue_t *q) { // Don't need to do anything special for extstore. extstore_submit(q->ctx, q->stack_ctx); + + // need to reset the stack for next use. + q->stack_ctx = NULL; } +// Runs locally in worker thread. static void recache_or_free(io_pending_t *pending) { // re-cast to our specific struct. io_pending_storage_t *p = (io_pending_storage_t *)pending; @@ -446,18 +450,17 @@ item_remove(p->hdr_it); } -// Called after the IO is processed but before the response is transmitted. -// TODO: stubbed with a reminder: should be able to move most of the extstore -// callback code into this code instead, executing on worker thread instead of -// IO thread. 
-void storage_complete_cb(io_queue_t *q) { - // need to reset the stack for next use. - q->stack_ctx = NULL; - return; +// Called after an IO has been returned to the worker thread. +static void storage_return_cb(io_pending_t *pending) { + io_queue_t *q = conn_io_queue_get(pending->c, pending->io_queue_type); + q->count--; + if (q->count == 0) { + conn_worker_readd(pending->c); + } } // Called after responses have been transmitted. Need to free up related data. -void storage_finalize_cb(io_pending_t *pending) { +static void storage_finalize_cb(io_pending_t *pending) { recache_or_free(pending); io_pending_storage_t *p = (io_pending_storage_t *)pending; obj_io *io = &p->io_ctx; diff -Nru memcached-1.6.18/storage.h memcached-1.6.19/storage.h --- memcached-1.6.18/storage.h 2023-01-11 05:58:39.000000000 +0000 +++ memcached-1.6.19/storage.h 2023-03-08 21:34:27.000000000 +0000 @@ -17,10 +17,8 @@ bool storage_validate_item(void *e, item *it); int storage_get_item(conn *c, item *it, mc_resp *resp); -// callbacks for the IO queue subsystem. +// callback for the IO queue subsystem. void storage_submit_cb(io_queue_t *q); -void storage_complete_cb(io_queue_t *q); -void storage_finalize_cb(io_pending_t *pending); // Thread functions. int start_storage_write_thread(void *arg); diff -Nru memcached-1.6.18/t/lib/MemcachedTest.pm memcached-1.6.19/t/lib/MemcachedTest.pm --- memcached-1.6.18/t/lib/MemcachedTest.pm 2022-11-25 00:28:47.000000000 +0000 +++ memcached-1.6.19/t/lib/MemcachedTest.pm 2023-03-08 21:34:27.000000000 +0000 @@ -427,6 +427,11 @@ kill 'SIGUSR1', $self->{pid}; } +sub reload { + my $self = shift; + kill 'SIGHUP', $self->{pid}; +} + # -1 if the pid is actually dead. 
sub is_running { my $self = shift; diff -Nru memcached-1.6.18/t/proxyconfig.lua memcached-1.6.19/t/proxyconfig.lua --- memcached-1.6.18/t/proxyconfig.lua 1970-01-01 00:00:00.000000000 +0000 +++ memcached-1.6.19/t/proxyconfig.lua 2023-03-08 21:34:27.000000000 +0000 @@ -0,0 +1,59 @@ +-- get some information about the test being run from an external file +-- so we can modify ourselves. +local mode = dofile("/tmp/proxyconfigmode.lua") + +mcp.backend_read_timeout(4) +mcp.backend_connect_timeout(5) + +function mcp_config_pools(old) + if mode == "none" then + return {} + elseif mode == "start" then + local b1 = mcp.backend('b1', '127.0.0.1', 11511) + local b2 = mcp.backend('b2', '127.0.0.1', 11512) + local b3 = mcp.backend('b3', '127.0.0.1', 11513) + + local pools = { + test = mcp.pool({b1, b2, b3}) + } + return pools + elseif mode == "betable" then + local b1 = mcp.backend({ label = "b1", host = "127.0.0.1", port = 11511, + connecttimeout = 2, retrytimeout = 5, readtimeout = 0.1, + failurelimit = 0 }) + local b2 = mcp.backend({ label = "b2", host = "127.0.0.1", port = 11512, + connecttimeout = 2, retrytimeout = 5, readtimeout = 5 }) + local b3 = mcp.backend({ label = "b3", host = "127.0.0.1", port = 11513, + connecttimeout = 5, retrytimeout = 5, readtimeout = 5 }) + + local pools = { + test = mcp.pool({b1, b2, b3}) + } + return pools + elseif mode == "noiothread" then + local b1 = mcp.backend('b1', '127.0.0.1', 11514) + local b2 = mcp.backend('b2', '127.0.0.1', 11515) + local b3 = mcp.backend('b3', '127.0.0.1', 11516) + + local pools = { + test = mcp.pool({b1, b2, b3}, { iothread = false }) + } + return pools + end +end + +-- At least to start we don't need to test every command, but we should do +-- some tests against the two broad types of commands (gets vs sets with +-- payloads) +function mcp_config_routes(zones) + if mode == "none" then + mcp.attach(mcp.CMD_MG, function(r) return "SERVER_ERROR no mg route\r\n" end) + mcp.attach(mcp.CMD_MS, function(r) return 
"SERVER_ERROR no ms route\r\n" end) + elseif mode == "start" or mode == "betable" then + mcp.attach(mcp.CMD_MG, function(r) return zones["test"](r) end) + mcp.attach(mcp.CMD_MS, function(r) return zones["test"](r) end) + elseif mode == "noiothread" then + mcp.attach(mcp.CMD_MG, function(r) return zones["test"](r) end) + mcp.attach(mcp.CMD_MS, function(r) return zones["test"](r) end) + end +end diff -Nru memcached-1.6.18/t/proxyconfig.t memcached-1.6.19/t/proxyconfig.t --- memcached-1.6.18/t/proxyconfig.t 1970-01-01 00:00:00.000000000 +0000 +++ memcached-1.6.19/t/proxyconfig.t 2023-03-08 21:34:27.000000000 +0000 @@ -0,0 +1,286 @@ +#!/usr/bin/env perl + +# NOTE: These tests cover the act of reloading the configuration; changing +# backends, pools, routes, etc. It doesn't cover ensuring the code of the main +# file changes naturally, which is fine: there isn't any real way that can +# fail and it can be covered specifically in a different test file. + +use strict; +use warnings; +use Test::More; +use FindBin qw($Bin); +use lib "$Bin/lib"; +use Carp qw(croak); +use MemcachedTest; +use IO::Select; +use IO::Socket qw(AF_INET SOCK_STREAM); + +# TODO: possibly... set env var to a generated temp filename before starting +# the server so we can pass that in? +my $modefile = "/tmp/proxyconfigmode.lua"; + +if (!supports_proxy()) { + plan skip_all => 'proxy not enabled'; + exit 0; +} + +# Set up some server sockets. +sub mock_server { + my $port = shift; + my $srv = IO::Socket->new( + Domain => AF_INET, + Type => SOCK_STREAM, + Proto => 'tcp', + LocalHost => '127.0.0.1', + LocalPort => $port, + ReusePort => 1, + Listen => 5) || die "IO::Socket: $@"; + return $srv; +} + +# Put a version command down the pipe to ensure the socket is clear. 
+# client version commands skip the proxy code +sub check_version { + my $ps = shift; + print $ps "version\r\n"; + like(<$ps>, qr/VERSION /, "version received"); +} + +sub write_modefile { + my $cmd = shift; + open(my $fh, "> $modefile") or die "Couldn't overwrite $modefile: $!"; + print $fh $cmd; + close($fh); +} + +sub wait_reload { + my $w = shift; + like(<$w>, qr/ts=(\S+) gid=\d+ type=proxy_conf status=start/, "reload started"); + like(<$w>, qr/ts=(\S+) gid=\d+ type=proxy_conf status=done/, "reload completed"); +} + +my @mocksrvs = (); +diag "making mock servers"; +for my $port (11511, 11512, 11513) { + my $srv = mock_server($port); + ok(defined $srv, "mock server created"); + push(@mocksrvs, $srv); +} + +diag "testing failure to start"; +write_modefile("invalid syntax"); +eval { + my $p_srv = new_memcached('-o proxy_config=./t/proxyconfig.lua -l 127.0.0.1', 11510); +}; +ok($@ && $@ =~ m/Failed to connect/, "server successfully not started"); + +write_modefile('return "none"'); +my $p_srv = new_memcached('-o proxy_config=./t/proxyconfig.lua -l 127.0.0.1', 11510); +my $ps = $p_srv->sock; +$ps->autoflush(1); + +# Create a watcher so we can monitor when reloads complete. +my $watcher = $p_srv->new_sock; +print $watcher "watch proxyevents\n"; +is(<$watcher>, "OK\r\n", "watcher enabled"); + +{ + # test with stubbed main routes. + print $ps "mg foo v\r\n"; + is(scalar <$ps>, "SERVER_ERROR no mg route\r\n", "no mg route loaded"); +} + +# Load some backends +{ + write_modefile('return "start"'); + + $p_srv->reload(); + wait_reload($watcher); +} + +my @mbe = (); +# A map of where keys route to for worker IO tests later +my %keymap = (); +my $keycount = 100; +{ + # set up server backend sockets. 
+ for my $msrv ($mocksrvs[0], $mocksrvs[1], $mocksrvs[2]) { + my $be = $msrv->accept(); + $be->autoflush(1); + ok(defined $be, "mock backend created"); + push(@mbe, $be); + } + + my $s = IO::Select->new(); + + for my $be (@mbe) { + $s->add($be); + like(<$be>, qr/version/, "received version command"); + print $be "VERSION 1.0.0-mock\r\n"; + } + + # Try sending something. + my $cmd = "mg foo v\r\n"; + print $ps $cmd; + my @readable = $s->can_read(0.25); + is(scalar @readable, 1, "only one backend became readable"); + my $be = shift @readable; + is(scalar <$be>, $cmd, "metaget passthrough"); + print $be "EN\r\n"; + is(scalar <$ps>, "EN\r\n", "miss received"); + + # Route a bunch of keys and map them to backends. + for my $key (0 .. $keycount) { + print $ps "mg /test/$key\r\n"; + my @readable = $s->can_read(0.25); + is(scalar @readable, 1, "only one backend became readable"); + my $be = shift @readable; + for (0 .. 2) { + if ($be == $mbe[$_]) { + $keymap{$key} = $_; + } + } + is(scalar <$be>, "mg /test/$key\r\n", "got mg passthrough"); + print $be "EN\r\n"; + is(scalar <$ps>, "EN\r\n", "miss received"); + } +} + +# Test backend table arguments and per-backend time overrides +my @holdbe = (); # avoid having the backends immediately disconnect and pollute log lines. +{ + # This should create three new backend sockets + write_modefile('return "betable"'); + $p_srv->reload(); + wait_reload($watcher); + + # sleep a short time; b1 should have a very short timeout and the + # others are long. + select(undef, undef, undef, 0.5); + + my $s = IO::Select->new(); + for my $msrv (@mocksrvs) { + $s->add($msrv); + } + my @readable = $s->can_read(0.25); + # All three backends should have changed despite having the same label, + # host, and port arguments. 
+ is(scalar @readable, 3, "all listeners became readable"); + + like(<$watcher>, qr/ts=(\S+) gid=\d+ type=proxy_backend error=timeout name=\S+ port=11511/, "one backend timed out connecting"); + + for my $msrv (@readable) { + my $be = $msrv->accept(); + ok(defined $be, "mock backend accepted"); + like(<$be>, qr/version/, "received version command"); + print $be "VERSION 1.0.0-mock\r\n"; + push(@holdbe, $be); + } + + # reload again and ensure no sockets become readable + $p_srv->reload(); + wait_reload($watcher); + @readable = $s->can_read(0.5); + is(scalar @readable, 0, "no new sockets"); +} + +# Disconnect the existing sockets +@mbe = (); +@holdbe = (); +@mocksrvs = (); +$watcher = $p_srv->new_sock; +# Reset the watcher and let logs die off. +sleep 1; +print $watcher "watch proxyevents\n"; +is(<$watcher>, "OK\r\n", "watcher enabled"); + +{ + # re-create the mock servers so we get clean connects, the previous + # backends could be reconnecting still. + for my $port (11514, 11515, 11516) { + my $srv = mock_server($port); + ok(defined $srv, "mock server created"); + push(@mocksrvs, $srv); + } + + write_modefile('return "noiothread"'); + $p_srv->reload(); + wait_reload($watcher); + + my $s = IO::Select->new(); + for my $msrv (@mocksrvs) { + $s->add($msrv); + } + my @readable = $s->can_read(0.25); + # All three backends should become readable with new sockets. + is(scalar @readable, 3, "all listeners became readable"); + + my @bepile = (); + my $bes = IO::Select->new(); # selector just for the backend sockets. + # Each backend should create one socket per worker thread. + for my $msrv (@readable) { + my @temp = (); + for (0 .. 3) { + my $be = $msrv->accept(); + ok(defined $be, "mock backend accepted"); + like(<$be>, qr/version/, "received version command"); + print $be "VERSION 1.0.0-mock\r\n"; + $bes->add($be); + push(@temp, $be); + } + for (0 .. 
2) { + if ($mocksrvs[$_] == $msrv) { + $bepile[$_] = \@temp; + } + } + } + + # clients round robin onto different worker threads, so we can test the + # key dist on different offsets. + my @cli = (); + for (0 .. 2) { + my $p = $p_srv->new_sock; + + for my $key (0 .. $keycount) { + print $p "mg /test/$key\r\n"; + @readable = $bes->can_read(0.25); + is(scalar @readable, 1, "only one backend became readable"); + my $be = shift @readable; + # find which listener this be belongs to + for my $x (0 .. 2) { + for (@{$bepile[$x]}) { + if ($_ == $be) { + cmp_ok($x, '==', $keymap{$key}, "key routed to correct listener: " . $keymap{$key}); + } + } + } + + is(scalar <$be>, "mg /test/$key\r\n", "got mg passthrough"); + print $be "EN\r\n"; + is(scalar <$p>, "EN\r\n", "miss received"); + } + + # hold onto the sockets just in case. + push(@cli, $p); + } + +} + +# TODO: +# remove backends +# do dead sockets close? +# adding user stats +# changing user stats +# adding backends with the same label don't create more connections +# total backend counters +# change top level routes mid-request +# - send the request to backend +# - issue and wait for reload +# - read from backend and respond, should use the original code still. +# - could also read from backend and then do reload/etc. + +done_testing(); + +END { + unlink $modefile; +} diff -Nru memcached-1.6.18/t/proxyinternal.lua memcached-1.6.19/t/proxyinternal.lua --- memcached-1.6.18/t/proxyinternal.lua 1970-01-01 00:00:00.000000000 +0000 +++ memcached-1.6.19/t/proxyinternal.lua 2023-03-08 21:34:27.000000000 +0000 @@ -0,0 +1,111 @@ +function mcp_config_pools(oldss) + mcp.backend_read_timeout(0.5) + mcp.backend_connect_timeout(5) + + local srv = mcp.backend + + -- Single backend for zones to ease testing. 
+ -- For purposes of this config the proxy is always "zone 1" (z1) + local b1 = srv('b1', '127.0.0.1', 11611) + local b2 = srv('b2', '127.0.0.1', 11612) + local b3 = srv('b3', '127.0.0.1', 11613) + + local b1z = {b1} + local b2z = {b2} + local b3z = {b3} + + -- convert the backends to pools. + -- as per a normal full config see simple.lua or t/startfile.lua + local zones = { + z1 = mcp.pool(b1z), + z2 = mcp.pool(b2z), + z3 = mcp.pool(b3z), + } + + return zones +end + +-- WORKER CODE: + +-- Using a very simple route handler only to allow testing the three +-- workarounds in the same configuration file. +function prefix_factory(pattern, list, default) + local p = pattern + local l = list + local d = default + return function(r) + local route = l[string.match(r:key(), p)] + if route == nil then + return d(r) + end + return route(r) + end +end + +-- just for golfing the code in mcp_config_routes() +function toproute_factory(pfx, label) + local err = "SERVER_ERROR no " .. label .. " route\r\n" + return prefix_factory("^/(%a+)/", pfx, function(r) return err end) +end + +-- Do specialized testing based on the key prefix. 
+function mcp_config_routes(zones) + local pfx_get = {} + local pfx_set = {} + local pfx_touch = {} + local pfx_gets = {} + local pfx_gat = {} + local pfx_gats = {} + local pfx_cas = {} + local pfx_add = {} + local pfx_delete = {} + local pfx_incr = {} + local pfx_decr = {} + local pfx_append = {} + local pfx_prepend = {} + local pfx_mg = {} + local pfx_ms = {} + local pfx_md = {} + local pfx_ma = {} + + local basic = function(r) + return mcp.internal(r) + end + + pfx_get["b"] = basic + pfx_set["b"] = basic + pfx_touch["b"] = basic + pfx_gets["b"] = basic + pfx_gat["b"] = basic + pfx_gats["b"] = basic + pfx_cas["b"] = basic + pfx_add["b"] = basic + pfx_delete["b"] = basic + pfx_incr["b"] = basic + pfx_decr["b"] = basic + pfx_append["b"] = basic + pfx_prepend["b"] = basic + pfx_mg["b"] = basic + pfx_ms["b"] = basic + pfx_md["b"] = basic + pfx_ma["b"] = basic + + mcp.attach(mcp.CMD_GET, toproute_factory(pfx_get, "get")) + mcp.attach(mcp.CMD_SET, toproute_factory(pfx_set, "set")) + mcp.attach(mcp.CMD_TOUCH, toproute_factory(pfx_touch, "touch")) + mcp.attach(mcp.CMD_GETS, toproute_factory(pfx_gets, "gets")) + mcp.attach(mcp.CMD_GAT, toproute_factory(pfx_gat, "gat")) + mcp.attach(mcp.CMD_GATS, toproute_factory(pfx_gats, "gats")) + mcp.attach(mcp.CMD_CAS, toproute_factory(pfx_cas, "cas")) + mcp.attach(mcp.CMD_ADD, toproute_factory(pfx_add, "add")) + mcp.attach(mcp.CMD_DELETE, toproute_factory(pfx_delete, "delete")) + mcp.attach(mcp.CMD_INCR, toproute_factory(pfx_incr, "incr")) + mcp.attach(mcp.CMD_DECR, toproute_factory(pfx_decr, "decr")) + mcp.attach(mcp.CMD_APPEND, toproute_factory(pfx_append, "append")) + mcp.attach(mcp.CMD_PREPEND, toproute_factory(pfx_prepend, "prepend")) + mcp.attach(mcp.CMD_MG, toproute_factory(pfx_mg, "mg")) + mcp.attach(mcp.CMD_MS, toproute_factory(pfx_ms, "ms")) + mcp.attach(mcp.CMD_MD, toproute_factory(pfx_md, "md")) + mcp.attach(mcp.CMD_MA, toproute_factory(pfx_ma, "ma")) + +end diff -Nru memcached-1.6.18/t/proxyinternal.t 
memcached-1.6.19/t/proxyinternal.t --- memcached-1.6.18/t/proxyinternal.t 1970-01-01 00:00:00.000000000 +0000 +++ memcached-1.6.19/t/proxyinternal.t 2023-03-08 21:34:27.000000000 +0000 @@ -0,0 +1,128 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use Test::More; +use FindBin qw($Bin); +use lib "$Bin/lib"; +use Carp qw(croak); +use MemcachedTest; +use IO::Socket qw(AF_INET SOCK_STREAM); +use IO::Select; + +if (!supports_proxy()) { + plan skip_all => 'proxy not enabled'; + exit 0; +} + +# Don't want to write two distinct set of tests, and extstore is a default. +if (!supports_extstore()) { + plan skip_all => 'extstore not enabled'; + exit 0; +} + +my $ext_path = "/tmp/proxyinternal.$$"; + +# Set up some server sockets. +sub mock_server { + my $port = shift; + my $srv = IO::Socket->new( + Domain => AF_INET, + Type => SOCK_STREAM, + Proto => 'tcp', + LocalHost => '127.0.0.1', + LocalPort => $port, + ReusePort => 1, + Listen => 5) || die "IO::Socket: $@"; + return $srv; +} + +# Put a version command down the pipe to ensure the socket is clear. +# client version commands skip the proxy code +sub check_version { + my $ps = shift; + print $ps "version\r\n"; + like(<$ps>, qr/VERSION /, "version received"); +} + +my @mocksrvs = (); +#diag "making mock servers"; +for my $port (11611, 11612, 11613) { + my $srv = mock_server($port); + ok(defined $srv, "mock server created"); + push(@mocksrvs, $srv); +} + +my $p_srv = new_memcached("-o proxy_config=./t/proxyinternal.lua,ext_item_size=500,ext_item_age=1,ext_path=$ext_path:64m,ext_max_sleep=100000 -l 127.0.0.1 -U 0", 11510); +my $ps = $p_srv->sock; +$ps->autoflush(1); + +# set up server backend sockets. +# uncomment when needed. currently they get thrown out so this can hang. 
+#my @mbe = (); +#diag "accepting mock backends"; +#for my $msrv (@mocksrvs) { +# my $be = $msrv->accept(); +# $be->autoflush(1); +# ok(defined $be, "mock backend created"); +# push(@mbe, $be); +#} + +#diag "validating backends"; +#for my $be (@mbe) { +# like(<$be>, qr/version/, "received version command"); +# print $be "VERSION 1.0.0-mock\r\n"; +#} + +#diag "object too large" +{ + my $data = 'x' x 2000000; + print $ps "set /b/toolarge 0 0 2000000\r\n$data\r\n"; + is(scalar <$ps>, "SERVER_ERROR object too large for cache\r\n", "set too large"); + + print $ps "ms /b/toolarge 2000000 T30\r\n$data\r\n"; + is(scalar <$ps>, "SERVER_ERROR object too large for cache\r\n", "ms too large"); +} + +#diag "basic tests" +{ + print $ps "set /b/foo 0 0 2\r\nhi\r\n"; + is(scalar <$ps>, "STORED\r\n", "int set"); + print $ps "get /b/foo\r\n"; + is(scalar <$ps>, "VALUE /b/foo 0 2\r\n", "get response"); + is(scalar <$ps>, "hi\r\n", "get value"); + is(scalar <$ps>, "END\r\n", "get END"); +} + +#diag "fetch from extstore" +{ + my $data = 'x' x 1000; + print $ps "set /b/ext 0 0 1000\r\n$data\r\n"; + is(scalar <$ps>, "STORED\r\n", "int set for extstore"); + sleep 3; # TODO: import wait_for_ext + + print $ps "get /b/ext\r\n"; + is(scalar <$ps>, "VALUE /b/ext 0 1000\r\n", "get response from extstore"); + is(scalar <$ps>, "$data\r\n", "got data from extstore"); + is(scalar <$ps>, "END\r\n", "get END"); +} + +#diag "flood memory" +{ + # ensure we don't have a basic reference counter leak + my $data = 'x' x 500000; + for (1 .. 200) { + print $ps "set /b/$_ 0 0 500000\r\n$data\r\n"; + is(scalar <$ps>, "STORED\r\n", "flood set"); + } + for (1 .. 
200) { + print $ps "ms /b/$_ 500000 T30\r\n$data\r\n"; + is(scalar <$ps>, "HD\r\n", "flood ms"); + } +} + +done_testing(); + +END { + unlink $ext_path if $ext_path; +} diff -Nru memcached-1.6.18/t/proxyunits.lua memcached-1.6.19/t/proxyunits.lua --- memcached-1.6.18/t/proxyunits.lua 1970-01-01 00:00:00.000000000 +0000 +++ memcached-1.6.19/t/proxyunits.lua 2023-03-08 21:34:27.000000000 +0000 @@ -0,0 +1,281 @@ +mcp.backend_read_timeout(0.5) +mcp.backend_connect_timeout(5) + +function mcp_config_pools(oldss) + local srv = mcp.backend + + -- Single backend for zones to ease testing. + -- For purposes of this config the proxy is always "zone 1" (z1) + local b1 = srv('b1', '127.0.0.1', 11411) + local b2 = srv('b2', '127.0.0.1', 11412) + local b3 = srv('b3', '127.0.0.1', 11413) + + local b1z = {b1} + local b2z = {b2} + local b3z = {b3} + + -- convert the backends to pools. + -- as per a normal full config see simple.lua or t/startfile.lua + local zones = { + z1 = mcp.pool(b1z), + z2 = mcp.pool(b2z), + z3 = mcp.pool(b3z), + } + + return zones +end + +-- WORKER CODE: + +-- Using a very simple route handler only to allow testing the three +-- workarounds in the same configuration file. +function prefix_factory(pattern, list, default) + local p = pattern + local l = list + local d = default + return function(r) + local route = l[string.match(r:key(), p)] + if route == nil then + return d(r) + end + return route(r) + end +end + +-- just for golfing the code in mcp_config_routes() +function toproute_factory(pfx, label) + local err = "SERVER_ERROR no " .. label .. " route\r\n" + return prefix_factory("^/(%a+)/", pfx, function(r) return err end) +end + +-- Do specialized testing based on the key prefix. 
+function mcp_config_routes(zones) + local pfx_get = {} + local pfx_set = {} + local pfx_touch = {} + local pfx_gets = {} + local pfx_gat = {} + local pfx_gats = {} + local pfx_cas = {} + local pfx_add = {} + local pfx_delete = {} + local pfx_incr = {} + local pfx_decr = {} + local pfx_append = {} + local pfx_prepend = {} + local pfx_mg = {} + local pfx_ms = {} + local pfx_md = {} + local pfx_ma = {} + + local basic = function(r) + return zones.z1(r) + end + + pfx_get["b"] = basic + pfx_set["b"] = basic + pfx_touch["b"] = basic + pfx_gets["b"] = basic + pfx_gat["b"] = basic + pfx_gats["b"] = basic + pfx_cas["b"] = basic + pfx_add["b"] = basic + pfx_delete["b"] = basic + pfx_incr["b"] = basic + pfx_decr["b"] = basic + pfx_append["b"] = basic + pfx_prepend["b"] = basic + pfx_mg["b"] = basic + pfx_ms["b"] = basic + pfx_md["b"] = basic + pfx_ma["b"] = basic + + -- show that we fetched the key by generating our own response string. + pfx_get["getkey"] = function(r) + return "VALUE |" .. r:key() .. " 0 2\r\nts\r\nEND\r\n" + end + + pfx_get["rtrimkey"] = function(r) + r:rtrimkey(4) + return zones.z1(r) + end + + pfx_get["ltrimkey"] = function(r) + r:ltrimkey(10) + return zones.z1(r) + end + + -- Basic test for routing requests to specific pools. + -- Not sure how this could possibly break but testing for completeness. 
+ pfx_get["zonetest"] = function(r) + local key = r:key() + if key == "/zonetest/a" then + return zones.z1(r) + elseif key == "/zonetest/b" then + return zones.z2(r) + elseif key == "/zonetest/c" then + return zones.z3(r) + else + return "END\r\n" + end + end + + pfx_get["logtest"] = function(r) + mcp.log("testing manual log messages") + return "END\r\n" + end + + pfx_get["logreqtest"] = function(r) + local res = zones.z1(r) + mcp.log_req(r, res, "logreqtest") + return res + end + + -- tell caller what we got back via a fake response + pfx_get["awaitbasic"] = function(r) + local vals = {} + local rtable = mcp.await(r, { zones.z1, zones.z2, zones.z3 }) + + for i, res in pairs(rtable) do + if res:hit() == true then + vals[i] = "hit" + elseif res:ok() == true then + vals[i] = "ok" + else + vals[i] = "err" + end + end + + local val = table.concat(vals, " ") + local vlen = string.len(val) + -- convenience functions for creating responses would be nice :) + return "VALUE " .. r:key() .. " 0 " .. vlen .. "\r\n" .. val .. "\r\nEND\r\n" + end + + pfx_get["awaitone"] = function(r) + local mode = string.sub(r:key(), -1, -1) + local num = 0 + if mode == "a" then + num = 1 + elseif mode == "b" then + num = 2 + end + local rtable = mcp.await(r, { zones.z1, zones.z2, zones.z3 }, num) + + local count = 0 + for i, res in pairs(rtable) do + count = count + 1 + end + + local vlen = string.len(count) + return "VALUE " .. r:key() .. " 0 " .. vlen .. "\r\n" .. count .. "\r\nEND\r\n" + end + + -- should be the same as awaitone + pfx_get["awaitgood"] = function(r) + local mode = string.sub(r:key(), -1, -1) + local num = 0 + if mode == "a" then + num = 1 + elseif mode == "b" then + num = 2 + end + local rtable = mcp.await(r, { zones.z1, zones.z2, zones.z3 }, num, mcp.AWAIT_GOOD) + + local count = 0 + for i, res in pairs(rtable) do + count = count + 1 + end + + local vlen = string.len(count) + return "VALUE " .. r:key() .. " 0 " .. vlen .. "\r\n" .. count .. 
"\r\nEND\r\n" + end + + -- not sure if anything else should be checked here? if err or not? + pfx_get["awaitany"] = function(r) + local rtable = mcp.await(r, { zones.z1, zones.z2, zones.z3 }, 2, mcp.AWAIT_ANY) + local count = 0 + for i, res in pairs(rtable) do + count = count + 1 + end + + local vlen = string.len(count) + return "VALUE " .. r:key() .. " 0 " .. vlen .. "\r\n" .. count .. "\r\nEND\r\n" + end + + pfx_get["awaitbg"] = function(r) + local rtable = mcp.await(r, { zones.z1, zones.z2, zones.z3 }, 1, mcp.AWAIT_BACKGROUND) + local count = 0 + for i, res in pairs(rtable) do + count = count + 1 + end + + local vlen = string.len(count) + return "VALUE " .. r:key() .. " 0 " .. vlen .. "\r\n" .. count .. "\r\nEND\r\n" + end + + pfx_set["awaitlogerr"] = function(r) + local rtable = mcp.await_logerrors(r, { zones.z1, zones.z2, zones.z3 }, 1, mcp.AWAIT_FASTGOOD, "write_failed") + return rtable[1] + end + + -- testing different styles of building the table argument for mcp.await() + pfx_get["awaitfastgood"] = function(r) + local all_zones = {} + for k, v in pairs(zones) do + all_zones[k] = v + end + + local restable = mcp.await(r, all_zones, 2, mcp.AWAIT_FASTGOOD) + + local final_res = restable[1] + local count = 0 + for _, res in pairs(restable) do + if res:hit() then + final_res = res + end + count = count + 1 + end + + return final_res + end + + pfx_set["awaitfastgood"] = function(r) + local all_zones = {} + for _, v in pairs(zones) do + table.insert(all_zones, v) + end + + local restable = mcp.await(r, all_zones, 2) + local count = 0 + local good_res = restable[1] + for _, res in pairs(restable) do + if res:ok() then + good_res = res + end + count = count + 1 + end + + print("Set Response count: " .. 
count) + return good_res + end + + mcp.attach(mcp.CMD_GET, toproute_factory(pfx_get, "get")) + mcp.attach(mcp.CMD_SET, toproute_factory(pfx_set, "set")) + mcp.attach(mcp.CMD_TOUCH, toproute_factory(pfx_touch, "touch")) + mcp.attach(mcp.CMD_GETS, toproute_factory(pfx_gets, "gets")) + mcp.attach(mcp.CMD_GAT, toproute_factory(pfx_gat, "gat")) + mcp.attach(mcp.CMD_GATS, toproute_factory(pfx_gats, "gats")) + mcp.attach(mcp.CMD_CAS, toproute_factory(pfx_cas, "cas")) + mcp.attach(mcp.CMD_ADD, toproute_factory(pfx_add, "add")) + mcp.attach(mcp.CMD_DELETE, toproute_factory(pfx_delete, "delete")) + mcp.attach(mcp.CMD_INCR, toproute_factory(pfx_incr, "incr")) + mcp.attach(mcp.CMD_DECR, toproute_factory(pfx_decr, "decr")) + mcp.attach(mcp.CMD_APPEND, toproute_factory(pfx_append, "append")) + mcp.attach(mcp.CMD_PREPEND, toproute_factory(pfx_prepend, "prepend")) + mcp.attach(mcp.CMD_MG, toproute_factory(pfx_mg, "mg")) + mcp.attach(mcp.CMD_MS, toproute_factory(pfx_ms, "ms")) + mcp.attach(mcp.CMD_MD, toproute_factory(pfx_md, "md")) + mcp.attach(mcp.CMD_MA, toproute_factory(pfx_ma, "ma")) + +end diff -Nru memcached-1.6.18/t/proxyunits.t memcached-1.6.19/t/proxyunits.t --- memcached-1.6.18/t/proxyunits.t 1970-01-01 00:00:00.000000000 +0000 +++ memcached-1.6.19/t/proxyunits.t 2023-03-08 21:34:27.000000000 +0000 @@ -0,0 +1,733 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use Test::More; +use FindBin qw($Bin); +use lib "$Bin/lib"; +use Carp qw(croak); +use MemcachedTest; +use IO::Socket qw(AF_INET SOCK_STREAM); +use IO::Select; + +if (!supports_proxy()) { + plan skip_all => 'proxy not enabled'; + exit 0; +} + +# Set up some server sockets. +sub mock_server { + my $port = shift; + my $srv = IO::Socket->new( + Domain => AF_INET, + Type => SOCK_STREAM, + Proto => 'tcp', + LocalHost => '127.0.0.1', + LocalPort => $port, + ReusePort => 1, + Listen => 5) || die "IO::Socket: $@"; + return $srv; +} + +# Put a version command down the pipe to ensure the socket is clear. 
+# client version commands skip the proxy code +sub check_version { + my $ps = shift; + print $ps "version\r\n"; + like(<$ps>, qr/VERSION /, "version received"); +} + +my @mocksrvs = (); +#diag "making mock servers"; +for my $port (11411, 11412, 11413) { + my $srv = mock_server($port); + ok(defined $srv, "mock server created"); + push(@mocksrvs, $srv); +} + +my $p_srv = new_memcached('-o proxy_config=./t/proxyunits.lua -l 127.0.0.1', 11410); +my $ps = $p_srv->sock; +$ps->autoflush(1); + +# set up server backend sockets. +my @mbe = (); +#diag "accepting mock backends"; +for my $msrv (@mocksrvs) { + my $be = $msrv->accept(); + $be->autoflush(1); + ok(defined $be, "mock backend created"); + push(@mbe, $be); +} + +#diag "validating backends"; +for my $be (@mbe) { + like(<$be>, qr/version/, "received version command"); + print $be "VERSION 1.0.0-mock\r\n"; +} + +{ + # Test a fix for passing through partial read data if END ends up missing. + print $ps "get /b/a\r\n"; + my $be = $mbe[0]; + + is(scalar <$be>, "get /b/a\r\n", "get passthrough"); + print $be "VALUE /b/a 0 2\r\nhi\r\nEN"; + + is(scalar <$ps>, "SERVER_ERROR backend failure\r\n", "backend failure error"); + + # re-accept the backend. + $be = $mocksrvs[0]->accept(); + $be->autoflush(1); + like(<$be>, qr/version/, "received version command"); + print $be "VERSION 1.0.0-mock\r\n"; + $mbe[0] = $be; +} + +{ + # Test a log line with detailed data from backend failures. 
+ my $be = $mbe[0]; + my $w = $p_srv->new_sock; + print $w "watch proxyevents\n"; + is(<$w>, "OK\r\n", "watcher enabled"); + + print $ps "get /b/c\r\n"; + is(scalar <$be>, "get /b/c\r\n", "get passthrough"); + # Set off a "trailing data" error + print $be "VALUE /b/c 0 2\r\nok\r\nEND\r\ngarbage"; + + is(scalar <$ps>, "VALUE /b/c 0 2\r\n", "got value back"); + is(scalar <$ps>, "ok\r\n", "got data back"); + is(scalar <$ps>, "END\r\n", "got end string"); + + like(<$w>, qr/ts=(\S+) gid=\d+ type=proxy_backend error=trailingdata name=127.0.0.1 port=\d+ depth=0 rbuf=garbage/, "got backend error log line"); + + # re-accept the backend. + $be = $mocksrvs[0]->accept(); + $be->autoflush(1); + like(<$be>, qr/version/, "received version command"); + print $be "VERSION 1.0.0-mock\r\n"; + $mbe[0] = $be; +} + +SKIP: { + skip "Remove this skip line to demonstrate pre-patch bug", 1; + # Test issue with finding response complete when read lands between value + # size and value + response line in size. + my $be = $mbe[0]; + my $w = $p_srv->new_sock; + print $w "watch proxyevents\n"; + is(<$w>, "OK\r\n", "watcher enabled"); + + print $ps "get /b/c\r\n"; + is(scalar <$be>, "get /b/c\r\n", "get passthrough"); + + # Set off a "missingend" error. + # The server will wake up several times, thinking it has read the + # full size of response but it only read enough for the value portion. + print $be "VALUE /b/c 0 5\r\nhe"; + sleep 0.1; + print $be "llo"; + sleep 0.1; + print $be "\r\nEND\r\n"; + + is(scalar <$ps>, "SERVER_ERROR backend failure\r\n"); + + like(<$w>, qr/ts=(\S+) gid=\d+ type=proxy_backend error=missingend name=127.0.0.1 port=\d+ depth=1 rbuf=/, "got missingend error log line"); + + # re-accept the backend. 
+ $be = $mocksrvs[0]->accept(); + $be->autoflush(1); + like(<$be>, qr/version/, "received version command"); + print $be "VERSION 1.0.0-mock\r\n"; + $mbe[0] = $be; +} + +{ + # Test issue with finding response complete when read lands between value + # size and value + response line in size. + my $be = $mbe[0]; + + print $ps "get /b/c\r\n"; + is(scalar <$be>, "get /b/c\r\n", "get passthrough"); + + # Set off a "missingend" error. + # The server will wake up several times, thinking it has read the + # full size of response but it only read enough for the value portion. + print $be "VALUE /b/c 0 5\r\nhe"; + sleep 0.1; + print $be "llo"; + sleep 0.1; + print $be "\r\nEND\r\n"; + + is(scalar <$ps>, "VALUE /b/c 0 5\r\n", "got value back"); + is(scalar <$ps>, "hello\r\n", "got data back"); + is(scalar <$ps>, "END\r\n", "got end string"); +} + +#diag "ready for main tests"; +# Target a single backend, validating basic syntax. +# Should test all command types. +# uses /b/ path for "basic" +{ + # Test invalid route. + print $ps "set /invalid/a 0 0 2\r\nhi\r\n"; + is(scalar <$ps>, "SERVER_ERROR no set route\r\n"); + + # Testing against just one backend. Results should make sense despite our + # invalid request above. + my $be = $mbe[0]; + my $cmd; + + # TODO: add more tests for the varying response codes. + + # Basic set. 
+ $cmd = "set /b/a 0 0 2"; + print $ps "$cmd\r\nhi\r\n"; + is(scalar <$be>, "$cmd\r\n", "set passthrough"); + is(scalar <$be>, "hi\r\n", "set value"); + print $be "STORED\r\n"; + + is(scalar <$ps>, "STORED\r\n", "got STORED from set"); + + # Basic get + $cmd = "get /b/a\r\n"; + print $ps $cmd; + is(scalar <$be>, $cmd, "get passthrough"); + print $be "VALUE /b/a 0 2\r\nhi\r\nEND\r\n"; + + is(scalar <$ps>, "VALUE /b/a 0 2\r\n", "get rline"); + is(scalar <$ps>, "hi\r\n", "get data"); + is(scalar <$ps>, "END\r\n", "get end"); + + # touch + $cmd = "touch /b/a 50\r\n"; + print $ps $cmd; + is(scalar <$be>, $cmd, "touch passthrough"); + print $be "TOUCHED\r\n"; + + is(scalar <$ps>, "TOUCHED\r\n", "got touch response"); + + # gets + $cmd = "gets /b/a\r\n"; + print $ps $cmd; + is(scalar <$be>, $cmd, "gets passthrough"); + print $be "VALUE /b/a 0 2 2\r\nhi\r\nEND\r\n"; + + is(scalar <$ps>, "VALUE /b/a 0 2 2\r\n", "gets rline"); + is(scalar <$ps>, "hi\r\n", "gets data"); + is(scalar <$ps>, "END\r\n", "gets end"); + + # gat + $cmd = "gat 10 /b/a\r\n"; + print $ps $cmd; + is(scalar <$be>, $cmd, "gat passthrough"); + print $be "VALUE /b/a 0 2\r\nhi\r\nEND\r\n"; + + is(scalar <$ps>, "VALUE /b/a 0 2\r\n", "gat rline"); + is(scalar <$ps>, "hi\r\n", "gat data"); + is(scalar <$ps>, "END\r\n", "gat end"); + + # gats + $cmd = "gats 11 /b/a\r\n"; + print $ps $cmd; + is(scalar <$be>, $cmd, "gats passthrough"); + print $be "VALUE /b/a 0 2 1\r\nhi\r\nEND\r\n"; + + is(scalar <$ps>, "VALUE /b/a 0 2 1\r\n", "gats rline"); + is(scalar <$ps>, "hi\r\n", "gats data"); + is(scalar <$ps>, "END\r\n", "gats end"); + + # cas + $cmd = "cas /b/a 0 0 2 5"; + print $ps "$cmd\r\nhi\r\n"; + is(scalar <$be>, "$cmd\r\n", "cas passthrough"); + is(scalar <$be>, "hi\r\n", "cas value"); + print $be "STORED\r\n"; + + is(scalar <$ps>, "STORED\r\n", "got STORED from cas"); + + # add + $cmd = "add /b/a 0 0 2"; + print $ps "$cmd\r\nhi\r\n"; + is(scalar <$be>, "$cmd\r\n", "add passthrough"); + is(scalar <$be>, "hi\r\n", 
"add value"); + print $be "STORED\r\n"; + + is(scalar <$ps>, "STORED\r\n", "got STORED from add"); + + # delete + $cmd = "delete /b/a\r\n"; + print $ps $cmd; + is(scalar <$be>, $cmd, "delete passthrough"); + print $be "DELETED\r\n"; + + is(scalar <$ps>, "DELETED\r\n", "got delete response"); + + # incr + $cmd = "incr /b/a 1\r\n"; + print $ps $cmd; + is(scalar <$be>, $cmd, "incr passthrough"); + print $be "2\r\n"; + + is(scalar <$ps>, "2\r\n", "got incr response"); + + # decr + $cmd = "decr /b/a 1\r\n"; + print $ps $cmd; + is(scalar <$be>, $cmd, "decr passthrough"); + print $be "10\r\n"; + + is(scalar <$ps>, "10\r\n", "got decr response"); + + # append + $cmd = "append /b/a 0 0 2"; + print $ps "$cmd\r\nhi\r\n"; + is(scalar <$be>, "$cmd\r\n", "append passthrough"); + is(scalar <$be>, "hi\r\n", "append value"); + print $be "STORED\r\n"; + + is(scalar <$ps>, "STORED\r\n", "got STORED from append"); + + # prepend + $cmd = "prepend /b/a 0 0 2"; + print $ps "$cmd\r\nhi\r\n"; + is(scalar <$be>, "$cmd\r\n", "prepend passthrough"); + is(scalar <$be>, "hi\r\n", "prepend value"); + print $be "STORED\r\n"; + + is(scalar <$ps>, "STORED\r\n", "got STORED from prepend"); + + # [meta commands] + # testing the bare meta commands. + # TODO: add more tests for tokens and changing response codes. 
+ # mg + $cmd = "mg /b/a\r\n"; + print $ps $cmd; + is(scalar <$be>, $cmd, "mg passthrough"); + print $be "HD\r\n"; + + is(scalar <$ps>, "HD\r\n", "got mg response"); + # ms + $cmd = "ms /b/a 2"; + print $ps "$cmd\r\nhi\r\n"; + is(scalar <$be>, "$cmd\r\n", "ms passthrough"); + is(scalar <$be>, "hi\r\n", "ms value"); + print $be "HD\r\n"; + + is(scalar <$ps>, "HD\r\n", "got HD from ms"); + + # md + $cmd = "md /b/a\r\n"; + print $ps $cmd; + is(scalar <$be>, $cmd, "md passthrough"); + print $be "HD\r\n"; + + is(scalar <$ps>, "HD\r\n", "got HD from md"); + # ma + $cmd = "ma /b/a\r\n"; + print $ps $cmd; + is(scalar <$be>, $cmd, "ma passthrough"); + print $be "HD\r\n"; + + is(scalar <$ps>, "HD\r\n", "got HD from ma"); + # mn? + # me? +} + +# run a cleanser check between each set of tests. +check_version($ps); + +{ + # multiget syntax + # - gets broken into individual gets on backend + my $be = $mbe[0]; + my $cmd = "get /b/a /b/b /b/c\r\n"; + print $ps $cmd; + # NOTE: the proxy ends up reversing the keys to the backend, but returns keys in the + # proper order. This is undesireable but not problematic: because of how + # ascii multiget syntax works the server cannot start responding until all + # answers are resolved anyway. 
+ is(scalar <$be>, "get /b/c\r\n", "multiget breakdown c"); + is(scalar <$be>, "get /b/b\r\n", "multiget breakdown b"); + is(scalar <$be>, "get /b/a\r\n", "multiget breakdown a"); + + print $be "VALUE /b/c 0 1\r\nc\r\n", + "END\r\n", + "VALUE /b/b 0 1\r\nb\r\n", + "END\r\n", + "VALUE /b/a 0 1\r\na\r\n", + "END\r\n"; + + for my $key ('a', 'b', 'c') { + is(scalar <$ps>, "VALUE /b/$key 0 1\r\n", "multiget res $key"); + is(scalar <$ps>, "$key\r\n", "multiget value $key"); + } + is(scalar <$ps>, "END\r\n", "final END from multiget"); + + # Test multiget workaround with misses (known bug) + print $ps $cmd; + is(scalar <$be>, "get /b/c\r\n", "multiget breakdown c"); + is(scalar <$be>, "get /b/b\r\n", "multiget breakdown b"); + is(scalar <$be>, "get /b/a\r\n", "multiget breakdown a"); + + print $be "END\r\nEND\r\nEND\r\n"; + is(scalar <$ps>, "END\r\n", "final END from multiget"); + + # If bugged, the backend will have closed. + print $ps "get /b/a\r\n"; + is(scalar <$be>, "get /b/a\r\n", "get works after empty multiget"); + print $be "END\r\n"; + is(scalar <$ps>, "END\r\n", "end after empty multiget"); +} + +check_version($ps); + +{ + # noreply tests. + # - backend should receive with noreply/q stripped or mangled + # - backend should reply as normal + # - frontend should get nothing; to test issue another command and ensure + # it only gets that response. 
+ my $be = $mbe[0]; + my $cmd = "set /b/a 0 0 2 noreply\r\nhi\r\n"; + print $ps $cmd; + is(scalar <$be>, "set /b/a 0 0 2 noreplY\r\n", "set received with broken noreply"); + is(scalar <$be>, "hi\r\n", "set payload received"); + + print $be "STORED\r\n"; + + # To ensure success, make another req and ensure res isn't STORED + $cmd = "touch /b/a 50\r\n"; + print $ps $cmd; + is(scalar <$be>, $cmd, "canary touch received"); + print $be "TOUCHED\r\n"; + + is(scalar <$ps>, "TOUCHED\r\n", "got TOUCHED instread of STORED"); + + # TODO: meta quiet cases + # - q should be turned into a space on the backend + # - errors should still pass through to client +} + +check_version($ps); + +# Test Lua request API +{ + my $be = $mbe[0]; + + # fetching the key. + print $ps "get /getkey/testkey\r\n"; + # look for the key to be slightly different to ensure we hit lua. + is(scalar <$ps>, "VALUE |/getkey/testkey 0 2\r\n", "request:key()"); + is(scalar <$ps>, "ts\r\n", "request:key() value"); + is(scalar <$ps>, "END\r\n", "request:key() END"); + + # rtrimkey + # this overwrites part of the key with spaces, which should be skipped by + # a valid protocol parser. 
+ print $ps "get /rtrimkey/onehalf\r\n"; + is(scalar <$be>, "get /rtrimkey/one \r\n", "request:rtrimkey()"); + print $be "END\r\n"; + is(scalar <$ps>, "END\r\n", "rtrimkey END"); + + # ltrimkey + print $ps "get /ltrimkey/test\r\n"; + is(scalar <$be>, "get test\r\n", "request:ltrimkey()"); + print $be "END\r\n"; + is(scalar <$ps>, "END\r\n", "ltrimkey END"); + + # token(n) fetch + # token(n, "replacement") + # token(n, "") removal + # ntokens() + # command() integer + # + # meta: + # has_flag("F") + # test has_flag() against non-meta command + # flag_token("F") with no token (bool, nil|token) + # flag_token("F") with token + # flag_token("F", "FReplacement") + # flag_token("F", "") removal + # flag_token("F", "FReplacement") -> flag_token("F") test repeated fetch + + # mcp.request() - has a few modes to test + # - allows passing in an existing request to clone/edit + # - passing in value blob +} + +check_version($ps); +# Test Lua response API +#{ + # elapsed() + # ok() + # hit() + # vlen() + # code() + # line() +#} + +# Test requests land in proper backend in basic scenarios +{ + # TODO: maybe should send values to ensure the right response? + # I don't think this test is very useful though; probably better to try + # harder when testing error conditions. + for my $tu (['a', $mbe[0]], ['b', $mbe[1]], ['c', $mbe[2]]) { + my $be = $tu->[1]; + my $cmd = "get /zonetest/" . $tu->[0] . "\r\n"; + print $ps $cmd; + is(scalar <$be>, $cmd, "routed proper zone: " . $tu->[0]); + print $be "END\r\n"; + is(scalar <$ps>, "END\r\n", "end from zone fetch"); + } + my $cmd = "get /zonetest/invalid\r\n"; + print $ps $cmd; + is(scalar <$ps>, "END\r\n", "END from invalid route"); +} + +check_version($ps); +# Test re-requests in lua. 
+# - fetch zones.z1() then fetch zones.z2() +# - return z1 or z2 or netiher +# - fetch all three zones +# - hit the same zone multiple times + +# Test out of spec commands from client +# - wrong # of tokens +# - bad key size +# - etc + +# Test errors/garbage from server +# - certain errors pass through to the client, most close the backend. + +# Test delayed read (timeout) + +# Test Lua logging (see t/watcher.t) +{ + my $be = $mbe[0]; + my $watcher = $p_srv->new_sock; + print $watcher "watch proxyuser proxyreqs\n"; + is(<$watcher>, "OK\r\n", "watcher enabled"); + + # log(msg) + print $ps "get /logtest/a\r\n"; + like(<$watcher>, qr/ts=(\S+) gid=\d+ type=proxy_user msg=testing manual log messages/, + "log a manual message"); + is(scalar <$ps>, "END\r\n", "logtest END"); + + # log_req(r, res) + my $cmd = "get /logreqtest/a\r\n"; + print $ps $cmd; + is(scalar <$be>, $cmd, "got passthru for log"); + print $be "END\r\n"; + is(scalar <$ps>, "END\r\n", "got END from log test"); + like(<$watcher>, qr/ts=(\S+) gid=\d+ type=proxy_req elapsed=\d+ type=105 code=17 status=0 be=127.0.0.1:11411 detail=logreqtest req=get \/logreqtest\/a/, "found request log entry"); + + # test log_req with nil res (should be 0's in places) + # log_reqsample() +} + +# Basic proxy stats validation + +# Test user stats + +check_version($ps); +# Test await arguments (may move to own file?) +# TODO: the results table from mcp.await() contains all of the results so far, +# regardless of the mode. +# need some tests that show this. 
+{ + my $cmd; + # await(r, p) + # this should hit all three backends + my $key = "/awaitbasic/a"; + $cmd = "get $key\r\n"; + print $ps $cmd; + for my $be (@mbe) { + is(scalar <$be>, $cmd, "awaitbasic backend req"); + print $be "VALUE $key 0 2\r\nok\r\nEND\r\n"; + } + is(scalar <$ps>, "VALUE $key 0 11\r\n", "response from await"); + is(scalar <$ps>, "hit hit hit\r\n", "hit responses from await"); + is(scalar <$ps>, "END\r\n", "end from await"); + # repeat above test but with different combo of results + + # await(r, p, 1) + $key = "/awaitone/a"; + $cmd = "get $key\r\n"; + print $ps $cmd; + for my $be (@mbe) { + is(scalar <$be>, $cmd, "awaitone backend req"); + print $be "VALUE $key 0 2\r\nok\r\nEND\r\n"; + } + is(scalar <$ps>, "VALUE $key 0 1\r\n", "response from await"); + is(scalar <$ps>, "1\r\n", "looking for a single response"); + is(scalar <$ps>, "END\r\n", "end from await"); + + # await(r, p(3+), 2) + $key = "/awaitone/b"; + $cmd = "get $key\r\n"; + print $ps $cmd; + for my $be (@mbe) { + is(scalar <$be>, $cmd, "awaitone backend req"); + print $be "VALUE $key 0 2\r\nok\r\nEND\r\n"; + } + is(scalar <$ps>, "VALUE $key 0 1\r\n", "response from await"); + is(scalar <$ps>, "2\r\n", "looking two responses"); + is(scalar <$ps>, "END\r\n", "end from await"); + + # await(r, p, 1, mcp.AWAIT_GOOD) + $key = "/awaitgood/a"; + $cmd = "get $key\r\n"; + print $ps $cmd; + for my $be (@mbe) { + is(scalar <$be>, $cmd, "awaitgood backend req"); + print $be "VALUE $key 0 2\r\nok\r\nEND\r\n"; + } + is(scalar <$ps>, "VALUE $key 0 1\r\n", "response from await"); + is(scalar <$ps>, "1\r\n", "looking for a single response"); + is(scalar <$ps>, "END\r\n", "end from await"); + # should test above with first response being err, second good, third + # miss, and a few similar iterations. 
+ + # await(r, p, 2, mcp.AWAIT_ANY) + $key = "/awaitany/a"; + $cmd = "get $key\r\n"; + print $ps $cmd; + for my $be (@mbe) { + is(scalar <$be>, $cmd, "awaitany backend req"); + print $be "VALUE $key 0 2\r\nok\r\nEND\r\n"; + } + is(scalar <$ps>, "VALUE $key 0 1\r\n", "response from await"); + is(scalar <$ps>, "2\r\n", "looking for a two responses"); + is(scalar <$ps>, "END\r\n", "end from await"); + + # await(r, p, 2, mcp.AWAIT_OK) + # await(r, p, 1, mcp.AWAIT_FIRST) + # more AWAIT_FIRST tests? to see how much it waits on/etc. + # await(r, p, 2, mcp.AWAIT_FASTGOOD) + # - should return 1 res on good, else wait for N non-error responses + $key = "/awaitfastgood/a"; + $cmd = "get $key\r\n"; + print $ps $cmd; + my $fbe = $mbe[0]; + is(scalar <$fbe>, $cmd, "awaitfastgood backend req"); + print $fbe "VALUE $key 0 2\r\nok\r\nEND\r\n"; + # Should have response after the first hit. + is(scalar <$ps>, "VALUE $key 0 2\r\n", "response from await"); + is(scalar <$ps>, "ok\r\n", "await value"); + is(scalar <$ps>, "END\r\n", "end from await"); + for my $be ($mbe[1], $mbe[2]) { + is(scalar <$be>, $cmd, "awaitfastgood backend req"); + print $be "VALUE $key 0 2\r\nok\r\nEND\r\n"; + } + + # test three pools, second response returns good. should have a hit. + print $ps $cmd; + for my $be (@mbe) { + is(scalar <$be>, $cmd, "awaitfastgood backend req"); + } + $fbe = $mbe[0]; + print $fbe "END\r\n"; + $fbe = $mbe[1]; + print $fbe "VALUE $key 0 2\r\nun\r\nEND\r\n"; + is(scalar <$ps>, "VALUE $key 0 2\r\n", "response from await"); + is(scalar <$ps>, "un\r\n", "await value"); + is(scalar <$ps>, "END\r\n", "end from await"); + $fbe = $mbe[2]; + print $fbe "END\r\n"; + + # test three pools, but third returns good. 
should have returned already + print $ps $cmd; + for my $be ($mbe[0], $mbe[1]) { + is(scalar <$be>, $cmd, "awaitfastgood backend req"); + print $be "END\r\n"; + } + $fbe = $mbe[2]; + is(scalar <$fbe>, $cmd, "awaitfastgood backend req"); + print $fbe "VALUE $key 0 2\r\nnu\r\nEND\r\n"; + is(scalar <$ps>, "END\r\n", "miss from awaitfastgood"); + + # Testing a set related to fastgood. waiting for two responses. + $cmd = "set $key 0 0 2\r\nmo\r\n"; + print $ps $cmd; + for my $be ($mbe[0], $mbe[1]) { + is(scalar <$be>, "set $key 0 0 2\r\n", "set backend req"); + is(scalar <$be>, "mo\r\n", "set backend data"); + print $be "STORED\r\n"; + } + is(scalar <$ps>, "STORED\r\n", "got stored from await"); + $fbe = $mbe[2]; + is(scalar <$fbe>, "set $key 0 0 2\r\n", "set backend req"); + is(scalar <$fbe>, "mo\r\n", "set backend data"); + + # Testing another set; ensure it isn't returning early. + my $s = IO::Select->new(); + $s->add($ps); + print $ps $cmd; + for my $be (@mbe) { + is(scalar <$be>, "set $key 0 0 2\r\n", "set backend req"); + is(scalar <$be>, "mo\r\n", "set backend data"); + } + $fbe = $mbe[0]; + print $fbe "STORED\r\n"; + my @readable = $s->can_read(0.25); + is(scalar @readable, 0, "set doesn't return early"); + for my $be ($mbe[1], $mbe[2]) { + print $be "STORED\r\n"; + } + is(scalar <$ps>, "STORED\r\n", "set completed normally"); + + # await(r, p, 1, mcp.AWAIT_BACKGROUND) - ensure res without waiting + $key = "/awaitbg/a"; + $cmd = "get $key\r\n"; + print $ps $cmd; + # check we can get a response _before_ the backends are consulted. 
+ is(scalar <$ps>, "VALUE $key 0 1\r\n", "response from await"); + is(scalar <$ps>, "0\r\n", "looking for zero responses"); + is(scalar <$ps>, "END\r\n", "end from await"); + for my $be (@mbe) { + is(scalar <$be>, $cmd, "awaitbg backend req"); + print $be "VALUE $key 0 2\r\nok\r\nEND\r\n"; + } + + # test hitting a pool normally then hit mcp.await() + # test hitting mcp.await() then a pool normally +} + +{ + my $watcher = $p_srv->new_sock; + print $watcher "watch proxyreqs\n"; + is(<$watcher>, "OK\r\n", "watcher enabled"); + + # test logging errors from special await. + my $key = "/awaitlogerr/a"; + my $cmd = "set $key 0 0 5\r\n"; + print $ps $cmd . "hello\r\n"; + # respond from the first backend normally, then other two with errors. + my $be = $mbe[0]; + is(scalar <$be>, $cmd, "await_logerrors backend req"); + is(scalar <$be>, "hello\r\n", "await_logerrors set payload"); + print $be "STORED\r\n"; + + is(scalar <$ps>, "STORED\r\n", "block until await responded"); + # now ship some errors. + for my $be ($mbe[1], $mbe[2]) { + is(scalar <$be>, $cmd, "await_logerrors backend req"); + is(scalar <$be>, "hello\r\n", "await_logerrors set payload"); + print $be "SERVER_ERROR out of memory\r\n"; + } + like(<$watcher>, qr/ts=(\S+) gid=\d+ type=proxy_req elapsed=\d+ type=\d+ code=\d+ status=-1 be=(\S+) detail=write_failed req=set \/awaitlogerr\/a/, "await_logerrors log entry 1"); + like(<$watcher>, qr/ts=(\S+) gid=\d+ type=proxy_req elapsed=\d+ type=\d+ code=\d+ status=-1 be=(\S+) detail=write_failed req=set \/awaitlogerr\/a/, "await_logerrors log entry 2"); + + # Repeat the logreqtest to ensure we only got the log lines we expected. 
+ $cmd = "get /logreqtest/a\r\n"; + print $ps $cmd; + is(scalar <$be>, $cmd, "got passthru for log"); + print $be "END\r\n"; + is(scalar <$ps>, "END\r\n", "got END from log test"); + like(<$watcher>, qr/ts=(\S+) gid=\d+ type=proxy_req elapsed=\d+ type=105 code=17 status=0 be=127.0.0.1:11411 detail=logreqtest req=get \/logreqtest\/a/, "found request log entry"); +} + +check_version($ps); +done_testing(); diff -Nru memcached-1.6.18/t/watcher_connid.t memcached-1.6.19/t/watcher_connid.t --- memcached-1.6.18/t/watcher_connid.t 2023-01-11 05:58:39.000000000 +0000 +++ memcached-1.6.19/t/watcher_connid.t 2023-03-08 21:34:27.000000000 +0000 @@ -7,11 +7,13 @@ use warnings; use Socket qw/SO_RCVBUF/; -use Test::More tests => 4; +use Test::More; use FindBin qw($Bin); use lib "$Bin/lib"; use MemcachedTest; +plan tests => 4; + my $server = new_memcached('-m 60 -o watcher_logbuf_size=8'); my $client_first = $server->sock; diff -Nru memcached-1.6.18/t/watcher.t memcached-1.6.19/t/watcher.t --- memcached-1.6.18/t/watcher.t 2023-01-11 05:58:39.000000000 +0000 +++ memcached-1.6.19/t/watcher.t 2023-03-08 21:34:27.000000000 +0000 @@ -5,11 +5,13 @@ use warnings; use Socket qw/SO_RCVBUF/; -use Test::More tests => 30; +use Test::More; use FindBin qw($Bin); use lib "$Bin/lib"; use MemcachedTest; +plan tests => 30; + my $server = new_memcached('-m 60 -o watcher_logbuf_size=8'); my $client = $server->sock; my $watcher = $server->new_sock; diff -Nru memcached-1.6.18/thread.c memcached-1.6.19/thread.c --- memcached-1.6.18/thread.c 2023-01-11 06:10:10.000000000 +0000 +++ memcached-1.6.19/thread.c 2023-03-08 21:34:27.000000000 +0000 @@ -86,7 +86,7 @@ static pthread_mutex_t *item_locks; /* size of the item lock hash table */ static uint32_t item_lock_count; -unsigned int item_lock_hashpower; +static unsigned int item_lock_hashpower; #define hashsize(n) ((unsigned long int)1<<(n)) #define hashmask(n) (hashsize(n)-1) @@ -482,12 +482,11 @@ // me->storage is set just before this function is called. 
if (me->storage) { thread_io_queue_add(me, IO_QUEUE_EXTSTORE, me->storage, - storage_submit_cb, storage_complete_cb, NULL, storage_finalize_cb); + storage_submit_cb); } #endif #ifdef PROXY - thread_io_queue_add(me, IO_QUEUE_PROXY, settings.proxy_ctx, proxy_submit_cb, - proxy_complete_cb, proxy_return_cb, proxy_finalize_cb); + thread_io_queue_add(me, IO_QUEUE_PROXY, settings.proxy_ctx, proxy_submit_cb); // TODO: maybe register hooks to be called here from sub-packages? ie; // extstore, TLS, proxy. @@ -495,7 +494,7 @@ proxy_thread_init(settings.proxy_ctx, me); } #endif - thread_io_queue_add(me, IO_QUEUE_NONE, NULL, NULL, NULL, NULL, NULL); + thread_io_queue_add(me, IO_QUEUE_NONE, NULL, NULL); } /* @@ -809,7 +808,7 @@ /* * Allocates a new item. */ -item *item_alloc(char *key, size_t nkey, int flags, rel_time_t exptime, int nbytes) { +item *item_alloc(const char *key, size_t nkey, int flags, rel_time_t exptime, int nbytes) { item *it; /* do_item_alloc handles its own locks */ it = do_item_alloc(key, nkey, flags, exptime, nbytes); @@ -820,12 +819,12 @@ * Returns an item if it hasn't been marked as expired, * lazy-expiring as needed. */ -item *item_get(const char *key, const size_t nkey, conn *c, const bool do_update) { +item *item_get(const char *key, const size_t nkey, LIBEVENT_THREAD *t, const bool do_update) { item *it; uint32_t hv; hv = hash(key, nkey); item_lock(hv); - it = do_item_get(key, nkey, hv, c, do_update); + it = do_item_get(key, nkey, hv, t, do_update); item_unlock(hv); return it; } @@ -833,20 +832,20 @@ // returns an item with the item lock held. // lock will still be held even if return is NULL, allowing caller to replace // an item atomically if desired. 
-item *item_get_locked(const char *key, const size_t nkey, conn *c, const bool do_update, uint32_t *hv) { +item *item_get_locked(const char *key, const size_t nkey, LIBEVENT_THREAD *t, const bool do_update, uint32_t *hv) { item *it; *hv = hash(key, nkey); item_lock(*hv); - it = do_item_get(key, nkey, *hv, c, do_update); + it = do_item_get(key, nkey, *hv, t, do_update); return it; } -item *item_touch(const char *key, size_t nkey, uint32_t exptime, conn *c) { +item *item_touch(const char *key, size_t nkey, uint32_t exptime, LIBEVENT_THREAD *t) { item *it; uint32_t hv; hv = hash(key, nkey); item_lock(hv); - it = do_item_touch(key, nkey, exptime, hv, c); + it = do_item_touch(key, nkey, exptime, hv, t); item_unlock(hv); return it; } @@ -901,7 +900,7 @@ /* * Does arithmetic on a numeric item value. */ -enum delta_result_type add_delta(conn *c, const char *key, +enum delta_result_type add_delta(LIBEVENT_THREAD *t, const char *key, const size_t nkey, bool incr, const int64_t delta, char *buf, uint64_t *cas) { @@ -910,7 +909,7 @@ hv = hash(key, nkey); item_lock(hv); - ret = do_add_delta(c, key, nkey, incr, delta, buf, cas, hv, NULL); + ret = do_add_delta(t, key, nkey, incr, delta, buf, cas, hv, NULL); item_unlock(hv); return ret; } @@ -918,13 +917,13 @@ /* * Stores an item in the cache (high level, obeys set/add/replace semantics) */ -enum store_item_type store_item(item *item, int comm, conn* c) { +enum store_item_type store_item(item *item, int comm, LIBEVENT_THREAD *t, uint64_t *cas, bool cas_stale) { enum store_item_type ret; uint32_t hv; hv = hash(ITEM_key(item), item->nkey); item_lock(hv); - ret = do_store_item(item, comm, c, hv); + ret = do_store_item(item, comm, t, hv, cas, cas_stale); item_unlock(hv); return ret; } @@ -1092,6 +1091,7 @@ #ifdef EXTSTORE threads[i].storage = arg; #endif + threads[i].thread_baseid = i; setup_thread(&threads[i]); /* Reserve three fds for the libevent base, and two for the pipe */ stats_state.reserved_fds += 5; diff -Nru 
memcached-1.6.18/timedrun.c memcached-1.6.19/timedrun.c --- memcached-1.6.18/timedrun.c 2022-02-21 18:58:33.000000000 +0000 +++ memcached-1.6.19/timedrun.c 2023-03-08 21:34:27.000000000 +0000 @@ -44,6 +44,16 @@ break; } else { int sig = 0; + /* pass along SIGHUP gracefully */ + if (caught_sig == SIGHUP) { + i = 0; + int sig = caught_sig; + if (kill(pid, sig) < 0) { + /* Kill failed. Must have lost the process. :/ */ + perror("lost child when trying to kill"); + } + continue; + } switch (i) { case 0: /* On the first iteration, pass the signal through */ Binary files /tmp/tmpb7al8mcp/xHt7Uqnl_a/memcached-1.6.18/vendor/lua/src/lapi.o and /tmp/tmpb7al8mcp/P0TGHc1FWC/memcached-1.6.19/vendor/lua/src/lapi.o differ Binary files /tmp/tmpb7al8mcp/xHt7Uqnl_a/memcached-1.6.18/vendor/lua/src/lauxlib.o and /tmp/tmpb7al8mcp/P0TGHc1FWC/memcached-1.6.19/vendor/lua/src/lauxlib.o differ Binary files /tmp/tmpb7al8mcp/xHt7Uqnl_a/memcached-1.6.18/vendor/lua/src/lbaselib.o and /tmp/tmpb7al8mcp/P0TGHc1FWC/memcached-1.6.19/vendor/lua/src/lbaselib.o differ Binary files /tmp/tmpb7al8mcp/xHt7Uqnl_a/memcached-1.6.18/vendor/lua/src/lcode.o and /tmp/tmpb7al8mcp/P0TGHc1FWC/memcached-1.6.19/vendor/lua/src/lcode.o differ Binary files /tmp/tmpb7al8mcp/xHt7Uqnl_a/memcached-1.6.18/vendor/lua/src/lcorolib.o and /tmp/tmpb7al8mcp/P0TGHc1FWC/memcached-1.6.19/vendor/lua/src/lcorolib.o differ Binary files /tmp/tmpb7al8mcp/xHt7Uqnl_a/memcached-1.6.18/vendor/lua/src/lctype.o and /tmp/tmpb7al8mcp/P0TGHc1FWC/memcached-1.6.19/vendor/lua/src/lctype.o differ Binary files /tmp/tmpb7al8mcp/xHt7Uqnl_a/memcached-1.6.18/vendor/lua/src/ldblib.o and /tmp/tmpb7al8mcp/P0TGHc1FWC/memcached-1.6.19/vendor/lua/src/ldblib.o differ Binary files /tmp/tmpb7al8mcp/xHt7Uqnl_a/memcached-1.6.18/vendor/lua/src/ldebug.o and /tmp/tmpb7al8mcp/P0TGHc1FWC/memcached-1.6.19/vendor/lua/src/ldebug.o differ Binary files /tmp/tmpb7al8mcp/xHt7Uqnl_a/memcached-1.6.18/vendor/lua/src/ldo.o and 
/tmp/tmpb7al8mcp/P0TGHc1FWC/memcached-1.6.19/vendor/lua/src/ldo.o differ Binary files /tmp/tmpb7al8mcp/xHt7Uqnl_a/memcached-1.6.18/vendor/lua/src/ldump.o and /tmp/tmpb7al8mcp/P0TGHc1FWC/memcached-1.6.19/vendor/lua/src/ldump.o differ Binary files /tmp/tmpb7al8mcp/xHt7Uqnl_a/memcached-1.6.18/vendor/lua/src/lfunc.o and /tmp/tmpb7al8mcp/P0TGHc1FWC/memcached-1.6.19/vendor/lua/src/lfunc.o differ Binary files /tmp/tmpb7al8mcp/xHt7Uqnl_a/memcached-1.6.18/vendor/lua/src/lgc.o and /tmp/tmpb7al8mcp/P0TGHc1FWC/memcached-1.6.19/vendor/lua/src/lgc.o differ Binary files /tmp/tmpb7al8mcp/xHt7Uqnl_a/memcached-1.6.18/vendor/lua/src/liblua.a and /tmp/tmpb7al8mcp/P0TGHc1FWC/memcached-1.6.19/vendor/lua/src/liblua.a differ Binary files /tmp/tmpb7al8mcp/xHt7Uqnl_a/memcached-1.6.18/vendor/lua/src/linit.o and /tmp/tmpb7al8mcp/P0TGHc1FWC/memcached-1.6.19/vendor/lua/src/linit.o differ Binary files /tmp/tmpb7al8mcp/xHt7Uqnl_a/memcached-1.6.18/vendor/lua/src/liolib.o and /tmp/tmpb7al8mcp/P0TGHc1FWC/memcached-1.6.19/vendor/lua/src/liolib.o differ Binary files /tmp/tmpb7al8mcp/xHt7Uqnl_a/memcached-1.6.18/vendor/lua/src/llex.o and /tmp/tmpb7al8mcp/P0TGHc1FWC/memcached-1.6.19/vendor/lua/src/llex.o differ Binary files /tmp/tmpb7al8mcp/xHt7Uqnl_a/memcached-1.6.18/vendor/lua/src/lmathlib.o and /tmp/tmpb7al8mcp/P0TGHc1FWC/memcached-1.6.19/vendor/lua/src/lmathlib.o differ Binary files /tmp/tmpb7al8mcp/xHt7Uqnl_a/memcached-1.6.18/vendor/lua/src/lmem.o and /tmp/tmpb7al8mcp/P0TGHc1FWC/memcached-1.6.19/vendor/lua/src/lmem.o differ Binary files /tmp/tmpb7al8mcp/xHt7Uqnl_a/memcached-1.6.18/vendor/lua/src/loadlib.o and /tmp/tmpb7al8mcp/P0TGHc1FWC/memcached-1.6.19/vendor/lua/src/loadlib.o differ Binary files /tmp/tmpb7al8mcp/xHt7Uqnl_a/memcached-1.6.18/vendor/lua/src/lobject.o and /tmp/tmpb7al8mcp/P0TGHc1FWC/memcached-1.6.19/vendor/lua/src/lobject.o differ Binary files /tmp/tmpb7al8mcp/xHt7Uqnl_a/memcached-1.6.18/vendor/lua/src/lopcodes.o and 
/tmp/tmpb7al8mcp/P0TGHc1FWC/memcached-1.6.19/vendor/lua/src/lopcodes.o differ Binary files /tmp/tmpb7al8mcp/xHt7Uqnl_a/memcached-1.6.18/vendor/lua/src/loslib.o and /tmp/tmpb7al8mcp/P0TGHc1FWC/memcached-1.6.19/vendor/lua/src/loslib.o differ Binary files /tmp/tmpb7al8mcp/xHt7Uqnl_a/memcached-1.6.18/vendor/lua/src/lparser.o and /tmp/tmpb7al8mcp/P0TGHc1FWC/memcached-1.6.19/vendor/lua/src/lparser.o differ Binary files /tmp/tmpb7al8mcp/xHt7Uqnl_a/memcached-1.6.18/vendor/lua/src/lstate.o and /tmp/tmpb7al8mcp/P0TGHc1FWC/memcached-1.6.19/vendor/lua/src/lstate.o differ Binary files /tmp/tmpb7al8mcp/xHt7Uqnl_a/memcached-1.6.18/vendor/lua/src/lstring.o and /tmp/tmpb7al8mcp/P0TGHc1FWC/memcached-1.6.19/vendor/lua/src/lstring.o differ Binary files /tmp/tmpb7al8mcp/xHt7Uqnl_a/memcached-1.6.18/vendor/lua/src/lstrlib.o and /tmp/tmpb7al8mcp/P0TGHc1FWC/memcached-1.6.19/vendor/lua/src/lstrlib.o differ Binary files /tmp/tmpb7al8mcp/xHt7Uqnl_a/memcached-1.6.18/vendor/lua/src/ltable.o and /tmp/tmpb7al8mcp/P0TGHc1FWC/memcached-1.6.19/vendor/lua/src/ltable.o differ Binary files /tmp/tmpb7al8mcp/xHt7Uqnl_a/memcached-1.6.18/vendor/lua/src/ltablib.o and /tmp/tmpb7al8mcp/P0TGHc1FWC/memcached-1.6.19/vendor/lua/src/ltablib.o differ Binary files /tmp/tmpb7al8mcp/xHt7Uqnl_a/memcached-1.6.18/vendor/lua/src/ltm.o and /tmp/tmpb7al8mcp/P0TGHc1FWC/memcached-1.6.19/vendor/lua/src/ltm.o differ Binary files /tmp/tmpb7al8mcp/xHt7Uqnl_a/memcached-1.6.18/vendor/lua/src/lua and /tmp/tmpb7al8mcp/P0TGHc1FWC/memcached-1.6.19/vendor/lua/src/lua differ Binary files /tmp/tmpb7al8mcp/xHt7Uqnl_a/memcached-1.6.18/vendor/lua/src/luac and /tmp/tmpb7al8mcp/P0TGHc1FWC/memcached-1.6.19/vendor/lua/src/luac differ Binary files /tmp/tmpb7al8mcp/xHt7Uqnl_a/memcached-1.6.18/vendor/lua/src/luac.o and /tmp/tmpb7al8mcp/P0TGHc1FWC/memcached-1.6.19/vendor/lua/src/luac.o differ Binary files /tmp/tmpb7al8mcp/xHt7Uqnl_a/memcached-1.6.18/vendor/lua/src/lua.o and /tmp/tmpb7al8mcp/P0TGHc1FWC/memcached-1.6.19/vendor/lua/src/lua.o differ 
Binary files /tmp/tmpb7al8mcp/xHt7Uqnl_a/memcached-1.6.18/vendor/lua/src/lundump.o and /tmp/tmpb7al8mcp/P0TGHc1FWC/memcached-1.6.19/vendor/lua/src/lundump.o differ Binary files /tmp/tmpb7al8mcp/xHt7Uqnl_a/memcached-1.6.18/vendor/lua/src/lutf8lib.o and /tmp/tmpb7al8mcp/P0TGHc1FWC/memcached-1.6.19/vendor/lua/src/lutf8lib.o differ Binary files /tmp/tmpb7al8mcp/xHt7Uqnl_a/memcached-1.6.18/vendor/lua/src/lvm.o and /tmp/tmpb7al8mcp/P0TGHc1FWC/memcached-1.6.19/vendor/lua/src/lvm.o differ Binary files /tmp/tmpb7al8mcp/xHt7Uqnl_a/memcached-1.6.18/vendor/lua/src/lzio.o and /tmp/tmpb7al8mcp/P0TGHc1FWC/memcached-1.6.19/vendor/lua/src/lzio.o differ Binary files /tmp/tmpb7al8mcp/xHt7Uqnl_a/memcached-1.6.18/vendor/mcmc/example and /tmp/tmpb7al8mcp/P0TGHc1FWC/memcached-1.6.19/vendor/mcmc/example differ diff -Nru memcached-1.6.18/vendor/mcmc/mcmc.c memcached-1.6.19/vendor/mcmc/mcmc.c --- memcached-1.6.18/vendor/mcmc/mcmc.c 2023-01-11 06:10:10.000000000 +0000 +++ memcached-1.6.19/vendor/mcmc/mcmc.c 2023-03-08 21:34:27.000000000 +0000 @@ -361,6 +361,11 @@ return _mcmc_parse_response(ctx, r); } +int mcmc_bare_parse_buf(char *buf, size_t read, mcmc_resp_t *r) { + mcmc_ctx_t ctx; + return mcmc_parse_buf(&ctx, buf, read, r); +} + /*** Functions wrapping syscalls **/ // TODO: should be able to flip between block and nonblock. 
diff -Nru memcached-1.6.18/vendor/mcmc/mcmc.h memcached-1.6.19/vendor/mcmc/mcmc.h --- memcached-1.6.18/vendor/mcmc/mcmc.h 2023-01-11 06:10:10.000000000 +0000 +++ memcached-1.6.19/vendor/mcmc/mcmc.h 2023-03-08 21:34:27.000000000 +0000 @@ -79,6 +79,7 @@ size_t mcmc_size(int options); size_t mcmc_min_buffer_size(int options); int mcmc_parse_buf(void *c, char *buf, size_t read, mcmc_resp_t *r); +int mcmc_bare_parse_buf(char *buf, size_t read, mcmc_resp_t *r); int mcmc_connect(void *c, char *host, char *port, int options); int mcmc_check_nonblock_connect(void *c, int *err); int mcmc_send_request(void *c, const char *request, int len, int count); Binary files /tmp/tmpb7al8mcp/xHt7Uqnl_a/memcached-1.6.18/vendor/mcmc/mcmc.o and /tmp/tmpb7al8mcp/P0TGHc1FWC/memcached-1.6.19/vendor/mcmc/mcmc.o differ diff -Nru memcached-1.6.18/version.m4 memcached-1.6.19/version.m4 --- memcached-1.6.18/version.m4 2023-01-11 06:17:57.000000000 +0000 +++ memcached-1.6.19/version.m4 2023-03-08 21:42:11.000000000 +0000 @@ -1 +1 @@ -m4_define([VERSION_NUMBER], [1.6.18]) +m4_define([VERSION_NUMBER], [1.6.19])