Binary files /tmp/tmpFLLNnA/6m_V_jmLxa/r-cran-surveillance-1.12.2/build/partial.rdb and /tmp/tmpFLLNnA/NJ1xxKg7_3/r-cran-surveillance-1.13.0/build/partial.rdb differ diff -Nru r-cran-surveillance-1.12.2/debian/changelog r-cran-surveillance-1.13.0/debian/changelog --- r-cran-surveillance-1.12.2/debian/changelog 2016-11-23 20:32:31.000000000 +0000 +++ r-cran-surveillance-1.13.0/debian/changelog 2016-12-22 18:48:15.000000000 +0000 @@ -1,3 +1,9 @@ +r-cran-surveillance (1.13.0-1) unstable; urgency=medium + + * New upstream version + + -- Andreas Tille Thu, 22 Dec 2016 19:48:15 +0100 + r-cran-surveillance (1.12.2-1) unstable; urgency=medium * New upstream version diff -Nru r-cran-surveillance-1.12.2/demo/fluBYBW.R r-cran-surveillance-1.13.0/demo/fluBYBW.R --- r-cran-surveillance-1.12.2/demo/fluBYBW.R 2016-04-02 14:28:06.000000000 +0000 +++ r-cran-surveillance-1.13.0/demo/fluBYBW.R 2016-12-20 15:09:39.000000000 +0000 @@ -9,6 +9,7 @@ ### a copy of which is available at http://www.r-project.org/Licenses/. ################################################################################ +set.seed(1) # for reproducibility (affects initial values for ri() terms) library("surveillance") ## Weekly counts of influenza in 140 districts of Bavaria and Baden-Wuerttemberg @@ -164,7 +165,7 @@ ## in Paul & Held (2011, Table IV). -## comparison with best model B2 +## assess statistical significance of score differences compareWithBest <- function(best, whichModels, nPermut=9999, seed=1234){ set.seed(seed) diff -Nru r-cran-surveillance-1.12.2/DESCRIPTION r-cran-surveillance-1.13.0/DESCRIPTION --- r-cran-surveillance-1.12.2/DESCRIPTION 2016-11-17 13:34:37.000000000 +0000 +++ r-cran-surveillance-1.13.0/DESCRIPTION 2016-12-20 17:31:08.000000000 +0000 @@ -1,8 +1,8 @@ Package: surveillance Title: Temporal and Spatio-Temporal Modeling and Monitoring of Epidemic Phenomena -Version: 1.12.2 -Date: 2016-11-14 +Version: 1.13.0 +Date: 2016-12-20 Authors@R: c(MH = person("Michael", "Höhle", email = "hoehle@math.su.se", role = c("aut", "ths")), @@ -13,6 +13,7 @@ LH = person("Leonhard", "Held", email = "Leonhard.Held@uzh.ch", role = c("ctb", "ths")), + person("Howard", "Burkom", role = "ctb"), person("Thais", "Correa", role = "ctb"), person("Mathias", "Hofmann", role = "ctb"), person("Christian", "Lang", role = "ctb"), @@ -29,7 +30,7 @@ comment = "A few code segments are modified versions of code from base R")) Author: Michael Höhle [aut, ths], Sebastian Meyer [aut, cre], - Michaela Paul [aut], Leonhard Held [ctb, ths], + Michaela Paul [aut], Leonhard Held [ctb, ths], Howard Burkom [ctb], Thais Correa [ctb], Mathias Hofmann [ctb], Christian Lang [ctb], Juliane Manitz [ctb], Andrea Riebler [ctb], Daniel Sabanés Bové [ctb], Maëlle Salmon [ctb], Dirk Schumacher [ctb], Stefan Steiner [ctb], @@ -83,6 +84,6 @@ Encoding: latin1 VignetteBuilder: utils, knitr NeedsCompilation: yes -Packaged: 2016-11-17 10:20:40 UTC; smeyer +Packaged: 2016-12-20 15:49:46 UTC; smeyer Repository: CRAN -Date/Publication: 2016-11-17 14:34:37 +Date/Publication: 2016-12-20 18:31:08 Binary files /tmp/tmpFLLNnA/6m_V_jmLxa/r-cran-surveillance-1.12.2/inst/doc/glrnb.pdf and /tmp/tmpFLLNnA/NJ1xxKg7_3/r-cran-surveillance-1.13.0/inst/doc/glrnb.pdf differ Binary files /tmp/tmpFLLNnA/6m_V_jmLxa/r-cran-surveillance-1.12.2/inst/doc/hhh4.pdf and /tmp/tmpFLLNnA/NJ1xxKg7_3/r-cran-surveillance-1.13.0/inst/doc/hhh4.pdf differ Binary files /tmp/tmpFLLNnA/6m_V_jmLxa/r-cran-surveillance-1.12.2/inst/doc/hhh4_spacetime.pdf and 
/tmp/tmpFLLNnA/NJ1xxKg7_3/r-cran-surveillance-1.13.0/inst/doc/hhh4_spacetime.pdf differ Binary files /tmp/tmpFLLNnA/6m_V_jmLxa/r-cran-surveillance-1.12.2/inst/doc/monitoringCounts.pdf and /tmp/tmpFLLNnA/NJ1xxKg7_3/r-cran-surveillance-1.13.0/inst/doc/monitoringCounts.pdf differ diff -Nru r-cran-surveillance-1.12.2/inst/doc/monitoringCounts.R r-cran-surveillance-1.13.0/inst/doc/monitoringCounts.R --- r-cran-surveillance-1.12.2/inst/doc/monitoringCounts.R 2016-11-17 10:20:33.000000000 +0000 +++ r-cran-surveillance-1.13.0/inst/doc/monitoringCounts.R 2016-12-20 15:49:38.000000000 +0000 @@ -48,9 +48,9 @@ cex.leg <- cex.text line.lwd <- 2#1 stsPlotCol <- c("mediumblue","mediumblue","red2") -alarm.symbol <- list(pch=17, col="red2", cex=2,lwd=3) +alarm.symbol <- list(pch=17, col="red2", cex=2,lwd=3) #Define list with arguments to use with do.call("legend", legOpts) -legOpts <- list(x="topleft",legend=c(expression(U[t])),bty="n",lty=1,lwd=line.lwd,col=alarm.symbol$col,horiz=TRUE,cex=cex.leg) +legOpts <- list(x="topleft",legend=c(expression(U[t])),bty="n",lty=1,lwd=line.lwd,col=alarm.symbol$col,horiz=TRUE,cex=cex.leg) #How should the par of each plot look? par.list <- list(mar=c(6,5,5,5),family="Times") #Do this once @@ -62,23 +62,23 @@ ylab="No. of reports", xlab="Time (weeks)",lty=c(1,1,1), legend.opts=legOpts,alarm.symbol=alarm.symbol, xaxis.tickFreq=list("%V"=atChange,"%m"=atChange,"%G"=atChange), - xaxis.labelFreq=list("%Y"=atMedian), + xaxis.labelFreq=list("%Y"=atMedian), xaxis.labelFormat="%Y", par.list=par.list,hookFunc=hookFunc) - + ################################################### ### code chunk number 6: stsLoad ################################################### -# Load data +# Load data data("salmNewport") ################################################### ### code chunk number 7: NewportPlot ################################################### -# Plot +# Plot y.max <- max(aggregate(salmNewport,by="unit")@observed,na.rm=TRUE) plotOpts2 <- modifyList(plotOpts,list(x=salmNewport,legend.opts=NULL,ylim=c(0,y.max),type = observed ~ time),keep.null=TRUE) plotOpts2$par.list <- list(mar=c(6,5,0,5),family="Times") @@ -90,7 +90,7 @@ ### code chunk number 8: NewportPlot ################################################### getOption("SweaveHooks")[["fig"]]() -# Plot +# Plot y.max <- max(aggregate(salmNewport,by="unit")@observed,na.rm=TRUE) plotOpts2 <- modifyList(plotOpts,list(x=salmNewport,legend.opts=NULL,ylim=c(0,y.max),type = observed ~ time),keep.null=TRUE) plotOpts2$par.list <- list(mar=c(6,5,0,5),family="Times") @@ -133,7 +133,7 @@ in2011 <- which(isoWeekYear(epoch(salmNewport))$ISOYear == 2011) salmNewportGermany <- aggregate(salmNewport, by = "unit") control <- list(range = in2011, method = "C1", alpha = 0.05) -surv <- earsC(salmNewportGermany, control = control) +surv <- earsC(salmNewportGermany, control = control) plot(surv) @@ -148,7 +148,7 @@ control <- list(range = in2011, method="C1", alpha=0.05) # Apply earsC function surv <- earsC(salmNewportGermany, control=control) -# Plot the results +# Plot the results #plot(surv) # Plot y.max <- max(observed(surv),upperbound(surv),na.rm=TRUE) @@ -167,7 +167,7 @@ control <- list(range = in2011, method="C1", alpha=0.05) # Apply earsC function surv <- earsC(salmNewportGermany, control=control) -# Plot the results +# Plot the results #plot(surv) # Plot y.max <- max(observed(surv),upperbound(surv),na.rm=TRUE) @@ -182,7 +182,7 @@ b=4,w=3,weightsThreshold=1,pastWeeksNotIncluded=3, pThresholdTrend=0.05,thresholdMethod="delta",alpha=0.05, 
limit54=c(0,50)) -# Control slot for the improved method +# Control slot for the improved method control2 <- list(range=in2011,noPeriods=10, b=4,w=3,weightsThreshold=2.58,pastWeeksNotIncluded=26, pThresholdTrend=1,thresholdMethod="nbPlugin",alpha=0.05, @@ -242,13 +242,13 @@ annotate("text", label = "Time", x = 170, y = 0, size = 8, colour = "black", family="serif") + # ticks labels -annotate('text',label=c("t[0]-2 %.% freq","t[0]-freq","t[0]"),x = xTicks, - y = yTicksEnd - 10, size = 8,family="serif",parse=T) +annotate('text',label=c("t[0]-2 %.% freq","t[0]-freq","t[0]"),x = xTicks, + y = yTicksEnd - 10, size = 8,family="serif",parse=T) p+ # periods labels -annotate('text',label=c("A","A","A","B","B"),x = xPeriods, - y = rep(6,5), size = 8,family="serif",parse=T) +annotate('text',label=c("A","A","A","B","B"),x = xPeriods, + y = rep(6,5), size = 8,family="serif",parse=T) ################################################### @@ -262,14 +262,14 @@ p + geom_segment(aes(x = newX, y = yTicksBigStart2, xend = newX, yend = yTicksBigEnd2), size=1)+ # periods labels -annotate('text',label=c("A","A","A","B","B","C","C"),x = xPeriods, - y = rep(6,7), size = 8,family="serif",parse=T) +annotate('text',label=c("A","A","A","B","B","C","C"),x = xPeriods, + y = rep(6,7), size = 8,family="serif",parse=T) ################################################### ### code chunk number 19: oldVsNewprep ################################################### -salm.farrington <- farringtonFlexible(salmNewportGermany, control1) +salm.farrington <- farringtonFlexible(salmNewportGermany, control1) salm.noufaily <- farringtonFlexible(salmNewportGermany, control2) @@ -295,8 +295,8 @@ ################################################### # Load data and create \code{sts}-object data("campyDE") -cam.sts <- new("sts",epoch=as.numeric(campyDE$date), - observed=campyDE$case, state=campyDE$state, +cam.sts <- new("sts",epoch=as.numeric(campyDE$date), + observed=campyDE$case, state=campyDE$state, epochAsDate=TRUE) par(las=1) # Plot @@ -306,15 +306,15 @@ do.call("plot",plotOpts3) par(las=0) #mtext(side=2,text="No. of reports", - # las=0,line=3, cex=cex.text,family="Times") + # las=0,line=3, cex=cex.text,family="Times") par(family="Times") -text(-20, 2600, "No. of\n reports", pos = 3, xpd = T,cex=cex.text) +text(-20, 2600, "No. of\n reports", pos = 3, xpd = T,cex=cex.text) text(510, 2900, "Absolute humidity", pos = 3, xpd = T,cex=cex.text) text(510, 2550, expression(paste("[",g/m^3,"]", sep='')), pos = 3, xpd = T,cex=cex.text) -lines(campyDE$hum*50, col="white", lwd=2) +lines(campyDE$hum*50, col="white", lwd=2) axis(side=4, at=seq(0,2500,by=500),labels=seq(0,50,by=10),las=1,cex.lab=cex.text, cex=cex.text,cex.axis=cex.text,pos=length(epoch(cam.sts))+20) #mtext(side=4,text=expression(paste("Absolute humidity [ ",g/m^3,"]", sep='')), - # las=0,line=1, cex=cex.text,family="Times") + # las=0,line=1, cex=cex.text,family="Times") @@ -324,8 +324,8 @@ getOption("SweaveHooks")[["fig"]]() # Load data and create \code{sts}-object data("campyDE") -cam.sts <- new("sts",epoch=as.numeric(campyDE$date), - observed=campyDE$case, state=campyDE$state, +cam.sts <- new("sts",epoch=as.numeric(campyDE$date), + observed=campyDE$case, state=campyDE$state, epochAsDate=TRUE) par(las=1) # Plot @@ -335,15 +335,15 @@ do.call("plot",plotOpts3) par(las=0) #mtext(side=2,text="No. of reports", - # las=0,line=3, cex=cex.text,family="Times") + # las=0,line=3, cex=cex.text,family="Times") par(family="Times") -text(-20, 2600, "No. 
of\n reports", pos = 3, xpd = T,cex=cex.text) +text(-20, 2600, "No. of\n reports", pos = 3, xpd = T,cex=cex.text) text(510, 2900, "Absolute humidity", pos = 3, xpd = T,cex=cex.text) text(510, 2550, expression(paste("[",g/m^3,"]", sep='')), pos = 3, xpd = T,cex=cex.text) -lines(campyDE$hum*50, col="white", lwd=2) +lines(campyDE$hum*50, col="white", lwd=2) axis(side=4, at=seq(0,2500,by=500),labels=seq(0,50,by=10),las=1,cex.lab=cex.text, cex=cex.text,cex.axis=cex.text,pos=length(epoch(cam.sts))+20) #mtext(side=4,text=expression(paste("Absolute humidity [ ",g/m^3,"]", sep='')), - # las=0,line=1, cex=cex.text,family="Times") + # las=0,line=1, cex=cex.text,family="Times") @@ -354,7 +354,7 @@ ## data("campyDE") ## cam.sts <- new("sts", epoch = as.numeric(campyDE$date), ## observed = campyDE$case, state = campyDE$state, -## epochAsDate = TRUE) +## epochAsDate = TRUE) ## plot(cam.sts, legend = NULL, xlab = "time [weeks]", ylab = "No. reported", ## col = "gray", cex = 2, cex.axis = 2, cex.lab = 2) ## lines(campyDE$hum * 50, col = "darkblue", lwd = 2) @@ -366,7 +366,7 @@ ## ## rangeBoda <- which(epoch(cam.sts) >= as.Date("2007-01-01")) ## control.boda <- list(range = rangeBoda, X = NULL, trend = TRUE, -## season = TRUE, prior = "iid", alpha = 0.025, +## season = TRUE, prior = "iid", alpha = 0.025, ## mc.munu = 10000, mc.y = 1000, ## samplingMethod = "marginals") ## boda <- boda(cam.sts, control = control.boda) @@ -379,7 +379,7 @@ if (computeALL) { library("INLA") control.boda <- list(range=rangeBoda, X=NULL, trend=TRUE, - season=TRUE, prior='rw1', alpha=0.025, + season=TRUE, prior='rw1', alpha=0.025, mc.munu=10000, mc.y=1000, samplingMethod = "marginals") # boda without covariates: trend + spline + periodic spline @@ -393,9 +393,9 @@ ################################################### ### code chunk number 27: NICELOOKINGboda2 (eval = FALSE) ################################################### -## covarNames <- c("l1.hum", "l2.hum", "l3.hum", "l4.hum", +## covarNames <- c("l1.hum", "l2.hum", "l3.hum", "l4.hum", ## "newyears", "christmas", "O104period") -## control.boda2 <- modifyList(control.boda, +## control.boda2 <- modifyList(control.boda, ## list(X = campyDE[, covarNames], season = FALSE)) ## boda.covars <- boda(cam.sts, control = control.boda2) @@ -407,7 +407,7 @@ # boda with covariates: trend + spline + lagged hum + indicator variables covarNames <- c(paste("l",1:4,".hum",sep=""),"newyears","christmas", "O104period") -control.boda2 <- modifyList(control.boda, +control.boda2 <- modifyList(control.boda, list(X=campyDE[,covarNames],season=FALSE)) boda.covars <- boda(cam.sts, control=control.boda2) save(boda.covars, file = "monitoringCounts-cache/boda.covars.RData") @@ -420,7 +420,7 @@ ### code chunk number 29: alarmplot2 (eval = FALSE) ################################################### ## cam.surv <- combineSTS(list(boda.covars=boda.covars,boda=boda,bayes=bayes, -## farrington=far,farringtonFlexible=farflex)) +## farrington=far,farringtonFlexible=farflex)) ## plot(cam.surv,type = alarm ~ time) @@ -483,7 +483,7 @@ cam.surv <- combineSTS(list(boda.covars=boda.covars,boda=boda,bayes=bayes, farrington=far,farringtonFlexible=farflex)) -par(mar=c(4,8,2.1,2),family="Times") +par(mar=c(4,8,2.1,2),family="Times") plot(cam.surv,type = alarm ~ time,lvl=rep(1,ncol(cam.surv)), alarm.symbol=list(pch=17, col="red2", cex=1,lwd=3), cex.axis=1,xlab="Time (weeks)",cex.lab=1,xaxis.tickFreq=list("%m"=atChange,"%G"=atChange),xaxis.labelFreq=list("%G"=at2ndChange), @@ -499,7 +499,7 @@ cam.surv <- 
combineSTS(list(boda.covars=boda.covars,boda=boda,bayes=bayes, farrington=far,farringtonFlexible=farflex)) -par(mar=c(4,8,2.1,2),family="Times") +par(mar=c(4,8,2.1,2),family="Times") plot(cam.surv,type = alarm ~ time,lvl=rep(1,ncol(cam.surv)), alarm.symbol=list(pch=17, col="red2", cex=1,lwd=3), cex.axis=1,xlab="Time (weeks)",cex.lab=1,xaxis.tickFreq=list("%m"=atChange,"%G"=atChange),xaxis.labelFreq=list("%G"=at2ndChange), @@ -513,7 +513,7 @@ ## phase1 <- which(isoWeekYear(epoch(salmNewportGermany))$ISOYear < 2011) ## phase2 <- in2011 ## control = list(range = phase2, c.ARL = 4, theta = log(2), ret = "cases", -## mu0 = list(S = 1, trend = TRUE, refit = FALSE)) +## mu0 = list(S = 1, trend = TRUE, refit = FALSE)) ## salmGlrnb <- glrnb(salmNewportGermany, control = control) @@ -529,7 +529,7 @@ trend=TRUE, refit=FALSE),c.ARL = 4, theta=log(2),ret="cases") -# Perform monitoring with glrnb +# Perform monitoring with glrnb salmGlrnb <- glrnb(salmNewportGermany,control=control) @@ -538,7 +538,7 @@ ################################################### # Plot y.max <- max(observed(salmGlrnb),upperbound(salmGlrnb),na.rm=TRUE) -do.call("plot",modifyList(plotOpts,list(x=salmGlrnb,ylim=c(0,y.max)))) +do.call("plot",modifyList(plotOpts,list(x=salmGlrnb,ylim=c(0,y.max)))) ################################################### @@ -547,7 +547,7 @@ getOption("SweaveHooks")[["fig"]]() # Plot y.max <- max(observed(salmGlrnb),upperbound(salmGlrnb),na.rm=TRUE) -do.call("plot",modifyList(plotOpts,list(x=salmGlrnb,ylim=c(0,y.max)))) +do.call("plot",modifyList(plotOpts,list(x=salmGlrnb,ylim=c(0,y.max)))) @@ -561,13 +561,13 @@ ## data2013 <- which(isoWeekYearData$ISOYear == 2013) ## dataEarly2014 <- which(isoWeekYearData$ISOYear == 2014 ## & isoWeekYearData$ISOWeek <= 4) -## +## ## phase1 <- dataBefore2013 ## phase2 <- c(data2013, dataEarly2014) ## ## weekNumbers <- isoWeekYearData$ISOWeek ## salmHospitalized.df <- cbind(as.data.frame(salmHospitalized), weekNumbers) -## colnames(salmHospitalized.df) <- c("y", "t", "state", "alarm", "n", +## colnames(salmHospitalized.df) <- c("y", "t", "state", "alarm", "upperbound","n", ## "freq", "epochInPeriod", "weekNumber") @@ -584,7 +584,7 @@ # Prepare data for fitting the model weekNumber <- isoWeekYear(epoch(salmHospitalized))$ISOWeek salmHospitalized.df <- cbind(as.data.frame(salmHospitalized),weekNumber) -colnames(salmHospitalized.df) <- c("y","t","state","alarm","n","freq", +colnames(salmHospitalized.df) <- c("y","t","state","alarm","upperbound","n","freq", "epochInPeriod","weekNumber") @@ -593,9 +593,9 @@ ################################################### vars <- c( "y", "n", "t", "epochInPeriod", "weekNumber") m.bbin <- gamlss(cbind(y, n-y) ~ 1 + t - + sin(2 * pi * epochInPeriod) + cos(2 * pi * epochInPeriod) + + sin(2 * pi * epochInPeriod) + cos(2 * pi * epochInPeriod) + sin(4 * pi * epochInPeriod) + cos(4 * pi * epochInPeriod) - + I(weekNumber == 1) + I(weekNumber == 2), + + I(weekNumber == 1) + I(weekNumber == 2), sigma.formula =~ 1, family = BB(sigma.link = "log"), data = salmHospitalized.df[phase1, vars]) @@ -604,7 +604,7 @@ ################################################### ### code chunk number 43: cat2longVersion (eval = FALSE) ################################################### -## R <- 2 +## R <- 2 ## h <- 2 ## pi0 <- predict(m.bbin, newdata = salmHospitalized.df[phase2, vars], ## type = "response") @@ -618,7 +618,7 @@ ################################################### # CUSUM parameters R <- 2 #detect a doubling of the odds for a salmHospitalized being positive 
-h <- 2 #threshold of the cusum +h <- 2 #threshold of the cusum # Compute \textit{in-control} and out of control mean pi0 <- predict(m.bbin,newdata=salmHospitalized.df[phase2,vars], type="response") @@ -632,10 +632,10 @@ ################################################### ### code chunk number 45: cat2bislongVersion (eval = FALSE) ################################################### -## populationHosp <- cbind(population(salmHospitalized), +## populationHosp <- cbind(population(salmHospitalized), ## population(salmHospitalized)) -## observedHosp <- cbind(observed(salmHospitalized), -## population(salmHospitalized) - +## observedHosp <- cbind(observed(salmHospitalized), +## population(salmHospitalized) - ## observed(salmHospitalized)) ## nrowHosp <- nrow(salmHospitalized) ## salmHospitalized.multi <- new("sts", freq = 52, start = c(2004, 1), @@ -643,7 +643,7 @@ ## epochAsDate = TRUE, ## observed = observedHosp, ## populationFrac = populationHosp, -## state = matrix(0, nrow = nrowHosp, ncol = 2), +## state = matrix(0, nrow = nrowHosp, ncol = 2), ## multinomialTS = TRUE) @@ -661,7 +661,7 @@ populationFrac = cbind(population, population), state=matrix(0, nrow=nrow(salmHospitalized), - ncol = 2), + ncol = 2), multinomialTS=TRUE) @@ -669,30 +669,30 @@ ### code chunk number 47: cat2terdisplay (eval = FALSE) ################################################### ## dBB.cusum <- function(y, mu, sigma, size, log = FALSE) { -## return(dBB(if (is.matrix(y)) y[1,] else y, +## return(dBB(if (is.matrix(y)) y[1,] else y, ## if (is.matrix(y)) mu[1,] else mu, -## sigma = sigma, bd = size, log = log)) +## sigma = sigma, bd = size, log = log)) ## } ################################################### ### code chunk number 48: cat2ter ################################################### -# Function to use as dfun in the categoricalCUSUM +# Function to use as dfun in the categoricalCUSUM dBB.cusum <- function(y, mu, sigma, size, log = FALSE) { return(dBB( if (is.matrix(y)) y[1,] else y, if (is.matrix(y)) mu[1,] else mu, - sigma = sigma, bd = size, log = log)) + sigma = sigma, bd = size, log = log)) } ################################################### ### code chunk number 49: cat3display (eval = FALSE) ################################################### -## controlCat <- list(range = phase2, h = 2, pi0 = pi0m, pi1 = pi1m, +## controlCat <- list(range = phase2, h = 2, pi0 = pi0m, pi1 = pi1m, ## ret = "cases", dfun = dBB.cusum) -## salmHospitalizedCat <- categoricalCUSUM(salmHospitalized.multi, +## salmHospitalizedCat <- categoricalCUSUM(salmHospitalized.multi, ## control = controlCat, -## sigma = exp(m.bbin$sigma.coef)) +## sigma = exp(m.bbin$sigma.coef)) ################################################### @@ -701,9 +701,9 @@ # Monitoring controlCat <- list(range = phase2,h = 2,pi0 = pi0m, pi1 = pi1m, ret = "cases", dfun = dBB.cusum) -salmHospitalizedCat <- categoricalCUSUM(salmHospitalized.multi, +salmHospitalizedCat <- categoricalCUSUM(salmHospitalized.multi, control = controlCat, - sigma = exp(m.bbin$sigma.coef)) + sigma = exp(m.bbin$sigma.coef)) ################################################### @@ -715,14 +715,14 @@ plotOpts2$par.list <- list(mar=c(6,5,5,5),family="Times",las=1) do.call("plot",plotOpts2) lines(salmHospitalized@populationFrac/4000,col="grey80",lwd=2) -lines(campyDE$hum*50, col="white", lwd=2) +lines(campyDE$hum*50, col="white", lwd=2) axis(side=4, at=seq(0,2000,by=500)/4000,labels=as.character(seq(0,2000,by=500)),las=1, cex=2,cex.axis=1.5,pos=length(observed(salmHospitalized))+20) 
par(family="Times") -text(-20, 0.6, "Proportion", pos = 3, xpd = T,cex=cex.text) +text(-20, 0.6, "Proportion", pos = 3, xpd = T,cex=cex.text) text(520, 0.6, "Total number of \n reported cases", pos = 3, xpd = T,cex=cex.text) #mtext(side=4,text=expression(paste("Total number of reported cases (thousands)", sep='')), - #las=0,line=1, cex=cex.text) + #las=0,line=1, cex=cex.text) @@ -736,14 +736,14 @@ plotOpts2$par.list <- list(mar=c(6,5,5,5),family="Times",las=1) do.call("plot",plotOpts2) lines(salmHospitalized@populationFrac/4000,col="grey80",lwd=2) -lines(campyDE$hum*50, col="white", lwd=2) +lines(campyDE$hum*50, col="white", lwd=2) axis(side=4, at=seq(0,2000,by=500)/4000,labels=as.character(seq(0,2000,by=500)),las=1, cex=2,cex.axis=1.5,pos=length(observed(salmHospitalized))+20) par(family="Times") -text(-20, 0.6, "Proportion", pos = 3, xpd = T,cex=cex.text) +text(-20, 0.6, "Proportion", pos = 3, xpd = T,cex=cex.text) text(520, 0.6, "Total number of \n reported cases", pos = 3, xpd = T,cex=cex.text) #mtext(side=4,text=expression(paste("Total number of reported cases (thousands)", sep='')), - #las=0,line=1, cex=cex.text) + #las=0,line=1, cex=cex.text) @@ -757,13 +757,13 @@ ################################################### ### code chunk number 54: NICELOOKING (eval = FALSE) ################################################### -## h.grid <- seq(1, 10, by = 0.5) +## h.grid <- seq(1, 10, by = 0.5) ## ## simone <- function(sts, h) { ## y <- rBB(length(phase2), mu = pi0m[1, , drop = FALSE], ## bd = population(sts)[phase2, ], ## sigma = exp(m.bbin$sigma.coef)) -## observed(sts)[phase2, ] <- cbind(y, sts@populationFrac[phase2, 1] - y) +## observed(sts)[phase2, ] <- cbind(y, sts@populationFrac[phase2, 1] - y) ## one.surv <- categoricalCUSUM(sts, modifyList(controlCat, list(h = h)), ## sigma = exp(m.bbin$sigma.coef)) ## return(any(alarms(one.surv)[, 1])) @@ -772,13 +772,13 @@ ## ## nSims <- 1000 ## -## pMC <- sapply(h.grid, function(h) { -## mean(replicate(nSims, simone(salmHospitalized.multi, h))) +## pMC <- sapply(h.grid, function(h) { +## mean(replicate(nSims, simone(salmHospitalized.multi, h))) ## }) ## ## pMarkovChain <- sapply( h.grid, function(h) { -## TA <- LRCUSUM.runlength(mu = pi0m[1,, drop = FALSE], -## mu0 = pi0m[1,, drop = FALSE], +## TA <- LRCUSUM.runlength(mu = pi0m[1,, drop = FALSE], +## mu0 = pi0m[1,, drop = FALSE], ## mu1 = pi1m[1,, drop = FALSE], ## n = population(salmHospitalized.multi)[phase2, ], ## h = h, dfun = dBB.cusum, @@ -791,8 +791,8 @@ ### code chunk number 55: cath ################################################### # Values of the threshold to be investigated -h.grid <- seq(1,10,by=0.5) - +h.grid <- seq(1,10,by=0.5) + # Prepare function for simulations simone <- function(sts, h) { # Draw observed values from the \textit{in-control} distribution @@ -800,7 +800,7 @@ bd=population(sts)[phase2,], sigma=exp(m.bbin$sigma.coef)) observed(sts)[phase2,] <- cbind(y,sts@populationFrac[phase2,1] - y) -# Perform monitoring +# Perform monitoring one.surv <- categoricalCUSUM(sts, control=modifyList(controlCat, list(h=h)), sigma=exp(m.bbin$sigma.coef)) # Return 1 if there was at least one alarm @@ -812,17 +812,17 @@ # Number of simulations nSims=1000 # Simulations over the possible h values -pMC <- sapply(h.grid, function(h) { +pMC <- sapply(h.grid, function(h) { h <- h - mean(replicate(nSims, simone(salmHospitalized.multi,h))) + mean(replicate(nSims, simone(salmHospitalized.multi,h))) }) # Distribution function to be used by LRCUSUM.runlength dBB.rl <- function(y, mu, sigma, size, 
log = FALSE) { - dBB(y, mu = mu, sigma = sigma, bd = size, log = log) + dBB(y, mu = mu, sigma = sigma, bd = size, log = log) } # Markov Chain approximation over h.grid pMarkovChain <- sapply( h.grid, function(h) { - TA <- LRCUSUM.runlength(mu=pi0m[1,,drop=FALSE], mu0=pi0m[1,,drop=FALSE], + TA <- LRCUSUM.runlength(mu=pi0m[1,,drop=FALSE], mu0=pi0m[1,,drop=FALSE], mu1=pi1m[1,,drop=FALSE], n=population(salmHospitalized.multi)[phase2,], h=h, dfun=dBB.rl, sigma=exp(m.bbin$sigma.coef)) @@ -878,7 +878,7 @@ ### code chunk number 59: ROTAPLOT (eval = FALSE) ################################################### ## data("rotaBB") -## plot(rotaBB, xlab = "Time (months)", +## plot(rotaBB, xlab = "Time (months)", ## ylab = "Proportion of reported cases") @@ -911,7 +911,7 @@ fun(epoch(rotaBB),observed(rotaBB)[,i],type="l",xlab="Time (months)",ylab="Reported cases",ylim=c(0,max(observed(rotaBB))),col=pal[i],lwd=2) } else { fun(epoch(rotaBB),observed(rotaBB)[,i,drop=FALSE]/rowSums(observed(rotaBB)),type="l",xlab="Time (months)",ylab="Proportion of reported cases",ylim=c(0,max(observed(rotaBB)/rowSums(observed(rotaBB)))),col=pal[i],lwd=2) - } + } } # Add legend axis(1,at=as.numeric(epoch(rotaBB)),label=NA,tck=-0.01) @@ -932,19 +932,19 @@ ### code chunk number 62: monitoringCounts.Rnw:1394-1408 (eval = FALSE) ################################################### ## rotaBB.df <- as.data.frame(rotaBB) -## +## ## X <- with(rotaBB.df, cbind(intercept = 1, epoch, -## sin1 = sin(2 * pi * epochInPeriod), +## sin1 = sin(2 * pi * epochInPeriod), ## cos1 = cos(2 * pi * epochInPeriod))) ## -## phase1 <- epoch(rotaBB) < as.Date("2009-01-01") +## phase1 <- epoch(rotaBB) < as.Date("2009-01-01") ## phase2 <- !phase1 ## ## order <- c(2:5, 1); reorder <- c(5, 1:4) ## ## library("MGLM") -## m0 <- MGLMreg(as.matrix(rotaBB.df[phase1, order]) ~ -1 + X[phase1, ], -## dist = "MN") +## m0 <- MGLMreg(as.matrix(rotaBB.df[phase1, order]) ~ -1 + X[phase1, ], +## dist = "MN") ################################################### @@ -953,12 +953,12 @@ # Convert sts object to data.frame useful for regression modelling rotaBB.df <- as.data.frame(rotaBB) -# Create matrix +# Create matrix X <- with(rotaBB.df,cbind(intercept=1,epoch, sin1=sin(2*pi*epochInPeriod),cos1=cos(2*pi*epochInPeriod))) # Fit model to 2002-2009 data -phase1 <- epoch(rotaBB) < as.Date("2009-01-01") +phase1 <- epoch(rotaBB) < as.Date("2009-01-01") phase2 <- !phase1 # MGLMreg automatically takes the last class as ref so we reorder @@ -966,7 +966,7 @@ # Fit multinomial logit model (i.e. 
dist="MN") to phase1 data library("MGLM") -m0 <- MGLMreg(as.matrix(rotaBB.df[phase1,order])~ -1 + X[phase1,], dist="MN") +m0 <- MGLMreg(as.matrix(rotaBB.df[phase1,order])~ -1 + X[phase1,], dist="MN") ################################################### @@ -991,7 +991,7 @@ ### code chunk number 66: monitoringCounts.Rnw:1444-1450 ################################################### m1 <- m0 -# Out-of control model: shift in all intercept coeffs +# Out-of control model: shift in all intercept coeffs m1$coefficients[1,] <- m0$coefficients[1,] + log(2) # Proportion over time for phase2 based on fitted model (re-order back) pi0 <- t(predict(m0, newdata=X[phase2,])[,reorder]) @@ -1005,7 +1005,7 @@ return(dmultinom(x = y, size = size, prob = mu, log = log)) } -control <- list(range = seq(nrow(rotaBB))[phase2], h = h, pi0 = pi0, +control <- list(range = seq(nrow(rotaBB))[phase2], h = h, pi0 = pi0, pi1 = pi1, ret = "value", dfun = dfun) surv <- categoricalCUSUM(rotaBB,control=control) @@ -1016,7 +1016,7 @@ #Number of MC samples nSamples <- 1e4 -#Do MC +#Do MC simone.stop <- function(sts, control) { phase2Times <- seq(nrow(sts))[phase2] #Generate new phase2 data from the fitted in control model @@ -1053,7 +1053,7 @@ ## return(dmultinom(x = y, size = size, prob = mu, log = log)) ## } ## -## control <- list(range = seq(nrow(rotaBB))[phase2], h = h, pi0 = pi0, +## control <- list(range = seq(nrow(rotaBB))[phase2], h = h, pi0 = pi0, ## pi1 = pi1, ret = "value", dfun = dfun) ## surv <- categoricalCUSUM(rotaBB,control=control) @@ -1061,7 +1061,7 @@ ################################################### ### code chunk number 71: monitoringCounts.Rnw:1508-1512 ################################################### -m0.dm <- MGLMreg(as.matrix(rotaBB.df[phase1, 1:5]) ~ -1 + X[phase1, ], +m0.dm <- MGLMreg(as.matrix(rotaBB.df[phase1, 1:5]) ~ -1 + X[phase1, ], dist = "DM") c(m0$AIC, m0.dm$AIC) @@ -1071,21 +1071,21 @@ ### code chunk number 72: monitoringCounts.Rnw:1520-1538 (eval = FALSE) ################################################### ## delta <- 2 -## m1.dm <- m0.dm -## m1.dm$coefficients[1, ] <- m0.dm$coefficients[1, ] + +## m1.dm <- m0.dm +## m1.dm$coefficients[1, ] <- m0.dm$coefficients[1, ] + ## c(-delta, rep(delta/4, 4)) ## ## alpha0 <- exp(X[phase2,] %*% m0.dm$coefficients) ## alpha1 <- exp(X[phase2,] %*% m1.dm$coefficients) -## +## ## dfun <- function(y, size, mu, log = FALSE) { ## dLog <- ddirm(t(y), t(mu)) -## if (log) { return(dLog) } else { return(exp(dLog)) } +## if (log) { return(dLog) } else { return(exp(dLog)) } ## } ## ## h <- 2 -## control <- list(range = seq(nrow(rotaBB))[phase2], h = h, -## pi0 = t(alpha0), pi1 = t(alpha1), +## control <- list(range = seq(nrow(rotaBB))[phase2], h = h, +## pi0 = t(alpha0), pi1 = t(alpha1), ## ret = "value", dfun = dfun) ## surv.dm <- categoricalCUSUM(rotaBB, control = control) @@ -1095,23 +1095,23 @@ ################################################### # Change intercept in the first class (for DM all 5 classes are modeled) delta <- 2 -m1.dm <- m0.dm -m1.dm$coefficients[1,] <- m0.dm$coefficients[1,] + +m1.dm <- m0.dm +m1.dm$coefficients[1,] <- m0.dm$coefficients[1,] + c(-delta,rep(delta/4,4)) # Calculate the alphas of the multinomial-Dirichlet in the two cases alpha0 <- exp(X[phase2,] %*% m0.dm$coefficients) alpha1 <- exp(X[phase2,] %*% m1.dm$coefficients) - + # Use alpha vector as mu magnitude # (not possible to compute it from mu and size) dfun <- function(y, size, mu, log=FALSE) { dLog <- ddirm(t(y), t(mu)) - if (log) { return(dLog) } else {return(exp(dLog))} + 
if (log) { return(dLog) } else {return(exp(dLog))} } # Threshold h <- 2 -control <- list(range=seq(nrow(rotaBB))[phase2],h=h,pi0=t(alpha0), +control <- list(range=seq(nrow(rotaBB))[phase2],h=h,pi0=t(alpha0), pi1=t(alpha1), ret="value",dfun=dfun) surv.dm <- categoricalCUSUM(rotaBB,control=control) @@ -1134,11 +1134,11 @@ y.max <- max(observed(surv.dm[,1]),upperbound(surv.dm[,1]),observed(surv[,1]),upperbound(surv[,1]),na.rm=TRUE) plotOpts3 <- modifyList(plotOpts,list(x=surv[,1],ylim=c(0,y.max),ylab=expression(C[t]),xlab="")) plotOpts3$legend.opts <- list(x="topleft",bty="n",legend="R",lty=1,lwd=line.lwd,col=alarm.symbol$col,horiz=TRUE,cex=cex.leg) -do.call("plot",plotOpts3) +do.call("plot",plotOpts3) lines( c(0,1e99), rep(h,2),lwd=2,col="darkgray",lty=1) par(family="Times") mtext(side=1,text="Time (weeks)", - las=0,line=3, cex=cex.text) + las=0,line=3, cex=cex.text) ################################################### @@ -1148,11 +1148,11 @@ plotOpts3 <- modifyList(plotOpts,list(x=surv.dm[,1],ylim=c(0,y.max),ylab=expression(C[t]),xlab="")) plotOpts3$legend.opts <- list(x="topleft",bty="n",legend="R",lty=1,lwd=line.lwd,col=alarm.symbol$col,horiz=TRUE,cex=cex.text) y.max <- max(observed(surv.dm[,1]),upperbound(surv.dm[,1]),observed(surv[,1]),upperbound(surv[,1]),na.rm=TRUE) -do.call("plot",plotOpts3) +do.call("plot",plotOpts3) lines( c(0,1e99), rep(h,2),lwd=2,col="darkgray",lty=1) par(family="Times") mtext(side=1,text="Time (weeks)", - las=0,line=3, cex=cex.text) + las=0,line=3, cex=cex.text) ################################################### @@ -1163,7 +1163,7 @@ ## rangeAnalysis <- (today - 4):today ## in2013 <- which(isoWeekYear(epoch(salmNewport))$ISOYear == 2013) ## -## algoParameters <- list(range = rangeAnalysis, noPeriods = 10, +## algoParameters <- list(range = rangeAnalysis, noPeriods = 10, ## populationBool = FALSE, ## b = 4, w = 3, weightsThreshold = 2.58, ## pastWeeksNotIncluded = 26, pThresholdTrend = 1, @@ -1176,11 +1176,11 @@ ## ## start <- isoWeekYear(epoch(salmNewport)[range(range)[1]]) ## end <- isoWeekYear(epoch(salmNewport)[range(range)[2]]) -## caption <- paste("Results of the analysis of reported S. Newport +## caption <- paste("Results of the analysis of reported S. Newport ## counts in two German federal states for the weeks W-", ## start$ISOWeek, "-", start$ISOYear, " - W-", end$ISOWeek, ## "-", end$ISOYear, " performed on ", Sys.Date(), -## ". Bold upperbounds (UB) indicate weeks with alarms.", +## ". Bold upperbounds (UB) indicate weeks with alarms.", ## sep="") ## toLatex(results, caption = caption) @@ -1189,11 +1189,11 @@ ### code chunk number 78: testLabel ################################################### # In this example the sts-object already exists. -# Supply the code with the date of a Monday and look for the +# Supply the code with the date of a Monday and look for the # corresponding index in the sts-object today <- which(epoch(salmNewport)==as.Date("2013-12-23")) -# The analysis will be performed for the given week -# and the 4 previous ones +# The analysis will be performed for the given week +# and the 4 previous ones range <- (today-4):today in2013 <- which(isoWeekYear(epoch(salmNewport))$ISOYear==2013) # Control argument for using the improved method @@ -1208,11 +1208,11 @@ # Export the results as a tex table start <- isoWeekYear(epoch(salmNewport)[range(range)[1]]) end <- isoWeekYear(epoch(salmNewport)[range(range)[2]]) -caption <- paste("Results of the analysis of reported S. Newport +caption <- paste("Results of the analysis of reported S. 
Newport counts in two German federal states for the weeks W-", start$ISOWeek," ",start$ISOYear," - W-",end$ISOWeek, " ",end$ISOYear," performed on ",Sys.Date(), - ". Bold upperbounds (thresholds) indicate weeks with alarms.", + ". Bold upperbounds (thresholds) indicate weeks with alarms.", sep="") toLatex(results, table.placement="h", size = "normalsize", sanitize.text.function = identity, diff -Nru r-cran-surveillance-1.12.2/inst/doc/monitoringCounts.Rnw r-cran-surveillance-1.13.0/inst/doc/monitoringCounts.Rnw --- r-cran-surveillance-1.12.2/inst/doc/monitoringCounts.Rnw 2016-11-17 10:20:33.000000000 +0000 +++ r-cran-surveillance-1.13.0/inst/doc/monitoringCounts.Rnw 2016-12-20 15:49:38.000000000 +0000 @@ -10,7 +10,7 @@ \newcommand{\NB}{\operatorname{NB}} %% almost as usual -\author{Ma\"elle Salmon\\Robert Koch Institute \And +\author{Ma\"elle Salmon\\Robert Koch Institute \And Dirk Schumacher\\Robert Koch Institute \And Michael H\"ohle\\ Stockholm University,\\Robert Koch Institute } \title{ @@ -65,7 +65,7 @@ E-mail: \email{maelle.salmon@yahoo.se}, \email{mail@dirk-schumacher.net}\\ URL: \url{https://masalmon.github.io/}\\ \phantom{URL: }\url{http://www.dirk-schumacher.net/}\\ - + Michael H\"{o}hle\\ Department of Mathematics\\ Stockholm University\\ @@ -101,43 +101,43 @@ ## create directories for plots and cache dir.create("plots", showWarnings=FALSE) dir.create("monitoringCounts-cache", showWarnings=FALSE) -@ +@ \SweaveOpts{prefix.string=plots/monitoringCounts} \label{sec:1} -The package provides a central S4 data class \code{sts} to capture multivariate or univariate time series. All further methods use objects of this class as an input. +The package provides a central S4 data class \code{sts} to capture multivariate or univariate time series. All further methods use objects of this class as an input. Therefore we first describe how to use the \code{sts} class and then, as all monitoring methods of the package conform to the same syntax, a typical call of a function for aberration detection will be presented. Furthermore, the visualization of time series and of the results of their monitoring is depicted. \subsection{How to store time series and related information} -In \pkg{surveillance}, time series of counts and related information are encoded in a specific S4-class called \code{sts} (\textit{surveillance time series}) that represents +In \pkg{surveillance}, time series of counts and related information are encoded in a specific S4-class called \code{sts} (\textit{surveillance time series}) that represents possibly multivariate time series of counts. Denote the counts as $\left( y_{it} ; i = 1, \ldots,m, t = 1, \ldots, n \right)$, where $n$ is the length of the time series and $m$ is the number of entities, e.g., geographical regions, hospitals or age groups, being -monitored. An example which we shall look at in more details is a time series representing the weekly counts of cases of infection with \textit{Salmonella Newport} in all 16 federal states of Germany +monitored. An example which we shall look at in more details is a time series representing the weekly counts of cases of infection with \textit{Salmonella Newport} in all 16 federal states of Germany from 2004 to 2013 with $n=525$ weeks and $m=16$ geographical units. Infections with \textit{Salmonella Newport}, a subtype of \textit{Salmonella}, can trigger gastroenteritis, prompting the seek of medical care. 
Infections with \textit{Salmonella} are notifiable in Germany since 2001 with data being forwarded to the RKI by federal states health authorities on behalf of the local health authorities. - + \subsubsection[Slots of the class sts]{Slots of the class \texttt{sts}} The key slots of the \code{sts} class are those describing the observed counts and the corresponding time periods of the aggregation. The observed counts $\left(y_{it}\right)$ are stored in the $n \times m$ matrix \code{observed}. -A number of other slots characterize time. First, \code{epoch} denotes the corresponding time period of the aggregation. If the Boolean \code{epochAsDate} is \code{TRUE}, -\code{epoch} is the numeric representation of \code{Date} objects corresponding to each observation in \code{observed}. If the Boolean \code{epochAsDate} is \code{FALSE}, +A number of other slots characterize time. First, \code{epoch} denotes the corresponding time period of the aggregation. If the Boolean \code{epochAsDate} is \code{TRUE}, +\code{epoch} is the numeric representation of \code{Date} objects corresponding to each observation in \code{observed}. If the Boolean \code{epochAsDate} is \code{FALSE}, \code{epoch} is the time index $1 \leq t \leq n$ of each of these observations. Then, \code{freq} is the number of observations per year: 365 for -daily data, 52 for weekly data and 12 for monthly data. +daily data, 52 for weekly data and 12 for monthly data. Finally, \code{start} is a vector representing the origin of the time series with two values that are the year and the epoch within that year for the first observation of the time series -- \code{c(2014, 1)} for a weekly time series starting on the first week of 2014 for instance. Other slots enable the storage of additional information. Known aberrations are recorded in the Boolean slot \code{state} of the same dimensions as \code{observed} - with \code{TRUE} indicating an outbreak and \code{FALSE} indicating the absence of any known aberration. The monitored population in each of the units is stored in - slot \code{populationFrac}, which gives either proportions or numbers. The geography of the zone under surveillance is accessible through slot \code{map} + with \code{TRUE} indicating an outbreak and \code{FALSE} indicating the absence of any known aberration. The monitored population in each of the units is stored in + slot \code{populationFrac}, which gives either proportions or numbers. The geography of the zone under surveillance is accessible through slot \code{map} which is an object of class \code{SpatialPolygonsDataFrame}~\citep{sp1,sp2} providing a shape of the $m$ areas which are monitored and slot \code{neighbourhood}, which is a symmetric matrix of Booleans size $m^2$ stating the neighborhood -matrix. Slot \code{map} is pertinent when units are geographical units, whereas \code{neighbourhood} could be useful in any case, e.g., for storing a contact matrix between age groups for modeling purposes. +matrix. Slot \code{map} is pertinent when units are geographical units, whereas \code{neighbourhood} could be useful in any case, e.g., for storing a contact matrix between age groups for modeling purposes. Finally, if monitoring has been performed on the data the information on its control arguments and its results are stored in \code{control}, \code{upperbound} and \code{alarm} presented in Section~\ref{sec:howto}. 
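As a quick sketch of how the slots just described can be inspected (this assumes the salmNewport data set that the vignette loads below; observed(), epoch() and population() are the accessor functions used throughout its code):

library("surveillance")
data("salmNewport")            # weekly Salmonella Newport counts, 16 federal states
x <- salmNewport
dim(observed(x))               # n x m count matrix (y_it)
head(epoch(x))                 # time stamps; Date-based here since epochAsDate is TRUE
x@freq                         # number of observations per year (52 for weekly data)
x@start                        # year and epoch within that year of the first observation
dim(population(x))             # monitored population (proportions or numbers) per unit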
\subsubsection[Creation of an object of class sts]{Creation of an object of class \texttt{sts}} -The creation of a \code{sts} object is straightforward, requiring a call to the function \code{new} together with the slots to be assigned as arguments. The input of data from external files is one possibility for getting the counts +The creation of a \code{sts} object is straightforward, requiring a call to the function \code{new} together with the slots to be assigned as arguments. The input of data from external files is one possibility for getting the counts as it is described in \citet{hoehle-mazick-2010}. To exemplify the process we shall use weekly counts of \textit{Salmonella Newport} in Germany loaded using \code{data("salmNewport")}. Alternatively, one can use coercion methods to convert between the \texttt{ts} class and the \texttt{sts} class. Note that this only converts the content of the slot \texttt{observed}, that is, @@ -172,9 +172,9 @@ cex.leg <- cex.text line.lwd <- 2#1 stsPlotCol <- c("mediumblue","mediumblue","red2") -alarm.symbol <- list(pch=17, col="red2", cex=2,lwd=3) +alarm.symbol <- list(pch=17, col="red2", cex=2,lwd=3) #Define list with arguments to use with do.call("legend", legOpts) -legOpts <- list(x="topleft",legend=c(expression(U[t])),bty="n",lty=1,lwd=line.lwd,col=alarm.symbol$col,horiz=TRUE,cex=cex.leg) +legOpts <- list(x="topleft",legend=c(expression(U[t])),bty="n",lty=1,lwd=line.lwd,col=alarm.symbol$col,horiz=TRUE,cex=cex.leg) #How should the par of each plot look? par.list <- list(mar=c(6,5,5,5),family="Times") #Do this once @@ -186,17 +186,17 @@ ylab="No. of reports", xlab="Time (weeks)",lty=c(1,1,1), legend.opts=legOpts,alarm.symbol=alarm.symbol, xaxis.tickFreq=list("%V"=atChange,"%m"=atChange,"%G"=atChange), - xaxis.labelFreq=list("%Y"=atMedian), + xaxis.labelFreq=list("%Y"=atMedian), xaxis.labelFormat="%Y", par.list=par.list,hookFunc=hookFunc) - + @ <>= -# Load data +# Load data data("salmNewport") @ -<>= -# Plot +<>= +# Plot y.max <- max(aggregate(salmNewport,by="unit")@observed,na.rm=TRUE) plotOpts2 <- modifyList(plotOpts,list(x=salmNewport,legend.opts=NULL,ylim=c(0,y.max),type = observed ~ time),keep.null=TRUE) plotOpts2$par.list <- list(mar=c(6,5,0,5),family="Times") @@ -205,7 +205,7 @@ @ \setkeys{Gin}{height=7cm, width=15cm} \begin{figure} -\begin{center} +\begin{center} <>= <> @ @@ -227,30 +227,30 @@ @ which is shown in Figure~\ref{fig:Newport}. Here, the \code{atChange} and \code{atMedian} functions are small helper functions and the respective tick lengths are controlled by the \pkg{surveillance} specific option \code{surveillance.options("stsTickFactors")}. Actually \code{sts} objects can be plotted using different options: \code{type = observed ~ time} produces the time series for whole Germany as shown in Figure~\ref{fig:Newport}, whereas \code{type = observed ~ time | unit} -is a panelled graph with each panel representing the time series of counts of a federal state as seen in Figure~\ref{fig:unit}. +is a panelled graph with each panel representing the time series of counts of a federal state as seen in Figure~\ref{fig:unit}. 
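A minimal self-contained sketch of this creation step and of the two plot types mentioned above (the counts are toy values drawn here purely for illustration; they are not part of the package):

library("surveillance")
set.seed(1)
# hypothetical weekly counts for two monitored units over three years
counts <- matrix(rpois(3 * 52 * 2, lambda = 5), ncol = 2,
                 dimnames = list(NULL, c("unitA", "unitB")))
toy <- new("sts", freq = 52, start = c(2014, 1), observed = counts)
plot(toy, type = observed ~ time)         # counts summed over all units
plot(toy, type = observed ~ time | unit)  # one panel per monitored unit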
\setkeys{Gin}{height=7cm, width=9cm} \begin{figure} %\begin{center} %\hspace*{\fill}% \hspace{-1em} \subfloat[]{ -<>= +<>= y.max <- max(observed(salmNewport[,2]),observed(salmNewport[,3]),na.rm=TRUE) plotOpts2 <- modifyList(plotOpts,list(x=salmNewport[,2],legend.opts=NULL,ylim=c(0,y.max)),keep.null=TRUE) plotOpts2$xaxis.tickFreq <- list("%G"=atChange) do.call("plot",plotOpts2) @ - + \includegraphics[width=9cm]{plots/monitoringCounts-unitPlot1.pdf} }\hspace{-3em}% \subfloat[]{ -<>= +<>= # Plot with special function plotOpts2 <- modifyList(plotOpts,list(x=salmNewport[,3],legend.opts=NULL,ylim=c(0,y.max)),keep.null=TRUE) plotOpts2$xaxis.tickFreq <- list("%G"=atChange) do.call("plot",plotOpts2) -@ +@ \includegraphics[width=9cm]{plots/monitoringCounts-unitPlot2.pdf} } %\hspace*{\fill}% @@ -259,8 +259,8 @@ %\end{center} \end{figure} -Once created one can use typical subset operations on a \code{sts} object: for instance \code{salmNewport[} \code{1:10, "Berlin"]} is a new \code{sts} object -with weekly counts for Berlin during the 10 first weeks of the initial dataset; \code{salmNewport[isoWeekYear(epoch(salmNewport))\$ISOYear<=2010,]} uses the \code{surveillance}'s \code{isoWeekYear()} +Once created one can use typical subset operations on a \code{sts} object: for instance \code{salmNewport[} \code{1:10, "Berlin"]} is a new \code{sts} object +with weekly counts for Berlin during the 10 first weeks of the initial dataset; \code{salmNewport[isoWeekYear(epoch(salmNewport))\$ISOYear<=2010,]} uses the \code{surveillance}'s \code{isoWeekYear()} function to get a \code{sts} object with weekly counts for all federal states up to 2010. Moreover, one can take advantage of the \proglang{R} function \code{aggregate()}. For instance, \code{aggregate(salmNewport,by="unit")} returns a \code{sts} object representing weekly counts of \textit{Salmonella Newport} in Germany as a whole, whereas \code{aggregate(salmNewport, by = "time")} corresponds to the total count of cases in each federal state over the whole period. @@ -272,7 +272,7 @@ \subsubsection{Statistical framework for aberration detection} We introduce the framework for aberration detection on an univariate time series of counts $\left\{y_t,\> t=1,2,\ldots\right\}$. Surveillance aims -at detecting an \textit{aberration}, that is to say, an important change in the process occurring at an unknown time $\tau$. +at detecting an \textit{aberration}, that is to say, an important change in the process occurring at an unknown time $\tau$. This change can be a step increase of the counts of cases or a more gradual change~\citep{Sonesson2003}. Based on the possibility of such a change, for each time $t$ we want to differentiate between the two states \textit{in-control} and \textit{out-of-control}. At any timepoint $t_0\geq 1$, the available information -- i.e., past counts -- is defined as $\bm{y}_{t_0} = \left\{ @@ -281,9 +281,9 @@ Functions for aberration detection thus use past data to estimate $r(\bm{y}_{t_0})$, and compare it to the threshold $g$, above which the current count can be considered as suspicious and thus doomed as \textit{out-of-control}. 
-Threshold values and alarm Booleans for each timepoint of the monitored range are -saved in the slots \code{upperbound} and \code{alarm}, of the same dimensions as \code{observed}, while the method parameters used for computing the threshold values -and alarm Booleans +Threshold values and alarm Booleans for each timepoint of the monitored range are +saved in the slots \code{upperbound} and \code{alarm}, of the same dimensions as \code{observed}, while the method parameters used for computing the threshold values +and alarm Booleans are stored in the slot \code{control}. \subsubsection{Aberration detection in the package} @@ -296,27 +296,27 @@ Here we shall expand on C1 for which the baseline are the 7 timepoints before the assessed timepoint $t_0$, that is to say $\left(y_{t_0-7},\ldots,y_{t_0-1}\right)$. The expected value is the mean of the baseline. The method is based on a statistic called $C_{t_0}$ defined as -$C_{t_0}= \frac{(y_{t_0}-\bar{y}_{t_0})}{s_{t_0}}$, where $$\bar{y}_{t_0}= \frac{1}{7} \cdot\sum_{i=t_0-7}^{t_0-1} y_i \textnormal{ and } s_{t_0}^2= \frac{1}{7-1} \cdot\sum_{i=t_0-7}^{t_0-1} \left(y_i - \bar{y}_{t_0}\right)^2.$$ +$C_{t_0}= \frac{(y_{t_0}-\bar{y}_{t_0})}{s_{t_0}}$, where $$\bar{y}_{t_0}= \frac{1}{7} \cdot\sum_{i=t_0-7}^{t_0-1} y_i \textnormal{ and } s_{t_0}^2= \frac{1}{7-1} \cdot\sum_{i=t_0-7}^{t_0-1} \left(y_i - \bar{y}_{t_0}\right)^2.$$ Under the null hypothesis of no outbreak, it is assumed that $C_{t_0} \stackrel{H_0}{\sim} {N}(0,1)$. The upperbound $U_{t_0}$ is found by assuming that $y_t$ is normal, estimating parameters by plug-in and then taking the $(1-\alpha)$-th quantile of this distribution, i.e. $U_{t_0}= \bar{y}_{t_0} + z_{1-\alpha}s_{t_0}$, where $z_{1-\alpha}$ is the $(1-\alpha)$-quantile of the standard normal distribution. An alarm is raised if $y_{t_0} > U_{t_0}$. - The output of the algorithm is a \code{sts} object that contains subsets of slots \code{observed}, \code{population} and \code{state} defined by the range of timepoints specified in the input -- + The output of the algorithm is a \code{sts} object that contains subsets of slots \code{observed}, \code{population} and \code{state} defined by the range of timepoints specified in the input -- \textit{e.g} the last 20 timepoints of the time series, -and with the slots \code{upperbound} and \code{alarm} filled by the output of the algorithm. -Information relative to the \code{range} of data to be monitored -and to the parameters of the algorithm, such as \code{alpha} for \code{earsC}, has to be formulated in the slot \code{control}. -This information is also stored in the slot \code{control} of the returned \code{sts} object for later inspection. +and with the slots \code{upperbound} and \code{alarm} filled by the output of the algorithm. +Information relative to the \code{range} of data to be monitored +and to the parameters of the algorithm, such as \code{alpha} for \code{earsC}, has to be formulated in the slot \code{control}. +This information is also stored in the slot \code{control} of the returned \code{sts} object for later inspection. 
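To make the C1 formulas above concrete before the earsC call that follows, here is a small worked computation with invented numbers (seven baseline counts and one current count; qnorm() supplies the standard normal quantile z_{1-alpha}):

baseline <- c(12, 9, 15, 11, 10, 14, 13)  # y_{t0-7}, ..., y_{t0-1}
y.t0 <- 24                                # currently monitored count
ybar <- mean(baseline)                    # \bar{y}_{t0} = 12
s    <- sd(baseline)                      # s_{t0}, uses the 1/(7-1) divisor
C.t0 <- (y.t0 - ybar) / s                 # C1 statistic, ~ N(0,1) under H0
U.t0 <- ybar + qnorm(1 - 0.05) * s        # upperbound for alpha = 0.05
y.t0 > U.t0                               # TRUE here, i.e., an alarm is raised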
<>= in2011 <- which(isoWeekYear(epoch(salmNewport))$ISOYear == 2011) salmNewportGermany <- aggregate(salmNewport, by = "unit") control <- list(range = in2011, method = "C1", alpha = 0.05) -surv <- earsC(salmNewportGermany, control = control) +surv <- earsC(salmNewportGermany, control = control) plot(surv) @ -<>= +<>= # Range for the monitoring in2011 <- which(isoWeekYear(epoch(salmNewport))$ISOYear==2011) # Aggregate counts over Germany @@ -325,7 +325,7 @@ control <- list(range = in2011, method="C1", alpha=0.05) # Apply earsC function surv <- earsC(salmNewportGermany, control=control) -# Plot the results +# Plot the results #plot(surv) # Plot y.max <- max(observed(surv),upperbound(surv),na.rm=TRUE) @@ -333,8 +333,8 @@ @ \setkeys{Gin}{height=7cm, width=15cm} \begin{figure} -\begin{center} - +\begin{center} + <>= <> @ @@ -347,7 +347,7 @@ \end{figure} The \code{sts} object is easily visualized -using the function \code{plot} as depicted in Figure~\ref{fig:NewportEARS}, which shows the upperbound as a solid line and the alarms -- timepoints +using the function \code{plot} as depicted in Figure~\ref{fig:NewportEARS}, which shows the upperbound as a solid line and the alarms -- timepoints where the upperbound has been exceeded -- as triangles. The four last alarms correspond to a known outbreak in 2011 due to sprouts~\citep{Newport2011}. One sees that the upperbound right after the outbreak is affected by the outbreak: it is very high, so that a smaller outbreak would not be detected. @@ -357,8 +357,8 @@ account when estimating the expected count and the associated threshold. For instance, ignoring an increasing time trend could decrease sensitivity. Inversely, overlooking an annual surge in counts during the summer could decrease specificity. Therefore, it is advisable to use detection methods whose underlying models incorporate essential characteristics of time series of disease count data -such as overdispersion, seasonality, time trend and presence of past outbreaks in the records~\citep{Unkel2012,Shmueli2010}. Moreover, the EARS methods do not compute a proper prediction interval for the -current count. Sounder statistical methods will be reviewed in the next section. +such as overdispersion, seasonality, time trend and presence of past outbreaks in the records~\citep{Unkel2012,Shmueli2010}. Moreover, the EARS methods do not compute a proper prediction interval for the +current count. Sounder statistical methods will be reviewed in the next section. \section[Using surveillance in selected contexts]{Using \pkg{surveillance} in selected contexts} @@ -367,7 +367,7 @@ More than a dozen algorithms for aberration detection are implemented in the package. Among those, this section presents a set of representative algorithms, which are already in routine application at several public health institutions or which we think have the potential to become so. -First we describe the Farrington method introduced by~\citet{farrington96} together with the improvements proposed by~\citet{Noufaily2012}. +First we describe the Farrington method introduced by~\citet{farrington96} together with the improvements proposed by~\citet{Noufaily2012}. As a Bayesian counterpart to these methods we present the BODA method published by~\citet{Manitz2013} which allows the easy integration of covariates. All these methods perform one-timepoint detection in that they detect aberrations only when the count at the currently monitored timepoint is above the threshold. Hence, no accumulation of evidence takes place. 
As an extension, we introduce an implementation of the negative binomial cumulative sum (CUSUM) of~\citet{hoehle.paul2008} that allows the detection @@ -375,10 +375,10 @@ \subsection{One size fits them all for count data} Two implementations of the Farrington method, which is currently \textit{the} method of choice at European public health institutes \citep{hulth_etal2010}, exist in the package. First, the original method as described in \citet{farrington96} is implemented as the function \code{farrington}. Its use was already described in \citet{hoehle-mazick-2010}. -Now, the newly implemented function \code{farringtonFlexible} supports the use of this \textit{original method} as well as of the \textit{improved method} built on suggestions made by~\citet{Noufaily2012} for improving the specificity without reducing the sensitivity. +Now, the newly implemented function \code{farringtonFlexible} supports the use of this \textit{original method} as well as of the \textit{improved method} built on suggestions made by~\citet{Noufaily2012} for improving the specificity without reducing the sensitivity. -In the function \code{farringtonFlexible} one can choose to use the original method or the improved method by specification of appropriate \code{control} arguments. -Which variant of the algorithm is to be used is determined by the contents of the \code{control} slot. +In the function \code{farringtonFlexible} one can choose to use the original method or the improved method by specification of appropriate \code{control} arguments. +Which variant of the algorithm is to be used is determined by the contents of the \code{control} slot. In the example below, \code{control1} corresponds to the use of the original method and \code{control2} indicates the options for the improved method. <>= # Control slot for the original method @@ -386,7 +386,7 @@ b=4,w=3,weightsThreshold=1,pastWeeksNotIncluded=3, pThresholdTrend=0.05,thresholdMethod="delta",alpha=0.05, limit54=c(0,50)) -# Control slot for the improved method +# Control slot for the improved method control2 <- list(range=in2011,noPeriods=10, b=4,w=3,weightsThreshold=2.58,pastWeeksNotIncluded=26, pThresholdTrend=1,thresholdMethod="nbPlugin",alpha=0.05, @@ -404,22 +404,22 @@ @ In both cases the steps of the algorithm are the same. In a first step, an overdispersed Poisson generalized linear model with log link is fitted to the reference data $\bm{y}_{t_0} \subseteq \left\{ -y_t\>;\> t\leq t_0\right\}$, where $\E(y_t)=\mu_t$ with $\log \mu_t = \alpha + \beta t$ and $\Var(y_t)=\phi\cdot\mu_t$ and where $\phi\geq1$ is ensured. +y_t\>;\> t\leq t_0\right\}$, where $\E(y_t)=\mu_t$ with $\log \mu_t = \alpha + \beta t$ and $\Var(y_t)=\phi\cdot\mu_t$ and where $\phi\geq1$ is ensured. The original method took seasonality into account by using a subset of the available data as reference data for fitting the GLM: \code{w} timepoints centred around the timepoint located $1,2,\ldots,b$ years before $t_0$, amounting to a total $b \cdot (2w+1)$ reference values. However, it was shown in~\citet{Noufaily2012} that the algorithm performs better when using more historical data. In order to do do so without disregarding seasonality, the authors introduced a zero order spline with 11 knots, which can be conveniently represented as a 10-level factor. We have extended this idea in our implementation so that one can choose an arbitrary number of periods -in each year. 
Thus, $\log \mu_t = \alpha + \beta t +\gamma_{c(t)}$ where $\gamma_{c(t)}$ are the coefficients of a zero order spline with $\mathtt{noPeriods}+1$ knots, which can be conveniently represented as a $\mathtt{noPeriods}$-level factor that reflects seasonality. Here, $c(t)$ is a function indicating in which season or period of the year $t$ belongs to. -The algorithm uses \code{w}, \code{b} and \texttt{noPeriods} to deduce the length of periods so they have -the same length up to rounding. An exception is the reference window centred around $t_0$. Figure~\ref{fig:fPlot} shows a minimal example, where each character corresponds to a different period. Note that setting $\mathtt{noPeriods} = 1$ corresponds to using the original method with only a subset of the data: +in each year. Thus, $\log \mu_t = \alpha + \beta t +\gamma_{c(t)}$ where $\gamma_{c(t)}$ are the coefficients of a zero order spline with $\mathtt{noPeriods}+1$ knots, which can be conveniently represented as a $\mathtt{noPeriods}$-level factor that reflects seasonality. Here, $c(t)$ is a function indicating in which season or period of the year $t$ belongs to. +The algorithm uses \code{w}, \code{b} and \texttt{noPeriods} to deduce the length of periods so they have +the same length up to rounding. An exception is the reference window centred around $t_0$. Figure~\ref{fig:fPlot} shows a minimal example, where each character corresponds to a different period. Note that setting $\mathtt{noPeriods} = 1$ corresponds to using the original method with only a subset of the data: there is only one period defined per year, the reference window around $t_0$ and other timepoints are not included in the model. \setkeys{Gin}{height=3cm, width=7cm} \begin{figure} \subfloat[$\texttt{noPeriods}=2$]{ -<>= +<>= library(ggplot2) library(grid) # for rectanges @@ -456,20 +456,20 @@ annotate("text", label = "Time", x = 170, y = 0, size = 8, colour = "black", family="serif") + # ticks labels -annotate('text',label=c("t[0]-2 %.% freq","t[0]-freq","t[0]"),x = xTicks, - y = yTicksEnd - 10, size = 8,family="serif",parse=T) +annotate('text',label=c("t[0]-2 %.% freq","t[0]-freq","t[0]"),x = xTicks, + y = yTicksEnd - 10, size = 8,family="serif",parse=T) p+ # periods labels -annotate('text',label=c("A","A","A","B","B"),x = xPeriods, - y = rep(6,5), size = 8,family="serif",parse=T) +annotate('text',label=c("A","A","A","B","B"),x = xPeriods, + y = rep(6,5), size = 8,family="serif",parse=T) @ \includegraphics[width=0.45\textwidth]{plots/monitoringCounts-fPlot1.pdf} } \qquad \subfloat[$\texttt{noPeriods}=3$]{ -<>= +<>= yTicksBigEnd2 <- rep(0,4) yTicksBigStart2 <- rep(heightTick,4) newX <- c(xTicks[1:2]+widthRectangles/2+52-widthRectangles,xTicks[1:2]+52/2) @@ -477,63 +477,63 @@ p + geom_segment(aes(x = newX, y = yTicksBigStart2, xend = newX, yend = yTicksBigEnd2), size=1)+ # periods labels -annotate('text',label=c("A","A","A","B","B","C","C"),x = xPeriods, - y = rep(6,7), size = 8,family="serif",parse=T) +annotate('text',label=c("A","A","A","B","B","C","C"),x = xPeriods, + y = rep(6,7), size = 8,family="serif",parse=T) @ \includegraphics[width=0.45\textwidth]{plots/monitoringCounts-fPlot2.pdf} } -\caption{Construction of the noPeriods-level factor to account for seasonality, depending on the value of the half-window size $w$ and of the freq of the -data. Here the number of years to go back in the past $b$ is 2. -Each level of the factor variable corresponds to a period delimited by ticks and is denoted by a character. 
The windows around $t_0$ are respectively of size $2w+1$,~$2w+1$ -and $w+1$. The segments between them are divided into the other periods so that they +\caption{Construction of the noPeriods-level factor to account for seasonality, depending on the value of the half-window size $w$ and of the freq of the +data. Here the number of years to go back in the past $b$ is 2. +Each level of the factor variable corresponds to a period delimited by ticks and is denoted by a character. The windows around $t_0$ are respectively of size $2w+1$,~$2w+1$ +and $w+1$. The segments between them are divided into the other periods so that they have the same length up to rounding.} \label{fig:fPlot} \end{figure} -Moreover, it was shown in \citet{Noufaily2012} that it is better to exclude the last 26 weeks before $t_0$ from the baseline -in order to avoid reducing sensitivity when an outbreak has started recently before $t_0$. In the \code{farringtonFlexible} function, one controls this by specifying \code{pastWeeksNotIncluded}, which is +Moreover, it was shown in \citet{Noufaily2012} that it is better to exclude the last 26 weeks before $t_0$ from the baseline +in order to avoid reducing sensitivity when an outbreak has started recently before $t_0$. In the \code{farringtonFlexible} function, one controls this by specifying \code{pastWeeksNotIncluded}, which is the number of last timepoints before $t_0$ that are not to be used. The default value is 26. Lastly, in the new implementation a population offset can be included in the GLM by setting \code{populationBool} to \code{TRUE} and supplying the possibly time-varying population size in the \code{population} slot of the \code{sts} object, but this will not be discussed further here. In a second step, the expected number of counts $\mu_{t_0}$ is predicted for the current timepoint $t_0$ using this GLM. An upperbound $U_{t_0}$ is calculated based on this predicted value and its variance. The two versions of the algorithm make different assumptions for this calculation. -The +The original method assumes that a transformation of the prediction error $g\left(y_{t_0}-\hat{\mu}_{t_0}\right)$ is normally distributed, for instance when using the identity transformation $g(x)=x$ one obtains $$y_{t_0} - \hat{\mu}_0 \sim \mathcal{N}(0,\Var(y_{t_0}-\hat{\mu}_0))\cdot$$ -The upperbound of the prediction interval is then calculated +The upperbound of the prediction interval is then calculated based on this distribution. First we have that $$ \Var(y_{t_0}-\hat{\mu}_{t_0}) = \Var(\hat{y}_{t_0}) + \Var(\hat{\mu}_{t_0})=\phi\mu_0+\Var(\hat{\mu}_{t_0}) $$ with $\Var(\hat{y}_{t_0})$ being the variance of an observation and $\Var(\hat{\mu}_{t_0})$ being the variance of the estimate. The threshold, defined as the upperbound of a one-sided $(1-\alpha)\cdot 100\%$ prediction interval, is then $$U_{t_0} = \hat{\mu}_0 + z_{1-\alpha}\widehat{\Var}(y_{t_0}-\hat{\mu}_{t_0})\cdot$$ This method can be used by setting the control option \code{thresholdMethod} equal to "\code{delta}". - However, a weakness -of this procedure is the normality assumption itself, so that an alternative was presented in \citet{Noufaily2012} and implemented as \code{thresholdMethod="Noufaily"}. -The central assumption of this approach is that + However, a weakness +of this procedure is the normality assumption itself, so that an alternative was presented in \citet{Noufaily2012} and implemented as \code{thresholdMethod="Noufaily"}. 
+The central assumption of this approach is that $y_{t_0} \sim \NB\left(\mu_{t_0},\nu\right)$, -with $\mu_{t_0}$ the mean of the distribution and $\nu=\frac{\mu_{t_0}}{\phi-1}$ its overdispersion parameter. In this parameterization, we still have $\E(y_t)=\mu_t$ and $\Var(y_t)=\phi\cdot\mu_t$ with $\phi>1$ -- otherwise a Poisson distribution is assumed for the -observed count. +with $\mu_{t_0}$ the mean of the distribution and $\nu=\frac{\mu_{t_0}}{\phi-1}$ its overdispersion parameter. In this parameterization, we still have $\E(y_t)=\mu_t$ and $\Var(y_t)=\phi\cdot\mu_t$ with $\phi>1$ -- otherwise a Poisson distribution is assumed for the +observed count. The threshold is defined as a quantile of the negative binomial distribution with plug-in estimates $\hat{\mu}_{t_0}$ and $\hat{\phi}$. Note that this disregards the estimation uncertainty in $\hat{\mu}_{t_0}$ and $\hat{\phi}$. As a consequence, the method "\code{muan}" (\textit{mu} for $\mu$ and \textit{an} for asymptotic normal) tries to solve the problem by using the asymptotic normal distribution of $(\hat{\alpha},\hat{\beta})$ to derive the upper $(1-\alpha)\cdot 100\%$ quantile of the asymptotic normal distribution of $\hat{\mu}_{t_0}=\hat{\alpha}+\hat{\beta}t_0$. Note that this does not reflect all estimation uncertainty because it disregards the estimation uncertainty of $\hat{\phi}$. -Note also that for time series where the variance of the estimator is large, the upperbound also ends up being very large. +Note also that for time series where the variance of the estimator is large, the upperbound also ends up being very large. Thus, the method "\code{nbPlugin}" seems to provide information that is easier to interpret by epidemiologists but with "\code{muan}" being more statistically correct. In a last step, the observed count $y_{t_0}$ is compared to the upperbound $U_{t_0}$ and an alarm is raised if $y_{t_0} > U_{t_0}$. -In both cases the fitting of the GLM involves three important steps. First, the algorithm performs an optional power-transformation for skewness correction and variance stabilisation, +In both cases the fitting of the GLM involves three important steps. First, the algorithm performs an optional power-transformation for skewness correction and variance stabilisation, depending on the value of the parameter \code{powertrans} in the \code{control} slot. Then, the significance of the time trend is checked. The time trend is included only when significant at a chosen level \code{pThresholdTrend}, when there are more than three years reference data and if no overextrapolation occurs because of the time trend. Lastly, past outbreaks are reweighted based on their Anscombe residuals. In \code{farringtonFlexible} the limit for reweighting past counts, \code{weightsThreshold}, can be specified by the user. If the Anscombe residual of a count is higher than \code{weightsThreshold} it is reweighted accordingly in a second fitting of the GLM. \citet{farrington96} used a value of $1$ -whereas \citet{Noufaily2012} advise a value of $2.56$ so that the reweighting procedure is less drastic, because it also shrinks the variance of the observations. +whereas \citet{Noufaily2012} advise a value of $2.56$ so that the reweighting procedure is less drastic, because it also shrinks the variance of the observations. The original method is widely used in public health surveillance~\citep{hulth_etal2010}. The reason for its success is primarily - that it does not need to be fine-tuned for each specific pathogen. 
It is hence easy to implement it for scanning data for many different pathogens. Furthermore, it does tackle classical issues of surveillance data: overdispersion, presence of past outbreaks that are reweighted, + that it does not need to be fine-tuned for each specific pathogen. It is hence easy to implement it for scanning data for many different pathogens. Furthermore, it does tackle classical issues of surveillance data: overdispersion, presence of past outbreaks that are reweighted, seasonality that is taken into account differently in the two methods. An example of use of the function is shown in Figure~\ref{fig:newportFar} with the code below. <>= -salm.farrington <- farringtonFlexible(salmNewportGermany, control1) +salm.farrington <- farringtonFlexible(salmNewportGermany, control1) salm.noufaily <- farringtonFlexible(salmNewportGermany, control2) @ @@ -544,23 +544,23 @@ \hspace{-1em} %\begin{center} \subfloat[]{ -<>= +<>= # Plot y.max <- max(observed(salm.farrington),upperbound(salm.farrington),observed(salm.noufaily),upperbound(salm.noufaily),na.rm=TRUE) do.call("plot",modifyList(plotOpts,list(x=salm.farrington,ylim=c(0,y.max)))) @ - + \includegraphics[width=9cm]{plots/monitoringCounts-farPlot1.pdf} } \hspace{-3em} \subfloat[]{ -<>= +<>= # Plot do.call("plot",modifyList(plotOpts,list(x=salm.noufaily,ylim=c(0,y.max)))) -@ +@ \includegraphics[width=9cm]{plots/monitoringCounts-farPlot2.pdf} } -\caption{S. Newport in Germany in 2011 monitored by (a) the original method and (b) the improved method. +\caption{S. Newport in Germany in 2011 monitored by (a) the original method and (b) the improved method. For the figure we turned off the option that the threshold is only computed if there were more than 5 cases during the 4 last timepoints including $t_0$. One gets less alarms with the most recent method and still does not miss the outbreak in the summer. Simulations on more time series support the use of the improved method instead of the original method.} @@ -571,48 +571,48 @@ \subsubsection{Similar methods in the package} The package also contains further methods based on a subset of the historical data: \code{bayes}, \code{rki} and \code{cdc}. -See Table~\ref{table:ref} for the corresponding references. Here, \code{bayes} uses a simple conjugate prior-posterior approach and computes the parameters of a +See Table~\ref{table:ref} for the corresponding references. Here, \code{bayes} uses a simple conjugate prior-posterior approach and computes the parameters of a negative binomial distribution based on past values. The procedure \code{rki} makes either the assumption of a normal or a Poisson distribution based on the mean of past counts. Finally, \code{cdc} aggregates weekly data into 4-week-counts and computes a normal distribution based upper confidence interval. None of these methods offer the inclusion of a linear trend, down-weighting of past outbreaks or power transformation of the data. Although these methods are good to have at hand, we personally recommend the use of the improved method implemented in the function \code{farringtonFlexible} because it is rather fast and makes use of more historical data than the other methods. \subsection{A Bayesian refinement} -The \code{farringtonFlexible} function described previously +The \code{farringtonFlexible} function described previously was a first indication that the \textit{monitoring} of surveillance time series requires a good \textit{modeling} of the time series before assessing aberrations. 
Generalized linear models (GLMs) and generalized additive models (GAMs) are well-established and powerful modeling frameworks for handling the -count data nature and trends of time series in a regression context. +count data nature and trends of time series in a regression context. The \code{boda} procedure~\citep{Manitz2013} continues this line of thinking by extending the simple GLMs used in the \code{farrington} and \code{farringtonFlexible} procedures to a fully fledged Bayesian GAM allowing -for penalized splines, e.g., to describe trends and seasonality, while +for penalized splines, e.g., to describe trends and seasonality, while simultaneously adjusting for previous outbreaks or concurrent processes influencing the case counts. A particular advantage of the Bayesian approach is that it constitutes a seamless -framework for performing both estimation and subsequent prediction: the +framework for performing both estimation and subsequent prediction: the uncertainty in parameter estimation is directly carried forward to the predictive posterior distribution. No asymptotic normal approximations nor plug-in inference -is needed. For fast approximate Bayesian inference we use the \pkg{INLA} \proglang{R} -package~\citep{INLA} to fit the Bayesian GAM. +is needed. For fast approximate Bayesian inference we use the \pkg{INLA} \proglang{R} +package~\citep{INLA} to fit the Bayesian GAM. -Still, monitoring with +Still, monitoring with \code{boda} is substantially slower than using the Farrington procedures. Furthermore, detailed regression modeling is only meaningful if the time series is known to be subject to external influences on which information is available. -Hence, the typical use at a public health institution would be the +Hence, the typical use at a public health institution would be the detailed analysis of a few selected time series, e.g., critical ones or those -with known trend character. +with known trend character. As an example, \citet{Manitz2013} studied the influence of absolute -humidity on the occurence of weekly reported campylobacter cases in Germany. +humidity on the occurence of weekly reported campylobacter cases in Germany. <>= # Load data and create \code{sts}-object data("campyDE") -cam.sts <- new("sts",epoch=as.numeric(campyDE$date), - observed=campyDE$case, state=campyDE$state, +cam.sts <- new("sts",epoch=as.numeric(campyDE$date), + observed=campyDE$case, state=campyDE$state, epochAsDate=TRUE) par(las=1) # Plot @@ -622,21 +622,21 @@ do.call("plot",plotOpts3) par(las=0) #mtext(side=2,text="No. of reports", - # las=0,line=3, cex=cex.text,family="Times") + # las=0,line=3, cex=cex.text,family="Times") par(family="Times") -text(-20, 2600, "No. of\n reports", pos = 3, xpd = T,cex=cex.text) +text(-20, 2600, "No. 
of\n reports", pos = 3, xpd = T,cex=cex.text) text(510, 2900, "Absolute humidity", pos = 3, xpd = T,cex=cex.text) text(510, 2550, expression(paste("[",g/m^3,"]", sep='')), pos = 3, xpd = T,cex=cex.text) -lines(campyDE$hum*50, col="white", lwd=2) +lines(campyDE$hum*50, col="white", lwd=2) axis(side=4, at=seq(0,2500,by=500),labels=seq(0,50,by=10),las=1,cex.lab=cex.text, cex=cex.text,cex.axis=cex.text,pos=length(epoch(cam.sts))+20) #mtext(side=4,text=expression(paste("Absolute humidity [ ",g/m^3,"]", sep='')), - # las=0,line=1, cex=cex.text,family="Times") + # las=0,line=1, cex=cex.text,family="Times") @ \setkeys{Gin}{height=7cm, width=15cm} \begin{figure} -\begin{center} +\begin{center} <>= <> @@ -652,27 +652,27 @@ data("campyDE") cam.sts <- new("sts", epoch = as.numeric(campyDE$date), observed = campyDE$case, state = campyDE$state, - epochAsDate = TRUE) + epochAsDate = TRUE) plot(cam.sts, legend = NULL, xlab = "time [weeks]", ylab = "No. reported", col = "gray", cex = 2, cex.axis = 2, cex.lab = 2) lines(campyDE$hum * 50, col = "darkblue", lwd = 2) @ The corresponding plot of the weekly time series is shown in -Figure~\ref{fig:campyDE}. We observe a strong association between humidity -and case numbers - an +Figure~\ref{fig:campyDE}. We observe a strong association between humidity +and case numbers - an association which is stronger than with, e.g., temperature or relative humidity. As noted in \citet{Manitz2013} the excess in cases in 2007 is thus partly explained by the high atmospheric humidity. - Furthermore, an increase in case numbers during the 2011 STEC O104:H4 outbreak is observed, which is explained by increased awareness and testing of many gastroenteritits pathogens during that period. The hypothesis is thus that there is no actual increased disease -activity~\citep{bernard_etal2014}. -Unfortunately, the German reporting system only records positive test results without keeping track of the -number of actual tests performed -- otherwise this would have been a natural adjustment variable. Altogether, the series contains several artefacts which + Furthermore, an increase in case numbers during the 2011 STEC O104:H4 outbreak is observed, which is explained by increased awareness and testing of many gastroenteritits pathogens during that period. The hypothesis is thus that there is no actual increased disease +activity~\citep{bernard_etal2014}. +Unfortunately, the German reporting system only records positive test results without keeping track of the +number of actual tests performed -- otherwise this would have been a natural adjustment variable. Altogether, the series contains several artefacts which appear prudent to address when monitoring the campylobacteriosis series. -The GAM in \code{boda} is based on the negative binomial distribution with time-varying expectation and time constant overdispersion parameter, i.e., +The GAM in \code{boda} is based on the negative binomial distribution with time-varying expectation and time constant overdispersion parameter, i.e., \begin{align*} -y_t &\sim \operatorname{NB}(\mu_t,\nu) -\end{align*} +y_t &\sim \operatorname{NB}(\mu_t,\nu) +\end{align*} with $\mu_{t}$ the mean of the distribution and $\nu$ the dispersion parameter~\citep{lawless1987}. Hence, we have $\E(y_t)=\mu_t$ and $\Var(y_t)=\mu_t\cdot(1+\mu_t/\nu)$. The linear predictor is given by \begin{align*} \log(\mu_t) &= \alpha_{0t} + \beta t + \gamma_t + \bm{x}_t^\top \bm{\delta} + \xi z_t, \quad t=1,\ldots,t_0. 
@@ -680,22 +680,22 @@ Here, the time-varying intercept $\alpha_{0t}$ is described by a penalized spline (e.g., first or second order random walk) and $\gamma_t$ denotes a periodic penalized -spline (as implemented in \code{INLA}) with period equal to the periodicity +spline (as implemented in \code{INLA}) with period equal to the periodicity of the data. Furthermore, $\beta$ characterizes the effect of a possible linear trend (on the log-scale) and $\xi$ is the effect of previous outbreaks. Typically, $z_t$ is -a zero-one process denoting if there was an outbreak in week $t$, but more involved adaptive and non-binary forms are imaginable. Finally, +a zero-one process denoting if there was an outbreak in week $t$, but more involved adaptive and non-binary forms are imaginable. Finally, $\bm{x}_t$ denotes a vector of possibly time-varying covariates, which influence the expected number of cases. Data from timepoints $1,\ldots,t_0-1$ are now used to determine the posterior distribution of all model parameters and subsequently the posterior predictive distribution of $y_{t_0}$ is computed. If the actual observed value of $y_{t_0}$ is above the $(1-\alpha)\cdot 100\%$ quantile of the predictive posterior distribution an alarm is flagged for $t_0$. -Below we illustrate the use +Below we illustrate the use of \code{boda} to monitor the campylobacteriosis time series from 2007. -In the first case we include in the model for $\log\left(\mu_t\right)$ penalized splines for trend and +In the first case we include in the model for $\log\left(\mu_t\right)$ penalized splines for trend and seasonality and a simple linear trend. <>= rangeBoda <- which(epoch(cam.sts) >= as.Date("2007-01-01")) control.boda <- list(range = rangeBoda, X = NULL, trend = TRUE, - season = TRUE, prior = "iid", alpha = 0.025, + season = TRUE, prior = "iid", alpha = 0.025, mc.munu = 10000, mc.y = 1000, samplingMethod = "marginals") boda <- boda(cam.sts, control = control.boda) @@ -706,7 +706,7 @@ if (computeALL) { library("INLA") control.boda <- list(range=rangeBoda, X=NULL, trend=TRUE, - season=TRUE, prior='rw1', alpha=0.025, + season=TRUE, prior='rw1', alpha=0.025, mc.munu=10000, mc.y=1000, samplingMethod = "marginals") # boda without covariates: trend + spline + periodic spline @@ -719,18 +719,18 @@ In the second case we instead use only penalized and linear trend components, and, furthermore, include as covariates lags 1--4 of the absolute humidity as well -as zero-one indicators for $t_0$ belonging to the last two weeks +as zero-one indicators for $t_0$ belonging to the last two weeks (\code{christmas}) or first two weeks (\code{newyears}) of the year, -respectively. The latter two variables are needed, because there is a +respectively. The latter two variables are needed, because there is a systematically changed reporting behavior at the turn of the year (c.f.\ Figure~\ref{fig:campyDE}). Finally, \code{O104period} is an indicator variable on whether the reporting week belongs to the W21--W30 2011 period of increased awareness during the O104:H4 STEC outbreak. No additional correction for past outbreaks is made.
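The \code{campyDE} data frame already ships with the lagged humidity and indicator columns, so nothing has to be constructed by hand. Purely as a hedged sketch (not part of the original analysis), comparable zero-one calendar indicators could be derived from the ISO week numbers; the object names below are illustrative placeholders and the exact week ranges may differ from those used in \code{campyDE}.
<>=
## Illustrative sketch only: campyDE already provides these columns.
wk <- isoWeekYear(campyDE$date)$ISOWeek
yr <- isoWeekYear(campyDE$date)$ISOYear
christmas  <- as.numeric(wk >= 51)                    # roughly the last two weeks of a year
newyears   <- as.numeric(wk %in% c(1, 2))             # first two weeks of a year
O104period <- as.numeric(yr == 2011 & wk %in% 21:30)  # W21--W30 2011, increased awareness
@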
<>= -covarNames <- c("l1.hum", "l2.hum", "l3.hum", "l4.hum", +covarNames <- c("l1.hum", "l2.hum", "l3.hum", "l4.hum", "newyears", "christmas", "O104period") -control.boda2 <- modifyList(control.boda, +control.boda2 <- modifyList(control.boda, list(X = campyDE[, covarNames], season = FALSE)) boda.covars <- boda(cam.sts, control = control.boda2) @ <>= # boda with covariates: trend + spline + lagged hum + indicator variables covarNames <- c(paste("l",1:4,".hum",sep=""),"newyears","christmas", "O104period") -control.boda2 <- modifyList(control.boda, +control.boda2 <- modifyList(control.boda, list(X=campyDE[,covarNames],season=FALSE)) boda.covars <- boda(cam.sts, control=control.boda2) save(boda.covars, file = "monitoringCounts-cache/boda.covars.RData") @@ -748,25 +748,25 @@ } @ -We plot \code{boda.covars} in Figure~\ref{fig:b} and compare the output of the two boda calls with the output of -\code{farrington}, \code{farringtonFlexible} and \code{bayes} in -Figure~\ref{fig:alarmplot}. +We plot \code{boda.covars} in Figure~\ref{fig:b} and compare the output of the two boda calls with the output of +\code{farrington}, \code{farringtonFlexible} and \code{bayes} in +Figure~\ref{fig:alarmplot}. <>= cam.surv <- combineSTS(list(boda.covars=boda.covars,boda=boda,bayes=bayes, - farrington=far,farringtonFlexible=farflex)) + farrington=far,farringtonFlexible=farflex)) plot(cam.surv,type = alarm ~ time) @ Note here that the \code{bayes} procedure is not really useful as the adjustment for seasonality only works -poorly. Moreover, we think that this method produces many false alarms for this time series because it disregards the increasing time trend in the number of +poorly. Moreover, we think that this method produces many false alarms for this time series because it disregards the increasing time trend in the number of reported cases. Furthermore, it becomes clear that the improved Farrington procedure acts similarly to the original procedure, but the improved reweighting and trend inclusion produce fewer alarms. The \code{boda} method is to be seen as a step towards more Bayesian thinking in aberration detection. However, besides its time demands for a detailed modeling, the speed of the procedure is also prohibitive as regards routine application. As a response~\citet{Maelle} introduce a method which has two advantages: it allows adjusting outbreak detection for reporting delays and includes an approximate inference method much faster than the INLA inference method. However, its linear predictor is more in the style of~\citet{Noufaily2012} not allowing for additional covariates or penalized options for the intercept.
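As a small numerical complement to Figure~\ref{fig:alarmplot} (this snippet is a sketch and not part of the original text), the number of alarms raised by each algorithm over the monitored period can be tabulated from the combined object created above, on the assumption that it carries one alarm column per algorithm.
<>=
## Sketch only: tally alarms per algorithm in the combined sts object.
colSums(alarms(cam.surv), na.rm = TRUE)
@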
-<>= +<>= # Plot with special function y.max <- max(observed(boda.covars),upperbound(boda.covars),na.rm=TRUE) plotOpts2 <- modifyList(plotOpts,list(x=boda.covars,ylim=c(0,y.max)),keep.null=TRUE) @@ -775,8 +775,8 @@ @ \setkeys{Gin}{height=7cm, width=15cm} \begin{figure} -\begin{center} - +\begin{center} + <>= <> @ @@ -820,7 +820,7 @@ cam.surv <- combineSTS(list(boda.covars=boda.covars,boda=boda,bayes=bayes, farrington=far,farringtonFlexible=farflex)) -par(mar=c(4,8,2.1,2),family="Times") +par(mar=c(4,8,2.1,2),family="Times") plot(cam.surv,type = alarm ~ time,lvl=rep(1,ncol(cam.surv)), alarm.symbol=list(pch=17, col="red2", cex=1,lwd=3), cex.axis=1,xlab="Time (weeks)",cex.lab=1,xaxis.tickFreq=list("%m"=atChange,"%G"=atChange),xaxis.labelFreq=list("%G"=at2ndChange), @@ -829,8 +829,8 @@ \setkeys{Gin}{height=7cm, width=16cm} \begin{figure} -\begin{center} - +\begin{center} + <>= <> @@ -841,33 +841,33 @@ \end{figure} \subsection{Beyond one-timepoint detection} -GLMs as used in the Farrington method are suitable for the purpose of aberration detection since they allow a regression approach for adjusting counts for known phenomena such as trend or seasonality in surveillance data. -Nevertheless, the Farrington method only performs one-timepoint detection. In some contexts it can be more relevant -to detect sustained shifts early, e.g., an outbreak could be characterized at first by counts slightly higher than usual -in subsequent weeks without each weekly count being flagged by one-timepoint detection methods. Control charts inspired by statistical process control (SPC) -e.g., cumulative sums would allow the detection of sustained shifts. Yet they were not tailored to the specific characteristics of surveillance data such as overdispersion or seasonality. +GLMs as used in the Farrington method are suitable for the purpose of aberration detection since they allow a regression approach for adjusting counts for known phenomena such as trend or seasonality in surveillance data. +Nevertheless, the Farrington method only performs one-timepoint detection. In some contexts it can be more relevant +to detect sustained shifts early, e.g., an outbreak could be characterized at first by counts slightly higher than usual +in subsequent weeks without each weekly count being flagged by one-timepoint detection methods. Control charts inspired by statistical process control (SPC) +e.g., cumulative sums would allow the detection of sustained shifts. Yet they were not tailored to the specific characteristics of surveillance data such as overdispersion or seasonality. The method presented in \citet{hoehle.paul2008} conducts a synthesis of both worlds, i.e., traditional surveillance methods and SPC. The method is implemented in the package as the function \code{glrnb}, whose use is explained here. \subsubsection{Definition of the control chart} For the control chart, two distributions are defined, one for each of the two states \textit{in-control} and \textit{out-of-control}, whose likelihoods are compared at each time step. The \textit{in-control} distribution -$f_{\bm{\theta}_0}(y_t|\bm{z}_t)$ with the covariates $\bm{z}_t$ is estimated by a GLM of the Poisson or negative binomial family with a log link, depending on the overdispersion of the data. +$f_{\bm{\theta}_0}(y_t|\bm{z}_t)$ with the covariates $\bm{z}_t$ is estimated by a GLM of the Poisson or negative binomial family with a log link, depending on the overdispersion of the data. 
In this context, the standard model for the \textit{in-control} mean is -$$\log \mu_{0,t}=\beta_0+\beta_1t+\sum_{s=1}^S\left[\beta_{2s}\cos \left(\frac{2\pi s t}{\mathtt{Period}}\right)+\beta_{2s+1}\sin \left(\frac{2\pi s t}{\mathtt{Period}}\right)\right] $$ -where $S$ is the number of harmonic waves to use and \texttt{Period} is the period of the data as indicated in the \code{control} slot, for instance 52 for weekly data. -However, more flexible linear predictors, e.g., containing splines, concurrent covariates -or an offset could be used on the right hand-side of the equation. +$$\log \mu_{0,t}=\beta_0+\beta_1t+\sum_{s=1}^S\left[\beta_{2s}\cos \left(\frac{2\pi s t}{\mathtt{Period}}\right)+\beta_{2s+1}\sin \left(\frac{2\pi s t}{\mathtt{Period}}\right)\right] $$ +where $S$ is the number of harmonic waves to use and \texttt{Period} is the period of the data as indicated in the \code{control} slot, for instance 52 for weekly data. +However, more flexible linear predictors, e.g., containing splines, concurrent covariates +or an offset could be used on the right hand-side of the equation. The GLM could therefore be made very similar -to the one used by~\citet{Noufaily2012}, with reweighting of past outbreaks and various criteria for including the time trend. - -The parameters of the \textit{in-control} and \textit{out-of-control} models +to the one used by~\citet{Noufaily2012}, with reweighting of past outbreaks and various criteria for including the time trend. + +The parameters of the \textit{in-control} and \textit{out-of-control} models are respectively given by $\bm{\theta}_0$ and $\bm{\theta}_1$. -The \textit{out-of-control} mean is defined as a function of the \textit{in-control} mean, either with a multiplicative shift (additive on the log-scale) whose size $\kappa$ can be given as an input +The \textit{out-of-control} mean is defined as a function of the \textit{in-control} mean, either with a multiplicative shift (additive on the log-scale) whose size $\kappa$ can be given as an input or reestimated at each timepoint $t>1$, $\mu_{1,t}=\mu_{0,t}\cdot \exp(\kappa)$, or with an unknown autoregressive component as in \citet{held-etal-2005}, $\mu_{1,t}=\mu_{0,t}+\lambda y_{t-1}$ with unknown $\lambda>0$. -In \code{glrnb}, timepoints are divided into two intervals: phase 1 and phase 2. The \textit{in-control} mean and overdispersion are estimated with a GLM fitted on phase 1 data, whereas surveillance operates on phase 2 data. +In \code{glrnb}, timepoints are divided into two intervals: phase 1 and phase 2. The \textit{in-control} mean and overdispersion are estimated with a GLM fitted on phase 1 data, whereas surveillance operates on phase 2 data. When $\lambda$ is fixed, one uses a likelihood-ratio (LR) and defines the stopping time for alarm as $$N=\min \left\{ t_0 \geq 1 : \max_{1\leq t \leq t_0} \left[ \sum_{s=t}^{t_0} \log\left\{ \frac{f_{\bm{\theta}_1}(y_s|\bm{z}_s)}{f_{\bm{\theta}_0}(y_s|\bm{z}_s)} \right\} \right] \geq \mathtt{c.ARL} \right\},$$ @@ -882,24 +882,24 @@ For using \code{glrnb} one has two choices to make. First, one has to choose an \textit{in-control} model that will be fitted on phase 1 data. One can either provide the predictions for the vector of \textit{in-control} means \code{mu0} and the overdispersion parameter \code{alpha} by relying on an external fit, or use the built-in GLM estimator, that will use all data before the beginning of the surveillance range to fit a GLM with the number of harmonics \code{S} and a time trend if \code{trend} is \code{TRUE}. 
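As a hedged sketch of the first option (it is not shown in the original text and the object names are placeholders), an external negative binomial fit could supply \code{mu0} and \code{alpha}; a deliberately minimal trend-only model is used here, whereas a richer model with harmonics would normally be preferred.
<>=
## Illustration only: prepare an externally fitted in-control model for glrnb.
library("MASS")
before2011 <- which(isoWeekYear(epoch(salmNewportGermany))$ISOYear < 2011)
dat <- data.frame(y = as.numeric(observed(salmNewportGermany)),
                  t = seq_len(nrow(salmNewportGermany)))
fit.ext <- glm.nb(y ~ 1 + t, data = dat[before2011, ])     # minimal in-control model
control.ext <- list(range = in2011,
                    mu0   = predict(fit.ext, newdata = dat[in2011, ], type = "response"),
                    alpha = 1 / fit.ext$theta)
## the remaining control arguments (theta, c.ARL, ret, ...) are tuned below
@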
-The choice of the exact \textit{in-control} model depends on the data under surveillance. Performing model selection is a compulsory step in practical applications. Then, one needs to tune the surveillance function itself, for one of the two possible change forms, \code{intercept}~or~\code{epi}.~One~can choose either to set \code{theta} to a given value and thus perform LR instead of GLR. The value of \code{theta} has to be adapted to -the specific context in which the algorithm is applied: how big are shifts one wants to detect optimally? Is it better not to specify any and use GLR instead? +The choice of the exact \textit{in-control} model depends on the data under surveillance. Performing model selection is a compulsory step in practical applications. Then, one needs to tune the surveillance function itself, for one of the two possible change forms, \code{intercept}~or~\code{epi}.~One~can choose either to set \code{theta} to a given value and thus perform LR instead of GLR. The value of \code{theta} has to be adapted to +the specific context in which the algorithm is applied: how big are shifts one wants to detect optimally? Is it better not to specify any and use GLR instead? The threshold \texttt{c.ARL} also has to be specified by the user. As explained in \citet{hoehle-mazick-2010} one can compute the threshold for a desired run-length in control through direct Monte Carlo simulation or a Markov chain approximation. Lastly, as mentioned in -\citet{hoehle.paul2008}, a window-limited approach of surveillance, instead of looking at all the timepoints until the first observation, can make computation faster. +\citet{hoehle.paul2008}, a window-limited approach of surveillance, instead of looking at all the timepoints until the first observation, can make computation faster. -Here we apply \code{glrnb} to the time series of report counts of \textit{Salmonella Newport} in Germany by assuming a known multiplicative shift of factor $2$ and by using the built-in estimator to fit an \textit{in-control} model with one harmonic for -seasonality and a trend. This model will be refitted after each alarm, but first we use data from the years before 2011 as reference or \code{phase1}, -and the data from 2011 as data to be monitored or \code{phase2}. The threshold \texttt{c.ARL} was chosen to be 4 as we found with the same approach as \citet{hoehle-mazick-2010} that it made the probability of a false alarm within one year +Here we apply \code{glrnb} to the time series of report counts of \textit{Salmonella Newport} in Germany by assuming a known multiplicative shift of factor $2$ and by using the built-in estimator to fit an \textit{in-control} model with one harmonic for +seasonality and a trend. This model will be refitted after each alarm, but first we use data from the years before 2011 as reference or \code{phase1}, +and the data from 2011 as data to be monitored or \code{phase2}. The threshold \texttt{c.ARL} was chosen to be 4 as we found with the same approach as \citet{hoehle-mazick-2010} that it made the probability of a false alarm within one year smaller than 0.1. Figure~\ref{fig:glrnb}~shows the results of this monitoring. 
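To make the stopping rule above concrete before applying \code{glrnb}, the likelihood-ratio statistic for a fixed multiplicative shift can be written out directly. The snippet below is an illustration only -- \code{glrnb} essentially performs this computation internally, together with refitting after alarms -- and its arguments are hypothetical placeholders.
<>=
## Illustration of the LR-CUSUM statistic for a fixed shift kappa (log-scale):
## y are the monitored counts, mu0hat the in-control means and size the
## negative binomial size parameter (1/alpha in the glrnb parametrization).
lrStat <- function(y, mu0hat, size, kappa = log(2)) {
  llr <- dnbinom(y, mu = mu0hat * exp(kappa), size = size, log = TRUE) -
         dnbinom(y, mu = mu0hat, size = size, log = TRUE)
  sapply(seq_along(y), function(t0)
    max(cumsum(rev(llr[seq_len(t0)]))))  # max over candidate changepoints t <= t0
}
## an alarm would be flagged at the first t0 with lrStat(...) >= c.ARL
@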
-<>= +<>= phase1 <- which(isoWeekYear(epoch(salmNewportGermany))$ISOYear < 2011) phase2 <- in2011 control = list(range = phase2, c.ARL = 4, theta = log(2), ret = "cases", - mu0 = list(S = 1, trend = TRUE, refit = FALSE)) + mu0 = list(S = 1, trend = TRUE, refit = FALSE)) salmGlrnb <- glrnb(salmNewportGermany, control = control) @ <>= @@ -912,18 +912,18 @@ trend=TRUE, refit=FALSE),c.ARL = 4, theta=log(2),ret="cases") -# Perform monitoring with glrnb +# Perform monitoring with glrnb salmGlrnb <- glrnb(salmNewportGermany,control=control) @ -<>= +<>= # Plot y.max <- max(observed(salmGlrnb),upperbound(salmGlrnb),na.rm=TRUE) -do.call("plot",modifyList(plotOpts,list(x=salmGlrnb,ylim=c(0,y.max)))) +do.call("plot",modifyList(plotOpts,list(x=salmGlrnb,ylim=c(0,y.max)))) @ \setkeys{Gin}{height=7cm, width=15cm} \begin{figure} -\begin{center} - +\begin{center} + <>= <> @@ -934,16 +934,16 @@ \label{fig:glrnb} \end{figure} -The implementation of \code{glrnb} on individual time series was already thoroughly explained in \citet{hoehle-mazick-2010}. Our objective in the present document is rather to provide practical tips -for the implementation of this function on huge amounts of data in public health surveillance applications. Issues of computational speed become very significant in such a context. Our proposal -to reduce the computational burden incurred by this algorithm is -to compute the \textit{in-control} model for each time serie (pathogen, subtype, subtype in a given location, etc.) only once a year and to use this estimation for the computation of a threshold for each time series. - An idea to avoid starting with an initial value of zero in the CUSUM is to use either $\left(\frac{1}{2}\right)\cdot\mathtt{c.ARL}$ as a starting value (fast initial response - CUSUM as presented in~\citet{lucas1982fast}) or to let surveillance run with the new \textit{in-control} model during +The implementation of \code{glrnb} on individual time series was already thoroughly explained in \citet{hoehle-mazick-2010}. Our objective in the present document is rather to provide practical tips +for the implementation of this function on huge amounts of data in public health surveillance applications. Issues of computational speed become very significant in such a context. Our proposal +to reduce the computational burden incurred by this algorithm is +to compute the \textit{in-control} model for each time serie (pathogen, subtype, subtype in a given location, etc.) only once a year and to use this estimation for the computation of a threshold for each time series. + An idea to avoid starting with an initial value of zero in the CUSUM is to use either $\left(\frac{1}{2}\right)\cdot\mathtt{c.ARL}$ as a starting value (fast initial response + CUSUM as presented in~\citet{lucas1982fast}) or to let surveillance run with the new \textit{in-control} model during a buffer period and use the resulting CUSUM as an initial value. One could also choose the maximum of these two possible starting values as a starting value. - During the buffer period alarms would be generated with the old model. Lastly, using GLR is much more computationally intensive than using LR, - whereas LR performs reasonably well on shifts different from the one indicated by \code{theta} as seen in the simulation studies of~\citet{hoehle.paul2008}. Our advice would therefore be to use LR with a reasonable predefined \code{theta}. 
- The amount of historical data used each year to update the model, the length of the buffer period and the value of \code{theta} have to be fixed for each specific application, e.g., using simulations and/or + During the buffer period alarms would be generated with the old model. Lastly, using GLR is much more computationally intensive than using LR, + whereas LR performs reasonably well on shifts different from the one indicated by \code{theta} as seen in the simulation studies of~\citet{hoehle.paul2008}. Our advice would therefore be to use LR with a reasonable predefined \code{theta}. + The amount of historical data used each year to update the model, the length of the buffer period and the value of \code{theta} have to be fixed for each specific application, e.g., using simulations and/or discussion with experts. \subsubsection{Similar methods in the package} @@ -953,33 +953,33 @@ The package also includes a semi-parametric method \code{outbreakP} that aims at detecting changes from a constant level to a monotonically increasing incidence, for instance the beginning of the influenza season. See Table~\ref{table:ref} for the corresponding references. \subsection{A method for monitoring categorical data} -All monitoring methods presented up to now have been methods for analysing count data. Nevertheless, in public health surveillance one also encounters categorical time series -which are time series where the response variable takes one of $k\geq2$ different categories (nominal or ordinal). When $k=2$ the time series is binary, for instance representing -a specific outcome in cases such as hospitalization, death or a positive result to some diagnostic test. One can also think of applications with -$k>2$ if one studies, e.g., the age groups of the cases in the context of monitoring a vaccination program: vaccination targeted at children could induce a shift towards older cases which one wants to detect +All monitoring methods presented up to now have been methods for analysing count data. Nevertheless, in public health surveillance one also encounters categorical time series +which are time series where the response variable takes one of $k\geq2$ different categories (nominal or ordinal). When $k=2$ the time series is binary, for instance representing +a specific outcome in cases such as hospitalization, death or a positive result to some diagnostic test. One can also think of applications with +$k>2$ if one studies, e.g., the age groups of the cases in the context of monitoring a vaccination program: vaccination targeted at children could induce a shift towards older cases which one wants to detect as quickly as possible -- this will be explained thoroughly with an example. -The developments of prospective surveillance methods for such categorical time series were up to recently limited to CUSUM-based approaches for -binary data such as those explained in~\citet{Chen1978},~\citet{Reynolds2000} and~\citet{rogerson_yamada2004}.
Other than being only suitable for binary data these methods have the drawback of not handling +overdispersion. A method improving on these two limitations while casting the problem into a more comprehensive GLM regression framework for categorical data was +presented in~\citet{hoehle2010}. It is implemented as the function \code{categoricalCUSUM}. - + The way \code{categoricalCUSUM} operates is very similar to what \code{glrnb} does with fixed \textit{out-of-control} parameter. -First, the parameters in a multivariate GLM for the \textit{in-control} distribution are estimated from the historical data. Then the \textit{out-of-control} distribution is defined by a given -change in the parameters of this GLM, e.g., an intercept change, as explained later. Lastly, prospective monitoring is performed on -current data using a likelihood ratio detector which compares the likelihood of the response under the \textit{in-control} and \textit{out-of-control} distributions. +First, the parameters in a multivariate GLM for the \textit{in-control} distribution are estimated from the historical data. Then the \textit{out-of-control} distribution is defined by a given +change in the parameters of this GLM, e.g., an intercept change, as explained later. Lastly, prospective monitoring is performed on +current data using a likelihood ratio detector which compares the likelihood of the response under the \textit{in-control} and \textit{out-of-control} distributions. \subsubsection{Categorical CUSUM for binomial models} -The challenge when performing these steps with categorical data from surveillance systems -is finding an appropriate model. Binary GLMs as presented in Chapter~6 of \citet{Fahrmeir.etal2013} could be a solution but they do not tackle well the inherent overdispersion in the binomial time series. -Of course one could choose a quasi family but these are not proper statistical distributions making many issues such as prediction complicated. A better alternative -is offered by the use of \textit{generalized additive models for location, scale and shape} \citep[GAMLSS,][]{Rigby2005}, that support distributions such as the beta-binomial distribution, suitable for overdispersed binary data. With GAMLSS one can model the dependency of the mean -- \textit{location} -- -upon explanatory variables but the regression modeling is also extended to other parameters of the distribution, e.g., scale. +The challenge when performing these steps with categorical data from surveillance systems +is finding an appropriate model. Binary GLMs as presented in Chapter~6 of \citet{Fahrmeir.etal2013} could be a solution but they do not tackle well the inherent overdispersion in the binomial time series. +Of course one could choose a quasi family but these are not proper statistical distributions making many issues such as prediction complicated. A better alternative +is offered by the use of \textit{generalized additive models for location, scale and shape} \citep[GAMLSS,][]{Rigby2005}, that support distributions such as the beta-binomial distribution, suitable for overdispersed binary data. With GAMLSS one can model the dependency of the mean -- \textit{location} -- +upon explanatory variables but the regression modeling is also extended to other parameters of the distribution, e.g., scale.
Moreover any modelled parameter can be put under surveillance, be it the mean (as in the example later developed) or the time trend in the linear predictor of the mean. This very flexible modeling framework is implemented in \proglang{R} through the \pkg{gamlss} package~\citep{StasJSS}. As an example we consider the time series of the weekly number of hospitalized cases among all \textit{Salmonella} cases in Germany in Jan 2004--Jan 2014, depicted in -Figure~\ref{fig:cat1}. We use 2004--2012 data to estimate the \textit{in-control} parameters and then perform surveillance on the data from 2013 and early 2014. We start by preprocessing the data. +Figure~\ref{fig:cat1}. We use 2004--2012 data to estimate the \textit{in-control} parameters and then perform surveillance on the data from 2013 and early 2014. We start by preprocessing the data. <>= data("salmHospitalized") isoWeekYearData <- isoWeekYear(epoch(salmHospitalized)) @@ -988,13 +988,13 @@ data2013 <- which(isoWeekYearData$ISOYear == 2013) dataEarly2014 <- which(isoWeekYearData$ISOYear == 2014 & isoWeekYearData$ISOWeek <= 4) - + phase1 <- dataBefore2013 phase2 <- c(data2013, dataEarly2014) weekNumbers <- isoWeekYearData$ISOWeek salmHospitalized.df <- cbind(as.data.frame(salmHospitalized), weekNumbers) -colnames(salmHospitalized.df) <- c("y", "t", "state", "alarm", "n", +colnames(salmHospitalized.df) <- c("y", "t", "state", "alarm", "upperbound","n", "freq", "epochInPeriod", "weekNumber") @ <>= @@ -1008,52 +1008,52 @@ # Prepare data for fitting the model weekNumber <- isoWeekYear(epoch(salmHospitalized))$ISOWeek salmHospitalized.df <- cbind(as.data.frame(salmHospitalized),weekNumber) -colnames(salmHospitalized.df) <- c("y","t","state","alarm","n","freq", +colnames(salmHospitalized.df) <- c("y","t","state","alarm","upperbound","n","freq", "epochInPeriod","weekNumber") @ We assume that the number of hospitalized cases follows a beta-binomial distribution, i.e., -$ y_t \sim \BetaBin(n_t,\pi_t,\sigma_t)$ with $n_t$ the total number of reported cases at time $t$, $\pi_t$ the proportion of these cases that were hospitalized and $\sigma$ the dispersion parameter. In this +$ y_t \sim \BetaBin(n_t,\pi_t,\sigma_t)$ with $n_t$ the total number of reported cases at time $t$, $\pi_t$ the proportion of these cases that were hospitalized and $\sigma$ the dispersion parameter. In this parametrization, $$E(y_t)=n_t \pi_t,\quad \text{and}$$ $$\Var(y_t)=n_t \pi_t(1-\pi_t)\left( 1 + \frac{\sigma(n_t-1)}{\sigma+1} \right)\cdot$$ We choose to model the expectation $n_t \pi_t$ using a beta-binomial model with a logit-link which is a special case of a GAMLSS, i.e., $$\logit(\pi_t)=\bm{z}_t^\top\bm{\beta}$$ where $\bm{z}_t$ is a vector of possibly time-varying covariates and $\bm{\beta}$ a vector of covariate effects in our example. -The proportion of hospitalized cases -varies throughout the year as seen in Figure~\ref{fig:cat1}. -One observes that in the summer the proportion of hospitalized cases is smaller than in other seasons. However, over the holidays in December the proportion of hospitalized cases increases. - Note that the number of non-hospitalized cases drops while the number of hospitalized cases remains constant (data not shown): this might be explained by the fact that cases that are not serious enough to go to the hospital are not seen by general practitioners because sick workers do not need - a sick note during the holidays. 
Therefore, the \textit{in-control} model should contain these elements, as well as the fact that there is an +The proportion of hospitalized cases +varies throughout the year as seen in Figure~\ref{fig:cat1}. +One observes that in the summer the proportion of hospitalized cases is smaller than in other seasons. However, over the holidays in December the proportion of hospitalized cases increases. + Note that the number of non-hospitalized cases drops while the number of hospitalized cases remains constant (data not shown): this might be explained by the fact that cases that are not serious enough to go to the hospital are not seen by general practitioners because sick workers do not need + a sick note during the holidays. Therefore, the \textit{in-control} model should contain these elements, as well as the fact that there is an increasing trend of the proportion because GPs prescribe less and less stool diagnoses so that more diagnoses are done on hospitalized cases. -We choose a model with an intercept, a time trend, two harmonic terms and a factor variable for the first two weeks of -each year. The variable \code{epochInPeriod} takes into account the fact that not all years have 52 weeks. +We choose a model with an intercept, a time trend, two harmonic terms and a factor variable for the first two weeks of +each year. The variable \code{epochInPeriod} takes into account the fact that not all years have 52 weeks. <>= vars <- c( "y", "n", "t", "epochInPeriod", "weekNumber") m.bbin <- gamlss(cbind(y, n-y) ~ 1 + t - + sin(2 * pi * epochInPeriod) + cos(2 * pi * epochInPeriod) + + sin(2 * pi * epochInPeriod) + cos(2 * pi * epochInPeriod) + sin(4 * pi * epochInPeriod) + cos(4 * pi * epochInPeriod) - + I(weekNumber == 1) + I(weekNumber == 2), + + I(weekNumber == 1) + I(weekNumber == 2), sigma.formula =~ 1, family = BB(sigma.link = "log"), data = salmHospitalized.df[phase1, vars]) @ -The change we aim to detect is defined by a multiplicative change of odds, from $\frac{\pi_t^0}{(1-\pi_t^0)}$ to $R\cdot\frac{\pi_t^0}{(1-\pi_t^0)}$ with $R>0$, similar to what was done in~\citet{Steiner1999} for the logistic regression model. +The change we aim to detect is defined by a multiplicative change of odds, from $\frac{\pi_t^0}{(1-\pi_t^0)}$ to $R\cdot\frac{\pi_t^0}{(1-\pi_t^0)}$ with $R>0$, similar to what was done in~\citet{Steiner1999} for the logistic regression model. This is equivalent to an additive change of the log-odds, $$\logit(\pi_t^1)=\logit(\pi_t^0)+\log R$$ with $\pi_t^0$ being the \textit{in-control} proportion and $\pi_t^1$ the \textit{out-of-control} distribution. The likelihood ratio based CUSUM statistic is now defined as $$C_{t_0}=\max_{1\leq t \leq {t_0}}\left( \sum_{s=t}^{t_0} \log \left( \frac{f(y_s;\bm{z}_s,\bm{\theta}_1)}{f(y_s;\bm{z}_s,\bm{\theta}_0)} \right) \right)$$ -with $\bm{\theta}_0$ and $\bm{\theta}_1$ being the vector in- and \textit{out-of-control} parameters, respectively. Given a threshold \code{h}, an alarm is sounded at the first time when $C_{t_0}>\mathtt{h}$. +with $\bm{\theta}_0$ and $\bm{\theta}_1$ being the vector in- and \textit{out-of-control} parameters, respectively. Given a threshold \code{h}, an alarm is sounded at the first time when $C_{t_0}>\mathtt{h}$. -We set the parameters of the \code{categoricalCUSUM} to optimally detect a doubling of the odds in 2013 and 2014, i.e., $R=2$. Furthermore, we for now set the threshold of the CUSUM at $h=2$. 
+We set the parameters of the \code{categoricalCUSUM} to optimally detect a doubling of the odds in 2013 and 2014, i.e., $R=2$. Furthermore, we for now set the threshold of the CUSUM at $h=2$. We use the GAMLSS to predict the mean of the \textit{in-control} and \textit{out-of-control} distributions and store them into matrices with two columns among which the second one represents the reference category. <>= -R <- 2 +R <- 2 h <- 2 pi0 <- predict(m.bbin, newdata = salmHospitalized.df[phase2, vars], type = "response") @@ -1064,7 +1064,7 @@ <>= # CUSUM parameters R <- 2 #detect a doubling of the odds for a salmHospitalized being positive -h <- 2 #threshold of the cusum +h <- 2 #threshold of the cusum # Compute \textit{in-control} and out of control mean pi0 <- predict(m.bbin,newdata=salmHospitalized.df[phase2,vars], type="response") @@ -1074,15 +1074,15 @@ pi0m <- rbind(pi0, 1-pi0) pi1m <- rbind(pi1, 1-pi1) @ -Note that the \code{categoricalCUSUM} function is constructed to operate on the observed slot of \code{sts}-objects -which have as columns the number of cases in each category at each timepoint, \textit{i.e.}, each row of the observed slot contains the elements -$(y_{t1},...,y_{tk})$. +Note that the \code{categoricalCUSUM} function is constructed to operate on the observed slot of \code{sts}-objects +which have as columns the number of cases in each category at each timepoint, \textit{i.e.}, each row of the observed slot contains the elements +$(y_{t1},...,y_{tk})$. <>= -populationHosp <- cbind(population(salmHospitalized), +populationHosp <- cbind(population(salmHospitalized), population(salmHospitalized)) -observedHosp <- cbind(observed(salmHospitalized), - population(salmHospitalized) - +observedHosp <- cbind(observed(salmHospitalized), + population(salmHospitalized) - observed(salmHospitalized)) nrowHosp <- nrow(salmHospitalized) salmHospitalized.multi <- new("sts", freq = 52, start = c(2004, 1), @@ -1090,7 +1090,7 @@ epochAsDate = TRUE, observed = observedHosp, populationFrac = populationHosp, - state = matrix(0, nrow = nrowHosp, ncol = 2), + state = matrix(0, nrow = nrowHosp, ncol = 2), multinomialTS = TRUE) @ <>= @@ -1105,71 +1105,71 @@ populationFrac = cbind(population, population), state=matrix(0, nrow=nrow(salmHospitalized), - ncol = 2), + ncol = 2), multinomialTS=TRUE) @ -Furthermore, one needs to define a wrapper for the distribution function in order to have a argument named \code{"mu"} in the function. +Furthermore, one needs to define a wrapper for the distribution function in order to have a argument named \code{"mu"} in the function. -<>= +<>= dBB.cusum <- function(y, mu, sigma, size, log = FALSE) { - return(dBB(if (is.matrix(y)) y[1,] else y, + return(dBB(if (is.matrix(y)) y[1,] else y, if (is.matrix(y)) mu[1,] else mu, - sigma = sigma, bd = size, log = log)) + sigma = sigma, bd = size, log = log)) } @ -<>= -# Function to use as dfun in the categoricalCUSUM +<>= +# Function to use as dfun in the categoricalCUSUM dBB.cusum <- function(y, mu, sigma, size, log = FALSE) { return(dBB( if (is.matrix(y)) y[1,] else y, if (is.matrix(y)) mu[1,] else mu, - sigma = sigma, bd = size, log = log)) + sigma = sigma, bd = size, log = log)) } @ After these preliminary steps, the monitoring can be performed. 
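Before calling \code{categoricalCUSUM}, the wrapper can be checked directly on a single monitored week; this quick sanity check is not part of the original analysis and merely mirrors how the function will be evaluated internally.
<>=
## Sketch only: evaluate the wrapper for the first week of phase 2.
n1 <- population(salmHospitalized.multi)[phase2[1], 1]
dBB.cusum(y = observed(salmHospitalized.multi)[phase2[1], 1],
          mu = pi0m[1, 1], sigma = exp(m.bbin$sigma.coef), size = n1)
@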
<>= -controlCat <- list(range = phase2, h = 2, pi0 = pi0m, pi1 = pi1m, +controlCat <- list(range = phase2, h = 2, pi0 = pi0m, pi1 = pi1m, ret = "cases", dfun = dBB.cusum) -salmHospitalizedCat <- categoricalCUSUM(salmHospitalized.multi, +salmHospitalizedCat <- categoricalCUSUM(salmHospitalized.multi, control = controlCat, - sigma = exp(m.bbin$sigma.coef)) + sigma = exp(m.bbin$sigma.coef)) @ <>= # Monitoring controlCat <- list(range = phase2,h = 2,pi0 = pi0m, pi1 = pi1m, ret = "cases", dfun = dBB.cusum) -salmHospitalizedCat <- categoricalCUSUM(salmHospitalized.multi, +salmHospitalizedCat <- categoricalCUSUM(salmHospitalized.multi, control = controlCat, - sigma = exp(m.bbin$sigma.coef)) + sigma = exp(m.bbin$sigma.coef)) @ -The results can be seen in Figure~\ref{fig:catDouble}(a). With the given settings, there are alarms at week -16 in 2013 -and at week 3 in 2014. -The one in 2014 corresponds to the usual peak of the beginning of the year, which was larger than expected this year, maybe because the +The results can be seen in Figure~\ref{fig:catDouble}(a). With the given settings, there are alarms at week +16 in 2013 +and at week 3 in 2014. +The one in 2014 corresponds to the usual peak of the beginning of the year, which was larger than expected this year, maybe because the weekdays of the holidays were particularly worker-friendly so that sick notes were even less needed. -<>= +<>= y.max <- max(observed(salmHospitalized)/population(salmHospitalized),upperbound(salmHospitalized)/population(salmHospitalized),na.rm=TRUE) plotOpts2 <- modifyList(plotOpts,list(x=salmHospitalized,legend.opts=NULL,ylab="",ylim=c(0,y.max)),keep.null=TRUE) plotOpts2$xaxis.tickFreq <- list("%G"=atChange,"%m"=atChange) plotOpts2$par.list <- list(mar=c(6,5,5,5),family="Times",las=1) do.call("plot",plotOpts2) lines(salmHospitalized@populationFrac/4000,col="grey80",lwd=2) -lines(campyDE$hum*50, col="white", lwd=2) +lines(campyDE$hum*50, col="white", lwd=2) axis(side=4, at=seq(0,2000,by=500)/4000,labels=as.character(seq(0,2000,by=500)),las=1, cex=2,cex.axis=1.5,pos=length(observed(salmHospitalized))+20) par(family="Times") -text(-20, 0.6, "Proportion", pos = 3, xpd = T,cex=cex.text) +text(-20, 0.6, "Proportion", pos = 3, xpd = T,cex=cex.text) text(520, 0.6, "Total number of \n reported cases", pos = 3, xpd = T,cex=cex.text) #mtext(side=4,text=expression(paste("Total number of reported cases (thousands)", sep='')), - #las=0,line=1, cex=cex.text) + #las=0,line=1, cex=cex.text) @ \begin{figure} -\begin{center} - +\begin{center} + <>= <> @@ -1179,33 +1179,33 @@ \caption{Weekly proportion of Salmonella cases that were hospitalized in Germany 2004-2014. In addition the corresponding number of reported cases is shown as a light curve.} \label{fig:cat1} \end{figure} -<>= +<>= @ -The value for the threshold \code{h} can be determined following the procedures presented in \citet{hoehle-mazick-2010} for count data, and as in the code exhibited below.
Two methods can be used for +determining the probability of a false alarm within a pre-specified number of steps for a given value of the threshold \code{h}: a Monte Carlo method relying on, e.g., 1000 simulations and a Markov Chain approximation of the CUSUM. The former is much more computationally intensive than the latter: with the code below, the Monte Carlo method needed approximately 300 times more time than the Markov Chain method. -Since both results are close we recommend the Markov Chain approximation for practical use. The Monte Carlo method works by sampling observed values from the estimated -distribution and performing monitoring with \code{categoricalCUSUM} on this \code{sts} object. As observed values are estimated from the \textit{in-control} distribution every alarm thus obtained is a -false alarm so that the simulations allow to estimate the probability of a false alarm when monitoring \textit{in-control} data over the timepoints of \code{phase2}. The Markov Chain approximation introduced by \citet{brook_evans1972} is implemented as \code{LRCUSUM.runlength} +Since both results are close we recommend the Markov Chain approximation for practical use. The Monte Carlo method works by sampling observed values from the estimated +distribution and performing monitoring with \code{categoricalCUSUM} on this \code{sts} object. As observed values are estimated from the \textit{in-control} distribution every alarm thus obtained is a +false alarm so that the simulations allow to estimate the probability of a false alarm when monitoring \textit{in-control} data over the timepoints of \code{phase2}. The Markov Chain approximation introduced by \citet{brook_evans1972} is implemented as \code{LRCUSUM.runlength} which is already used for \code{glrnb}. Results from both methods can be seen in Figure~\ref{fig:catDouble}(b). We chose a value of 2 for \code{h} so that the probability of a false alarm within the 56 timepoints of \code{phase2} is less than $0.1$. -One first has to set the values of the threshold to be investigated and to prepare the function used for simulation, that draws observed values from the -\textit{in-control} distribution and performs monitoring on the corresponding time series, then indicating if there was at least one alarm. Then 1000 simulations were -performed with a fixed seed value for the sake of reproducibility. Afterwards, we tested the Markov Chain approximation using the function \code{LRCUSUM.runlength} over the same grid +One first has to set the values of the threshold to be investigated and to prepare the function used for simulation, that draws observed values from the +\textit{in-control} distribution and performs monitoring on the corresponding time series, then indicating if there was at least one alarm. Then 1000 simulations were +performed with a fixed seed value for the sake of reproducibility. Afterwards, we tested the Markov Chain approximation using the function \code{LRCUSUM.runlength} over the same grid of values for the threshold. 
<>= -h.grid <- seq(1, 10, by = 0.5) +h.grid <- seq(1, 10, by = 0.5) simone <- function(sts, h) { y <- rBB(length(phase2), mu = pi0m[1, , drop = FALSE], bd = population(sts)[phase2, ], sigma = exp(m.bbin$sigma.coef)) - observed(sts)[phase2, ] <- cbind(y, sts@populationFrac[phase2, 1] - y) + observed(sts)[phase2, ] <- cbind(y, sts@populationFrac[phase2, 1] - y) one.surv <- categoricalCUSUM(sts, modifyList(controlCat, list(h = h)), sigma = exp(m.bbin$sigma.coef)) return(any(alarms(one.surv)[, 1])) @@ -1214,13 +1214,13 @@ nSims <- 1000 -pMC <- sapply(h.grid, function(h) { - mean(replicate(nSims, simone(salmHospitalized.multi, h))) +pMC <- sapply(h.grid, function(h) { + mean(replicate(nSims, simone(salmHospitalized.multi, h))) }) pMarkovChain <- sapply( h.grid, function(h) { - TA <- LRCUSUM.runlength(mu = pi0m[1,, drop = FALSE], - mu0 = pi0m[1,, drop = FALSE], + TA <- LRCUSUM.runlength(mu = pi0m[1,, drop = FALSE], + mu0 = pi0m[1,, drop = FALSE], mu1 = pi1m[1,, drop = FALSE], n = population(salmHospitalized.multi)[phase2, ], h = h, dfun = dBB.cusum, @@ -1228,10 +1228,10 @@ return(tail(TA$cdf, n = 1)) }) @ -<>= +<>= # Values of the threshold to be investigated -h.grid <- seq(1,10,by=0.5) - +h.grid <- seq(1,10,by=0.5) + # Prepare function for simulations simone <- function(sts, h) { # Draw observed values from the \textit{in-control} distribution @@ -1239,7 +1239,7 @@ bd=population(sts)[phase2,], sigma=exp(m.bbin$sigma.coef)) observed(sts)[phase2,] <- cbind(y,sts@populationFrac[phase2,1] - y) -# Perform monitoring +# Perform monitoring one.surv <- categoricalCUSUM(sts, control=modifyList(controlCat, list(h=h)), sigma=exp(m.bbin$sigma.coef)) # Return 1 if there was at least one alarm @@ -1251,17 +1251,17 @@ # Number of simulations nSims=1000 # Simulations over the possible h values -pMC <- sapply(h.grid, function(h) { +pMC <- sapply(h.grid, function(h) { h <- h - mean(replicate(nSims, simone(salmHospitalized.multi,h))) + mean(replicate(nSims, simone(salmHospitalized.multi,h))) }) # Distribution function to be used by LRCUSUM.runlength dBB.rl <- function(y, mu, sigma, size, log = FALSE) { - dBB(y, mu = mu, sigma = sigma, bd = size, log = log) + dBB(y, mu = mu, sigma = sigma, bd = size, log = log) } # Markov Chain approximation over h.grid pMarkovChain <- sapply( h.grid, function(h) { - TA <- LRCUSUM.runlength(mu=pi0m[1,,drop=FALSE], mu0=pi0m[1,,drop=FALSE], + TA <- LRCUSUM.runlength(mu=pi0m[1,,drop=FALSE], mu0=pi0m[1,,drop=FALSE], mu1=pi1m[1,,drop=FALSE], n=population(salmHospitalized.multi)[phase2,], h=h, dfun=dBB.rl, sigma=exp(m.bbin$sigma.coef)) @@ -1283,7 +1283,7 @@ \hspace{-1em} \subfloat[]{ -<>= +<>= y.max <- max(observed(salmHospitalizedCat[,1])/population(salmHospitalizedCat[,1]),upperbound(salmHospitalizedCat[,1])/population(salmHospitalizedCat[,1]),na.rm=TRUE) plotOpts3 <- modifyList(plotOpts,list(x=salmHospitalizedCat[,1],ylab="Proportion",ylim=c(0,y.max))) plotOpts3$legend.opts <- list(x="top",bty="n",legend=c(expression(U[t])),lty=1,lwd=line.lwd,col=alarm.symbol$col,horiz=TRUE,cex=cex.leg) @@ -1293,12 +1293,12 @@ @ - + \includegraphics[width=9cm]{plots/monitoringCounts-catF.pdf} } \hspace{-3em} \subfloat[]{ -<>= +<>= par(mar=c(6,5,5,5),family="Times") matplot(h.grid, cbind(pMC,pMarkovChain),type="l",ylab=expression(P(T[A] <= 56 * "|" * tau * "=" * infinity)),xlab="Threshold h",col=1,cex=cex.text, cex.axis =cex.text,cex.lab=cex.text) @@ -1308,18 +1308,18 @@ par(family="Times") legend(4,0.08,c("Monte Carlo","Markov chain"), lty=1:2,col=1,cex=cex.text,bty="n") -@ +@ 
\includegraphics[width=9cm]{plots/monitoringCounts-catARL.pdf} } -\caption{(a) Results of the monitoring with categoricalCUSUM of the proportion of Salmonella cases that were hospitalized in Germany in Jan 2013 - Jan 2014. (b) +\caption{(a) Results of the monitoring with categoricalCUSUM of the proportion of Salmonella cases that were hospitalized in Germany in Jan 2013 - Jan 2014. (b) Probability of a false alarm within the 56 timepoints of the monitoring as a function of the threshold $h$.} \label{fig:catDouble} \end{figure} -The procedure for using the function for multicategorical variables follows the same steps (as illustrated later). Moreover, one could expand the approach -to utilize the multiple regression possibilities offered by GAMLSS. Here we chose to try to detect a change in the mean of the distribution of counts but as GAMLSS provides more general regression tools -than GLM we could also aim at detecting a change in the time trend included in the model for the mean. +The procedure for using the function for multicategorical variables follows the same steps (as illustrated later). Moreover, one could expand the approach +to utilize the multiple regression possibilities offered by GAMLSS. Here we chose to try to detect a change in the mean of the distribution of counts but as GAMLSS provides more general regression tools +than GLM we could also aim at detecting a change in the time trend included in the model for the mean. \subsubsection{Categorical CUSUM for multinomial models} @@ -1335,7 +1335,7 @@ <>= data("rotaBB") -plot(rotaBB, xlab = "Time (months)", +plot(rotaBB, xlab = "Time (months)", ylab = "Proportion of reported cases") @ @@ -1371,7 +1371,7 @@ fun(epoch(rotaBB),observed(rotaBB)[,i],type="l",xlab="Time (months)",ylab="Reported cases",ylim=c(0,max(observed(rotaBB))),col=pal[i],lwd=2) } else { fun(epoch(rotaBB),observed(rotaBB)[,i,drop=FALSE]/rowSums(observed(rotaBB)),type="l",xlab="Time (months)",ylab="Proportion of reported cases",ylim=c(0,max(observed(rotaBB)/rowSums(observed(rotaBB)))),col=pal[i],lwd=2) - } + } } # Add legend axis(1,at=as.numeric(epoch(rotaBB)),label=NA,tck=-0.01) @@ -1388,35 +1388,35 @@ @ Hence, our interest is in prospectively detecting a possible age-shift. Since the vaccine was recommended for routine vaccination in Brandenburg in 2009 we choose to start -the monitoring at that time point. We do so by fitting a multinomial logit-model containing a trend as well as one harmonic wave and use the age group 0--4 years as reference category, +the monitoring at that time point. We do so by fitting a multinomial logit-model containing a trend as well as one harmonic wave and use the age group 0--4 years as reference category, to the data from the years 2002-2008. Different \proglang{R} packages implement such type of modeling, but we shall use the \pkg{MGLM} package~\citep{MGLM}, because it also offers the fitting of extended multinomial regression models allowing for extra dispersion. 
<>= rotaBB.df <- as.data.frame(rotaBB) - + X <- with(rotaBB.df, cbind(intercept = 1, epoch, - sin1 = sin(2 * pi * epochInPeriod), + sin1 = sin(2 * pi * epochInPeriod), cos1 = cos(2 * pi * epochInPeriod))) -phase1 <- epoch(rotaBB) < as.Date("2009-01-01") +phase1 <- epoch(rotaBB) < as.Date("2009-01-01") phase2 <- !phase1 order <- c(2:5, 1); reorder <- c(5, 1:4) library("MGLM") -m0 <- MGLMreg(as.matrix(rotaBB.df[phase1, order]) ~ -1 + X[phase1, ], - dist = "MN") +m0 <- MGLMreg(as.matrix(rotaBB.df[phase1, order]) ~ -1 + X[phase1, ], + dist = "MN") @ <>= # Convert sts object to data.frame useful for regression modelling rotaBB.df <- as.data.frame(rotaBB) -# Create matrix +# Create matrix X <- with(rotaBB.df,cbind(intercept=1,epoch, sin1=sin(2*pi*epochInPeriod),cos1=cos(2*pi*epochInPeriod))) # Fit model to 2002-2009 data -phase1 <- epoch(rotaBB) < as.Date("2009-01-01") +phase1 <- epoch(rotaBB) < as.Date("2009-01-01") phase2 <- !phase1 # MGLMreg automatically takes the last class as ref so we reorder @@ -1424,13 +1424,13 @@ # Fit multinomial logit model (i.e. dist="MN") to phase1 data library("MGLM") -m0 <- MGLMreg(as.matrix(rotaBB.df[phase1,order])~ -1 + X[phase1,], dist="MN") +m0 <- MGLMreg(as.matrix(rotaBB.df[phase1,order])~ -1 + X[phase1,], dist="MN") @ <<>>= # Set threshold and option object h <- 2 @ -As described in \citet{hoehle2010} we can try to detect a specific shift in the intercept coefficients of the model. For example, a multiplicative shift of factor 7 in the example below, in the odds of each of the four age categories against the reference category is modelled by changing the intercept value of each category. +As described in \citet{hoehle2010} we can try to detect a specific shift in the intercept coefficients of the model. For example, a multiplicative shift of factor 7 in the example below, in the odds of each of the four age categories against the reference category is modelled by changing the intercept value of each category. Based on this, the \textit{in-control} and \textit{out-of-control} proportions are easily computed using the \code{predict} function for \code{MGLMreg} objects. <>= @@ -1443,7 +1443,7 @@ @ <>= m1 <- m0 -# Out-of control model: shift in all intercept coeffs +# Out-of control model: shift in all intercept coeffs m1$coefficients[1,] <- m0$coefficients[1,] + log(2) # Proportion over time for phase2 based on fitted model (re-order back) pi0 <- t(predict(m0, newdata=X[phase2,])[,reorder]) @@ -1452,13 +1452,13 @@ For applying the \code{categoricalCUSUM} function one needs to define a compatible wrapper function for the multinomial as in the binomial example. -With $\bm{\pi}^0$ and $\bm{\pi}^1$ in place one only needs to define a wrapper function, which defines the PMF of the sampling distribution -- in this case the multinomial -- in a \code{categoricalCUSUM} compatible way. +With $\bm{\pi}^0$ and $\bm{\pi}^1$ in place one only needs to define a wrapper function, which defines the PMF of the sampling distribution -- in this case the multinomial -- in a \code{categoricalCUSUM} compatible way. 
<>= dfun <- function(y, size, mu, log = FALSE) { return(dmultinom(x = y, size = size, prob = mu, log = log)) } -control <- list(range = seq(nrow(rotaBB))[phase2], h = h, pi0 = pi0, +control <- list(range = seq(nrow(rotaBB))[phase2], h = h, pi0 = pi0, pi1 = pi1, ret = "value", dfun = dfun) surv <- categoricalCUSUM(rotaBB,control=control) @ @@ -1466,7 +1466,7 @@ #Number of MC samples nSamples <- 1e4 -#Do MC +#Do MC simone.stop <- function(sts, control) { phase2Times <- seq(nrow(sts))[phase2] #Generate new phase2 data from the fitted in control model @@ -1495,68 +1495,68 @@ @ -With $\bm{\pi}^0$ and $\bm{\pi}^1$ in place one only needs to define a wrapper function, which defines the PMF of the sampling distribution -- in this case the multinomial -- in a \code{categoricalCUSUM} compatible way. +With $\bm{\pi}^0$ and $\bm{\pi}^1$ in place one only needs to define a wrapper function, which defines the PMF of the sampling distribution -- in this case the multinomial -- in a \code{categoricalCUSUM} compatible way. <>= <> @ The resulting CUSUM statistic $C_t$ as a function of time is shown in Figure~\ref{fig:ct}(a). The first time an aberration is detected is July 2009. Using 10000 Monte Carlo simulations we estimate that with the chosen threshold $h=2$ the probability for a false alarm within the 60 time points of \code{phase2} is 0.02. -As the above example shows, the LR based categorical CUSUM is rather flexible in handling any type of multivariate GLM modeling to specify the \textit{in-control} and \textit{out-of-control} proportions. However, it requires a direction of the change to be specified -- for which detection is optimal. One sensitive part of such monitoring is the fit of the multinomial distribution to a multivariate time series of proportions, which usually exhibit extra dispersion when compared to the multinomial. For example comparing the AIC between the multinomial logit-model and a Dirichlet-multinomial model with $\alpha_{ti} = \exp(\bm{x}_t^\top\bm{\beta})$~\citep{MGLM} shows that overdispersion is present. -The Dirichlet distribution is the multicategorical equivalent of the beta-binomial distribution. We exemplify its use in the code below. +As the above example shows, the LR based categorical CUSUM is rather flexible in handling any type of multivariate GLM modeling to specify the \textit{in-control} and \textit{out-of-control} proportions. However, it requires a direction of the change to be specified -- for which detection is optimal. One sensitive part of such monitoring is the fit of the multinomial distribution to a multivariate time series of proportions, which usually exhibit extra dispersion when compared to the multinomial. For example comparing the AIC between the multinomial logit-model and a Dirichlet-multinomial model with $\alpha_{ti} = \exp(\bm{x}_t^\top\bm{\beta})$~\citep{MGLM} shows that overdispersion is present. +The Dirichlet distribution is the multicategorical equivalent of the beta-binomial distribution. We exemplify its use in the code below. <>= -m0.dm <- MGLMreg(as.matrix(rotaBB.df[phase1, 1:5]) ~ -1 + X[phase1, ], +m0.dm <- MGLMreg(as.matrix(rotaBB.df[phase1, 1:5]) ~ -1 + X[phase1, ], dist = "DM") c(m0$AIC, m0.dm$AIC) @ Hence, the above estimated false alarm probability might be too low for the actual monitoring problem, because the variation in the time series is larger than implied by the multinomial. Hence, it appears prudent to repeat the analysis using the more flexible Dirichlet-multinomial model. 
This is straightforward with \code{categoricalCUSUM} once the \textit{out-of-control} proportions are specified in terms of the model. Such a specification is, however, hampered by the fact that the two models use different parametrizations. -For performing monitoring in this new setting we first need to calculate the $\alpha$'s of the multinomial-Dirichlet for the \textit{in-control} and \textit{out-of-control} +For performing monitoring in this new setting we first need to calculate the $\alpha$'s of the multinomial-Dirichlet for the \textit{in-control} and \textit{out-of-control} distributions. <>= delta <- 2 -m1.dm <- m0.dm -m1.dm$coefficients[1, ] <- m0.dm$coefficients[1, ] + +m1.dm <- m0.dm +m1.dm$coefficients[1, ] <- m0.dm$coefficients[1, ] + c(-delta, rep(delta/4, 4)) alpha0 <- exp(X[phase2,] %*% m0.dm$coefficients) alpha1 <- exp(X[phase2,] %*% m1.dm$coefficients) - + dfun <- function(y, size, mu, log = FALSE) { dLog <- ddirm(t(y), t(mu)) - if (log) { return(dLog) } else { return(exp(dLog)) } + if (log) { return(dLog) } else { return(exp(dLog)) } } h <- 2 -control <- list(range = seq(nrow(rotaBB))[phase2], h = h, - pi0 = t(alpha0), pi1 = t(alpha1), +control <- list(range = seq(nrow(rotaBB))[phase2], h = h, + pi0 = t(alpha0), pi1 = t(alpha1), ret = "value", dfun = dfun) surv.dm <- categoricalCUSUM(rotaBB, control = control) @ <>= # Change intercept in the first class (for DM all 5 classes are modeled) delta <- 2 -m1.dm <- m0.dm -m1.dm$coefficients[1,] <- m0.dm$coefficients[1,] + +m1.dm <- m0.dm +m1.dm$coefficients[1,] <- m0.dm$coefficients[1,] + c(-delta,rep(delta/4,4)) # Calculate the alphas of the multinomial-Dirichlet in the two cases alpha0 <- exp(X[phase2,] %*% m0.dm$coefficients) alpha1 <- exp(X[phase2,] %*% m1.dm$coefficients) - + # Use alpha vector as mu magnitude # (not possible to compute it from mu and size) dfun <- function(y, size, mu, log=FALSE) { dLog <- ddirm(t(y), t(mu)) - if (log) { return(dLog) } else {return(exp(dLog))} + if (log) { return(dLog) } else {return(exp(dLog))} } # Threshold h <- 2 -control <- list(range=seq(nrow(rotaBB))[phase2],h=h,pi0=t(alpha0), +control <- list(range=seq(nrow(rotaBB))[phase2],h=h,pi0=t(alpha0), pi1=t(alpha1), ret="value",dfun=dfun) surv.dm <- categoricalCUSUM(rotaBB,control=control) @ @@ -1572,7 +1572,7 @@ \hspace{-1em} \subfloat[]{ -<>= +<>= surv@observed[,1] <- 0 surv@multinomialTS <- FALSE surv.dm@observed[,1] <- 0 @@ -1580,29 +1580,29 @@ y.max <- max(observed(surv.dm[,1]),upperbound(surv.dm[,1]),observed(surv[,1]),upperbound(surv[,1]),na.rm=TRUE) plotOpts3 <- modifyList(plotOpts,list(x=surv[,1],ylim=c(0,y.max),ylab=expression(C[t]),xlab="")) plotOpts3$legend.opts <- list(x="topleft",bty="n",legend="R",lty=1,lwd=line.lwd,col=alarm.symbol$col,horiz=TRUE,cex=cex.leg) -do.call("plot",plotOpts3) +do.call("plot",plotOpts3) lines( c(0,1e99), rep(h,2),lwd=2,col="darkgray",lty=1) par(family="Times") mtext(side=1,text="Time (weeks)", - las=0,line=3, cex=cex.text) + las=0,line=3, cex=cex.text) @ - + \includegraphics[width=9cm]{plots/monitoringCounts-ctPlot1.pdf} } \hspace{-3em} \subfloat[]{ -<>= +<>= plotOpts3 <- modifyList(plotOpts,list(x=surv.dm[,1],ylim=c(0,y.max),ylab=expression(C[t]),xlab="")) plotOpts3$legend.opts <- list(x="topleft",bty="n",legend="R",lty=1,lwd=line.lwd,col=alarm.symbol$col,horiz=TRUE,cex=cex.text) y.max <- max(observed(surv.dm[,1]),upperbound(surv.dm[,1]),observed(surv[,1]),upperbound(surv[,1]),na.rm=TRUE) -do.call("plot",plotOpts3) +do.call("plot",plotOpts3) lines( c(0,1e99), rep(h,2),lwd=2,col="darkgray",lty=1) 
par(family="Times") mtext(side=1,text="Time (weeks)", - las=0,line=3, cex=cex.text) -@ + las=0,line=3, cex=cex.text) +@ \includegraphics[width=9cm]{plots/monitoringCounts-ctPlot2.pdf} } \caption{Categorical CUSUM statistic $C_t$. Once $C_t>\Sexpr{h}$ an alarm is sounded and the statistic is reset. In (a) surveillance uses the multinomial distribution and in (b) surveillance uses the Dirichlet-multinomial distribution.} @@ -1610,33 +1610,33 @@ \end{figure} -The resulting CUSUM statistic $C_t$ using the Dirichlet multinomial distribution is shown in Figure~\ref{fig:ct}(b). -We notice a rather similar behavior even though the shift-type specified by this model is slightly different than in the model of Figure~\ref{fig:ct}(a). +The resulting CUSUM statistic $C_t$ using the Dirichlet multinomial distribution is shown in Figure~\ref{fig:ct}(b). +We notice a rather similar behavior even though the shift-type specified by this model is slightly different than in the model of Figure~\ref{fig:ct}(a). \subsubsection{Categorical data in routine surveillance} -The multidimensionality of data available in public health surveillance creates many opportunities for the application of categorical time series: one could, e.g., look at the sex ratio of cases of a given disease, at the age group distribution, -at the regions sending data, etc. If one is interested in monitoring with respect to a categorical variable, a choice has to be made between monitoring each time series individually, -for instance a time series of \textit{Salmonella} cases for each age category, or to monitor the distribution of cases with respect to that factor jointly \textit{via} \code{categoricalCUSUM}. A downside of the latter -solution is that one has to specify the change parameter \code{R} in advance, which can be quite a hurdle if one has no pre-conceived idea of what could happen for, say, the age shift after the introduction of a vaccine. Alternatively, one could employ an ensemble of monitors or monitor an aggregate. However, more straightforward applications could be found in the (binomial) surveillance of positive diagnostics if one were to obtain data about tests performed by laboratories and not only about confirmed cases. An alternative would be to apply +The multidimensionality of data available in public health surveillance creates many opportunities for the application of categorical time series: one could, e.g., look at the sex ratio of cases of a given disease, at the age group distribution, +at the regions sending data, etc. If one is interested in monitoring with respect to a categorical variable, a choice has to be made between monitoring each time series individually, +for instance a time series of \textit{Salmonella} cases for each age category, or to monitor the distribution of cases with respect to that factor jointly \textit{via} \code{categoricalCUSUM}. A downside of the latter +solution is that one has to specify the change parameter \code{R} in advance, which can be quite a hurdle if one has no pre-conceived idea of what could happen for, say, the age shift after the introduction of a vaccine. Alternatively, one could employ an ensemble of monitors or monitor an aggregate. However, more straightforward applications could be found in the (binomial) surveillance of positive diagnostics if one were to obtain data about tests performed by laboratories and not only about confirmed cases. An alternative would be to apply \code{farringtonFlexible} while using the number of tests as \code{populationOffset}. 
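A minimal sketch of this last idea, assuming a hypothetical \code{sts} object \code{salmTests} whose \code{observed} slot holds the weekly number of positive tests and whose \code{population} slot holds the weekly number of tests performed by the laboratories; the control settings broadly mirror those used for the \textit{S. Newport} example in Section~\ref{sec:routine}, and \code{populationOffset = TRUE} makes \code{farringtonFlexible} model the positives relative to the testing volume:

<<sketchTestsAsOffset, eval=FALSE>>=
## 'salmTests' is a hypothetical sts object:
##   observed(salmTests)   -- weekly number of positive tests
##   population(salmTests) -- weekly number of tests performed
## Monitoring starts once b = 4 years of reference data are available.
controlTests <- list(range = (4 * 52 + 1):nrow(salmTests),
                     populationOffset = TRUE,  # use the tests as denominator
                     noPeriods = 10, b = 4, w = 3, weightsThreshold = 2.58,
                     pastWeeksNotIncluded = 26, pThresholdTrend = 1,
                     alpha = 0.05)
survTests <- farringtonFlexible(salmTests, control = controlTests)
plot(survTests)
@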
\subsubsection{Similar methods in the package} The package also offers another CUSUM method suitable for binary data, \code{pairedbinCUSUM} that implements the method introduced by~\citet{Steiner1999}, which does not, however, take overdispersion into account as well as \code{glrnb}. The algorithm \code{rogerson} also supports the analysis of binomial data. See Table~\ref{table:ref} for the corresponding references. \subsection{Other algorithms implemented in the package} -We conclude this description of surveillance methods by giving an overview of all algorithms implemented in the package with the corresponding references in Table~\ref{table:ref}. +We conclude this description of surveillance methods by giving an overview of all algorithms implemented in the package with the corresponding references in Table~\ref{table:ref}. One can refer to the relative reference articles and to the reference manual of the package for more information about each method. - + Criteria for choosing a method in practice are numerous. First one needs to ponder on the amount of historical data at hand -- for instance the EARS methods only need data for the last -timepoints whereas the Farrington methods use data up to $b$ years in the past. Then one should consider the amount of past data used by the algorithm -- historical reference methods use only a subset of the past data, namely +timepoints whereas the Farrington methods use data up to $b$ years in the past. Then one should consider the amount of past data used by the algorithm -- historical reference methods use only a subset of the past data, namely the timepoints located around the same timepoint in the past years, whereas other methods use all past data included in the reference data. This can be a criterion of choice since one can prefer using all available data. It is also important to decide whether one wants to detect one-timepoint aberration or more prolonged shifts. -And lastly, an important criterion is how much work needs to be done for finetuning the algorithm for each specific time series. +And lastly, an important criterion is how much work needs to be done for finetuning the algorithm for each specific time series. -The package on the one hand provides the means for analysing nearly all type of surveillance data +The package on the one hand provides the means for analysing nearly all type of surveillance data and on the other hand makes the comparison of algorithms possible. This is useful in practical applications when those algorithms are implemented into routine use, which will be the topic of Section~\ref{sec:routine}. @@ -1674,21 +1674,21 @@ \label{sec:3} Combining \pkg{surveillance} with other \proglang{R} packages and programs is easy, allowing the integration of -the aberration detection into a comprehensive surveillance system to be used in routine practice. In our opinion, such a surveillance system has to at least support the following process: loading data from local databases, analysing them within \pkg{surveillance} -and sending the results of this analysis to the end-user who is typically an epidemiologist in charge of the specific pathogen. This section exemplifies the integration of the package +the aberration detection into a comprehensive surveillance system to be used in routine practice. 
In our opinion, such a surveillance system has to at least support the following process: loading data from local databases, analysing them within \pkg{surveillance} +and sending the results of this analysis to the end-user who is typically an epidemiologist in charge of the specific pathogen. This section exemplifies the integration of the package into a whole analysis stack, first through the introduction of a simple workflow from data query to a \code{Sweave}~\citep{sweave} or \pkg{knitr}~\citep{knitr} report of signals, and secondly through the presentation of the more elaborate system in use at the German Robert Koch Institute. \subsection{A simple surveillance system} Suppose you have a database with surveillance time series but little resources to build a surveillance system encompassing all the above stages. Using \proglang{R} and \code{Sweave} or \code{knitr} for \LaTeX~you can still set up - a simple surveillance analysis without having to do everything by hand. You only need to input the data into \proglang{R} and create \code{sts} objects for each time series of interest - as explained thoroughly in~\citet{hoehle-mazick-2010}. Then, after choosing a surveillance algorithm, say \code{farringtonFlexible}, and - feeding it with the appropriate \code{control} argument, you can get a \code{sts} object with upperbounds and alarms for each of your time series of interest over the \code{range} - supplied in \code{control}. For defining the range automatically one could use the \proglang{R} function \code{Sys.Date()} to get today's date. - These steps can be introduced as a code chunk in a \code{Sweave} or \code{knitr} code that will translate it into a report that you can send to the epidemiologists in charge of the respective pathogen whose cases are monitored. - - Below is an example of a short code segment showing the analysis of the \textit{S. Newport} weekly counts of cases in the German federal states Baden-W\"{u}rttemberg and North Rhine-Westphalia -with the improved method implemented in \code{farringtonFlexible}. The package provides a \code{toLatex} method for \code{sts} objects that produces a table with the observed number of counts and upperbound for each column in + a simple surveillance analysis without having to do everything by hand. You only need to input the data into \proglang{R} and create \code{sts} objects for each time series of interest + as explained thoroughly in~\citet{hoehle-mazick-2010}. Then, after choosing a surveillance algorithm, say \code{farringtonFlexible}, and + feeding it with the appropriate \code{control} argument, you can get a \code{sts} object with upperbounds and alarms for each of your time series of interest over the \code{range} + supplied in \code{control}. For defining the range automatically one could use the \proglang{R} function \code{Sys.Date()} to get today's date. + These steps can be introduced as a code chunk in a \code{Sweave} or \code{knitr} code that will translate it into a report that you can send to the epidemiologists in charge of the respective pathogen whose cases are monitored. + + Below is an example of a short code segment showing the analysis of the \textit{S. Newport} weekly counts of cases in the German federal states Baden-W\"{u}rttemberg and North Rhine-Westphalia +with the improved method implemented in \code{farringtonFlexible}. 
The package provides a \code{toLatex} method for \code{sts} objects that produces a table with the observed number of counts and upperbound for each column in \code{observed}, where alarms can be highlighted by for instance bold text. The resulting table is shown in Table~\ref{tableResults}. <>= data("salmNewport") @@ -1696,7 +1696,7 @@ rangeAnalysis <- (today - 4):today in2013 <- which(isoWeekYear(epoch(salmNewport))$ISOYear == 2013) -algoParameters <- list(range = rangeAnalysis, noPeriods = 10, +algoParameters <- list(range = rangeAnalysis, noPeriods = 10, populationBool = FALSE, b = 4, w = 3, weightsThreshold = 2.58, pastWeeksNotIncluded = 26, pThresholdTrend = 1, @@ -1709,21 +1709,21 @@ start <- isoWeekYear(epoch(salmNewport)[range(range)[1]]) end <- isoWeekYear(epoch(salmNewport)[range(range)[2]]) -caption <- paste("Results of the analysis of reported S. Newport +caption <- paste("Results of the analysis of reported S. Newport counts in two German federal states for the weeks W-", start$ISOWeek, "-", start$ISOYear, " - W-", end$ISOWeek, "-", end$ISOYear, " performed on ", Sys.Date(), - ". Bold upperbounds (UB) indicate weeks with alarms.", + ". Bold upperbounds (UB) indicate weeks with alarms.", sep="") toLatex(results, caption = caption) @ <>= # In this example the sts-object already exists. -# Supply the code with the date of a Monday and look for the +# Supply the code with the date of a Monday and look for the # corresponding index in the sts-object today <- which(epoch(salmNewport)==as.Date("2013-12-23")) -# The analysis will be performed for the given week -# and the 4 previous ones +# The analysis will be performed for the given week +# and the 4 previous ones range <- (today-4):today in2013 <- which(isoWeekYear(epoch(salmNewport))$ISOYear==2013) # Control argument for using the improved method @@ -1738,11 +1738,11 @@ # Export the results as a tex table start <- isoWeekYear(epoch(salmNewport)[range(range)[1]]) end <- isoWeekYear(epoch(salmNewport)[range(range)[2]]) -caption <- paste("Results of the analysis of reported S. Newport +caption <- paste("Results of the analysis of reported S. Newport counts in two German federal states for the weeks W-", start$ISOWeek," ",start$ISOYear," - W-",end$ISOWeek, " ",end$ISOYear," performed on ",Sys.Date(), - ". Bold upperbounds (thresholds) indicate weeks with alarms.", + ". Bold upperbounds (thresholds) indicate weeks with alarms.", sep="") toLatex(results, table.placement="h", size = "normalsize", sanitize.text.function = identity, @@ -1753,21 +1753,21 @@ caption=caption,label="tableResults") @ -The advantage of this approach is that it can be made automatic. The downside of such a system is that the report is not interactive, for instance one cannot click on the cases and get the linelist. Nevertheless, this is a workable solution in -many cases -- especially when human and financial resources are narrow. +The advantage of this approach is that it can be made automatic. The downside of such a system is that the report is not interactive, for instance one cannot click on the cases and get the linelist. Nevertheless, this is a workable solution in +many cases -- especially when human and financial resources are narrow. In the next section, we present a more advanced surveillance system built on the package. 
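The \code{range} above was defined relative to a hard-coded Monday; to run such a report automatically one could, as mentioned above, derive it from \code{Sys.Date()} instead. The following is only a sketch and assumes the \code{sts} object actually contains data up to the current week (which is not the case for \code{salmNewport}, hence \code{eval=FALSE}):

<<sketchAutoRange, eval=FALSE>>=
## Monday of the current ISO week ("%u": weekday as 1 = Monday, ..., 7 = Sunday)
lastMonday <- Sys.Date() - (as.numeric(format(Sys.Date(), "%u")) - 1)
## Index of that week in the sts object,
## then monitor the current week and the 4 previous ones
today <- which(epoch(salmNewport) == lastMonday)
rangeAnalysis <- (today - 4):today
@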
\subsection{Automatic detection of outbreaks at the Robert Koch Institute} \label{sec:RKI} The package \pkg{surveillance} was used as a core building block for designing and implementing the automated outbreak detection system at the RKI in Germany~\citep{Dirk}. The text below describes the system as it was in early 2014. -Due to the Infection Protection Act (IfSG) the RKI daily receives over 1,000 notifiable disease reports. The system analyses about half a million time series per day to identify possible aberrations in the reported number of cases. +Due to the Infection Protection Act (IfSG) the RKI daily receives over 1,000 notifiable disease reports. The system analyses about half a million time series per day to identify possible aberrations in the reported number of cases. Structurally, it consists of two components: an analytical process written in \proglang{R} that daily monitors the data and a reporting component that compiles and communicates the results to the epidemiologists. The analysis task in the described version of the system relied on \pkg{surveillance} and three other \proglang{R} packages, namely \pkg{data.table}, \pkg{RODBC} and \pkg{testthat} as described in the following. The data-backend is an OLAP-system~\citep{SSAS} and relational databases, which are queried using \pkg{RODBC}~\citep{rodbc2013}. The case reports are then rapidly aggregated into univariate time series using \pkg{data.table}~\citep{datatable2013}. To each time series we apply the \code{farringtonFlexible} algorithm on univariate \code{sts} objects and store the analysis results in another SQL-database. We make intensive use of \pkg{testthat}~\citep{testthat2013} for automatic testing of the component. -Although \proglang{R} is not the typical language to write bigger software components for production, choosing \proglang{R} in combination with \pkg{surveillance} enabled us to quickly develop the analysis workflow. We can hence report positive experience using \proglang{R} also for larger software components in production. +Although \proglang{R} is not the typical language to write bigger software components for production, choosing \proglang{R} in combination with \pkg{surveillance} enabled us to quickly develop the analysis workflow. We can hence report positive experience using \proglang{R} also for larger software components in production. The reporting component was realized using Microsoft Reporting Services~\citep{SSRS}, because this technology is widely used within the RKI. It allows quick development of reports and works well with existing Microsoft Office tools, which the end-user, the epidemiologist, is used to. For example, one major requirement by the epidemiologists was to have the results compiled as Excel documents. @@ -1801,7 +1801,7 @@ \section*{Acknowledgments} -The authors would like to express their gratitude to all contributors to the package, in particular +The authors would like to express their gratitude to all contributors to the package, in particular Juliane Manitz, University of G\"{o}ttingen, Germany, for her work on the \texttt{boda} code and Angela Noufaily, The Open University, Milton Keynes, UK, for providing us the code used in her article that we extended for \texttt{farringtonFlexible}. The work of M. Salmon was financed by a PhD grant of the RKI. 
\bibliography{monitoringCounts,references} Binary files /tmp/tmpFLLNnA/6m_V_jmLxa/r-cran-surveillance-1.12.2/inst/doc/surveillance.pdf and /tmp/tmpFLLNnA/NJ1xxKg7_3/r-cran-surveillance-1.13.0/inst/doc/surveillance.pdf differ Binary files /tmp/tmpFLLNnA/6m_V_jmLxa/r-cran-surveillance-1.12.2/inst/doc/twinSIR.pdf and /tmp/tmpFLLNnA/NJ1xxKg7_3/r-cran-surveillance-1.13.0/inst/doc/twinSIR.pdf differ Binary files /tmp/tmpFLLNnA/6m_V_jmLxa/r-cran-surveillance-1.12.2/inst/doc/twinstim.pdf and /tmp/tmpFLLNnA/NJ1xxKg7_3/r-cran-surveillance-1.13.0/inst/doc/twinstim.pdf differ diff -Nru r-cran-surveillance-1.12.2/inst/NEWS.Rd r-cran-surveillance-1.13.0/inst/NEWS.Rd --- r-cran-surveillance-1.12.2/inst/NEWS.Rd 2016-11-14 13:28:54.000000000 +0000 +++ r-cran-surveillance-1.13.0/inst/NEWS.Rd 2016-12-20 10:37:55.000000000 +0000 @@ -8,6 +8,76 @@ \encoding{latin1} +\section{Changes in surveillance version 1.13.0 (2016-12-20)}{ + +\subsection{NEW FEATURES}{ + \itemize{ + \item \code{earsC} now has two new arguments thanks to Howard + Burkom: the number of past time units to be used in calculation is + now not always 7, it can be chosen in the \code{baseline} parameter. + Furthermore, the \code{minSigma} parameter allows to get a threshold + in the case of sparse data. When one doesn't give any value for those + two parameters, the algorithm works like it used to. + + \item \code{animate.sts()} gained support for date labels in the + bottom \code{timeplot}. + + \item \code{stsplot_space()} and \code{animate.sts()} can now + generate incidence maps based on the population information + stored in the supplied \code{"sts"} object. + Furthermore, \code{animate.sts()} now supports time-varying + population numbers. + } +} + +\subsection{MINOR CHANGES}{ + \itemize{ + \item \code{hhh4()} guards against the misuse of + \code{family = factor("Poisson")} for univariate time series. + Previously, this resulted in a negative binomial model by + definition, but is now interpreted as \code{family = "Poisson"} + (with a warning). + } +} + +\subsection{BUG FIXES}{ + \itemize{ + \item \code{animate.sts()} now supports objects with missing values + (with a warning). Furthermore, the automatic color breaks have been + improved for incidence maps, also in \code{stsplot_space()}. + + \item The \code{as.data.frame}-method for the \code{"sts"} class, + applied to classical time-index-based \code{"sts"} objects + (\code{epochAsDate=FALSE}), ignored a \code{start} epoch different + from 1 when computing the \code{epochInPeriod} indexes. + Furthermore, the returned \code{epochInPeriod} now is a fraction of + \code{freq}, for consistency with the result for objects with + \code{epochAsDate=TRUE}. + + \item \code{simulate.hhh4()} did not handle shared overdispersion + parameters correctly. The different parameters were simply recycled + to the number of units, ignoring the factor specification from the + model's \code{family}. [spotted by Johannes Bracher] + + \item Simulations from \emph{endemic-only} \code{"hhh4"} models + with unit-specific overdispersion parameters used wrong + variances. [spotted by Johannes Bracher] + + \item \code{oneStepAhead()} predictions of \code{type} + \code{"rolling"} (or \code{"first"}) were incorrect for time points + \code{tp} (\code{tp[1]}) beyond the originally fitted time range + (in that they were based on the original time range only). + This usage of \code{oneStepAhead()} was never really supported and + is now catched when checking the \code{tp} argument. 
+ + \item \code{plot.hhh4simslist()} ignored its \code{par.settings} + argument if \code{groups=NULL} (default). + } +} + +} + + \section{Changes in surveillance version 1.12.2 (2016-11-14)}{ \subsection{NEW FEATURES}{ @@ -190,8 +260,8 @@ \subsection{NEW FEATURES}{ \itemize{ \item \code{update.epidata()} can now handle a distance matrix - \code{D} in the form of a classed \code{"Matrix"} - (suggested by George Wood). + \code{D} in the form of a classed \code{"Matrix"}. + [suggested by George Wood] \item \code{glrnb()} can now handle \code{ret="cases"} for the generalized likelihood ratio detector based on the negative binomial @@ -393,7 +463,7 @@ \item A consistency check in \code{as.epidata.default()} failed for SI-type data (and, more generally, for all data which ended with an - I-event in the last time block). Spotted by George Wood. + I-event in the last time block). [spotted by George Wood] } } @@ -525,7 +595,7 @@ (only if \code{W} or \code{tiles}, respectively, contained holes). \item Non-convergent endemic-only \code{twinstim} models - produced an error (spotted by Bing Zhang). + produced an error. [spotted by Bing Zhang] \item The \code{"owin"}-method of \code{intersectPolyCircle} could have returned a rectangle-type \code{"owin"} instead of a polygon. diff -Nru r-cran-surveillance-1.12.2/man/algo.glrnb.Rd r-cran-surveillance-1.13.0/man/algo.glrnb.Rd --- r-cran-surveillance-1.12.2/man/algo.glrnb.Rd 2016-05-17 20:08:16.000000000 +0000 +++ r-cran-surveillance-1.13.0/man/algo.glrnb.Rd 2016-11-29 09:36:33.000000000 +0000 @@ -40,8 +40,9 @@ fine-tune the model one can instead specify \code{mu0} as a list with two components: \describe{ - \item{\code{S}}{number of harmonics to include} - \item{\code{trend}}{include a term \code{t} in the GLM model} + \item{\code{S}}{integer number of harmonics to include + (typically 1 or 2)} + \item{\code{trend}}{A Boolean indicating whether to include a term \code{t} in the GLM model} } The fitting is controlled by the \code{estimateGLRNbHook} function. The in-control mean model is re-fitted after every @@ -105,7 +106,7 @@ \value{ \code{algo.glrpois} simply calls \code{algo.glrnb} with \code{control$alpha} set to 0. - + \code{algo.glrnb} returns a list of class \code{survRes} (surveillance result), which includes the alarm value for recognizing an outbreak (1 for alarm, 0 for no alarm), diff -Nru r-cran-surveillance-1.12.2/man/algo.outbreakP.Rd r-cran-surveillance-1.13.0/man/algo.outbreakP.Rd --- r-cran-surveillance-1.12.2/man/algo.outbreakP.Rd 2016-11-16 23:13:39.000000000 +0000 +++ r-cran-surveillance-1.13.0/man/algo.outbreakP.Rd 2016-12-08 12:18:20.000000000 +0000 @@ -76,7 +76,7 @@ The code is an extended R port of the Java code by Marianne \enc{Frisén}{Frisen} and Linus \enc{Schiöler}{Schioeler} from the CASE project available under the GNU GPL License v3. See - \url{http://case.folkhalsomyndigheten.se/} for further details on the CASE + \url{https://case.folkhalsomyndigheten.se/} for further details on the CASE project. A manual on how to use an Excel implementation of the method is available at \url{http://economics.handels.gu.se/english/Units+and+Centra/statistical_research_unit/software}. 
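Taken together, the \code{earsC} NEWS entry above and the updated help page further below suggest a call along the following lines; this is only a sketch with illustrative values for the two new arguments, where \code{stsObj} stands for any univariate \code{sts} object (e.g., the simulated series used in the \code{earsC} examples):

<<sketchEarsCNewArgs, eval=FALSE>>=
res <- earsC(stsObj,
             control = list(method = "C1",
                            baseline = 14, # number of past time points used (no longer fixed at 7)
                            minSigma = 1,  # yields a positive threshold even if all baseline counts are zero
                            alpha = 0.001))
plot(res)
@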
diff -Nru r-cran-surveillance-1.12.2/man/categoricalCUSUM.Rd r-cran-surveillance-1.13.0/man/categoricalCUSUM.Rd --- r-cran-surveillance-1.12.2/man/categoricalCUSUM.Rd 2016-05-17 20:08:16.000000000 +0000 +++ r-cran-surveillance-1.13.0/man/categoricalCUSUM.Rd 2016-11-28 14:15:36.000000000 +0000 @@ -93,12 +93,17 @@ #Fit beta-binomial model using GAMLSS abattoir.df <- as.data.frame(abattoir) - colnames(abattoir.df) <- c("y","t","state","alarm","n") + + #Replace the observed and epoch column names to something more convenient + dict <- c("observed"="y", "epoch"="t", "population"="n") + replace <- dict[colnames(abattoir.df)] + colnames(abattoir.df)[!is.na(replace)] <- replace[!is.na(replace)] + m.bbin <- gamlss( cbind(y,n-y) ~ 1 + t + - + sin(2*pi/52*t) + cos(2*pi/52*t) + - + sin(4*pi/52*t) + cos(4*pi/52*t), sigma.formula=~1, - family=BB(sigma.link="log"), - data=abattoir.df[phase1,c("n","y","t")]) + + sin(2*pi/52*t) + cos(2*pi/52*t) + + + sin(4*pi/52*t) + cos(4*pi/52*t), sigma.formula=~1, + family=BB(sigma.link="log"), + data=abattoir.df[phase1,c("n","y","t")]) #CUSUM parameters R <- 2 #detect a doubling of the odds for a test being positive diff -Nru r-cran-surveillance-1.12.2/man/earsC.Rd r-cran-surveillance-1.13.0/man/earsC.Rd --- r-cran-surveillance-1.12.2/man/earsC.Rd 2013-03-15 23:04:19.000000000 +0000 +++ r-cran-surveillance-1.13.0/man/earsC.Rd 2016-12-03 12:45:51.000000000 +0000 @@ -2,7 +2,8 @@ \alias{earsC} \encoding{latin1} -\title{Surveillance for a count data time series using the EARS C1, C2 or C3 method.} +\title{Surveillance for a count data time series using the EARS C1, C2 + or C3 method and its extensions} \description{ % The function takes \code{range} values of the surveillance time @@ -11,103 +12,121 @@ This is then compared to the observed number of counts. If the observation is above a specific quantile of the prediction interval, then an alarm is raised. This method is especially useful - for data without many reference values, since it only needs counts from the recent past. + for data without many historic values, since it only needs counts from the recent past. % } -\usage{ earsC(sts, control = list(range = NULL, method = "C1", - alpha = 0.001)) +\usage{ +earsC(sts, control = list(range = NULL, method = "C1", + baseline = 7, minSigma = 0, + alpha = 0.001)) } \arguments{ \item{sts}{object of class sts (including the \code{observed} and the \code{state} time series) , which is to be monitored.} \item{control}{Control object \describe{ - \item{\code{range}}{Specifies the index of all timepoints which - should be tested. If \code{range} is \code{NULL} the maximum number - of possible timepoints is used. This number depends on the method chosen. - For C1 all timepoints from timepoint 8 can be assessed, for C2 from - timepoint 10 and for C3 from timepoint 12.} - \item{\code{method}}{String indicating which method to use: \cr - "C1" for EARS C1-MILD method, "C2" for EARS C2-MEDIUM method, - "C3" for EARS C3-HIGH method. By default if \code{method} is \code{NULL} C1 is chosen.} - + \item{\code{range}}{Specifies the index in the \code{sts} object of + all the timepoints which + should be monitored. 
If \code{range} is \code{NULL} the maximum number + of possible timepoints is used (this number depends on the method chosen): + \describe{ + \item{C1}{all timepoints from the observation with index \code{baseline + 1} + can be monitored,} + \item{C2}{timepoints from index \code{baseline + 3} can be + monitored,} + \item{C3}{timepoints starting from the index \code{baseline + 5} can be monitored.} + } + } + \item{\code{method}}{String indicating which method to use: \cr + \describe{ + \item{\code{"C1"}}{for EARS C1-MILD method (Default),} + \item{\code{"C2"}}{for EARS C2-MEDIUM method,} + \item{\code{"C3"}}{for EARS C3-HIGH method.} + } + See Details for further information about the methods. + } +\item{\code{baseline}}{how many time points to use for calculating the baseline, see details} +\item{\code{minSigma}}{By default 0. If \code{minSigma} is higher than 0, for C1 and +C2, the quantity zAlpha * minSigma is then the alerting threshold if the +baseline is zero. Howard Burkom suggests using a value of 0.5 or 1 for sparse data.} \item{\code{alpha}}{An approximate (two-sided) \eqn{(1-\alpha)\cdot 100\%} prediction - interval is calculated. By default if \code{alpha} is \code{NULL} 0.001 is assumed + interval is calculated. By default if \code{alpha} is \code{NULL} + the value 0.001 is assumed for C1 and C2 whereas 0.025 is assumed for C3. These different choices are the one made at the CDC.} % } } } \details{ - The three methods are different in terms of baseline used for calculation of + The three methods are different in terms of baseline used for calculation of the expected value and in terms of method for calculating the expected value: \itemize{ - \item in C1 and C2 the expected value is the moving average of counts over - the sliding window of the baseline and the prediction interval depends on the - standard derivation of counts over this window. They can be considered as + \item in C1 and C2 the expected value is the moving average of counts over + the sliding window of the baseline and the prediction interval depends on the + standard derivation of the observed counts in this window. They can be considered as Shewhart control charts with a small sample used for calculations. - \item in C3 the expected value is based on the sum over 3 timepoints - (assessed timepoints and the two previous timepoints) of the discrepancy - between observations and predictions, predictions being calculated with C2 method. -This method shares a common point with CUSUM method -(adding discrepancies between predictions and observations over several timepoints) + \item in C3 the expected value is based on the sum over 3 timepoints + (assessed timepoints and the two previous timepoints) of the discrepancy + between observations and predictions, predictions being calculated with + the C2 method. +This method has similarities with a CUSUM method due to it +adding discrepancies between predictions and observations over several timepoints, but is not a CUSUM (sum over 3 timepoints, not accumulation over a whole range), - even if it sometimes presented as such. + even if it sometimes is presented as such. } -Here is what the function does for each method: +Here is what the function does for each method, see the literature + sources for further details: \enumerate{ -\item For C1 the baseline are the 7 timepoints before the assessed timepoint t, - t-7 to t-1. The expected value is the mean of the baseline. 
An approximate +\item For C1 the baseline are the \code{baseline} (default 7) timepoints before the assessed timepoint t, + t-\code{baseline} to t-1. The expected value is the mean of the baseline. An approximate (two-sided) \eqn{(1-\alpha)\cdot 100\%} prediction interval is calculated based on the - assumption that the difference between the expected value and the observed + assumption that the difference between the expected value and the observed value divided by the standard derivation of counts over the sliding window, called \eqn{C_1(t)}, follows a standard normal distribution in the absence of outbreaks: \deqn{C_1(t)= \frac{Y(t)-\bar{Y}_1(t)}{S_1(t)},} where -\deqn{\bar{Y}_1(t)= \frac{1}{7} \sum_{i=t-1}^{t-7} Y(i)} +\deqn{\bar{Y}_1(t)= \frac{1}{\code{baseline}} \sum_{i=t-1}^{t-\code{baseline}} Y(i)} and -\deqn{ S^2_1(t)= \frac{1}{6} \sum_{i=t-1}^{t-7} [Y(i) - \bar{Y}_1(i)]^2.} +\deqn{ S^2_1(t)= \frac{1}{6} \sum_{i=t-1}^{t-\code{baseline}} [Y(i) - \bar{Y}_1(i)]^2.} Then under the null hypothesis of no outbreak, - \deqn{C_1(t) \mathcal \sim {N}(0,1)} + \deqn{C_1(t) \mathcal \> \sim \> {N}(0,1)} An alarm is raised if \deqn{C_1(t)\ge z_{1-\alpha}} -with \eqn{z_{1-\alpha}} the \eqn{(1-\alpha)^{th}} quantile of the centered -reduced normal law. \cr +with \eqn{z_{1-\alpha}} the \eqn{(1-\alpha)^{th}} quantile of the standard normal distribution. \cr The upperbound \eqn{U_1(t)} is then defined by: \deqn{U_1(t)= \bar{Y}_1(t) + z_{1-\alpha}S_1(t).} -\item C2 is very close to C1 apart from a 2-day lag in the baseline definition. -Indeed for C2 the baseline are 7 timepoints with a 2-day lag before the assessed -timepoint t, t-9 to t-3. The expected value is the mean of the baseline. An approximate +\item C2 is very similar to C1 apart from a 2-day lag in the baseline definition. +In other words the baseline for C2 is \code{baseline} (Default: 7) timepoints with a 2-day lag before the monitored +timepoint t, i.e. \eqn{(t-\code{baseline}-2)} to \eqn{t-3}. The expected value is the mean of the baseline. An approximate (two-sided) \eqn{(1-\alpha)\cdot 100\%} prediction interval is calculated based on the - assumption that the difference between the expected value and the observed + assumption that the difference between the expected value and the observed value divided by the standard derivation of counts over the sliding window, called \eqn{C_2(t)}, follows a standard normal distribution in the absence of outbreaks: -\deqn{C_2(t)= \frac{Y(t)-\bar{Y}_2(t)}{S_2(t)},} +\deqn{C_2(t)= \frac{Y(t)-\bar{Y}_2(t)}{S_2(t)},} where -\deqn{\bar{Y}_2(t)= \frac{1}{7} \sum_{i=t-3}^{t-9} Y(i)} +\deqn{\bar{Y}_2(t)= \frac{1}{\code{baseline}} \sum_{i=t-3}^{t-\code{baseline}-2} Y(i)} and -\deqn{ S^2_2(t)= \frac{1}{6} \sum_{i=t-3}^{t-9} [Y(i) - \bar{Y}_2(i)]^2.} +\deqn{ S^2_2(t)= \frac{1}{\code{baseline}-1} \sum_{i=t-3}^{t-\code{baseline}-2} [Y(i) - \bar{Y}_2(i)]^2.} Then under the null hypothesis of no outbreak, \deqn{C_2(t) \mathcal \sim {N}(0,1)} An alarm is raised if \deqn{C_2(t)\ge z_{1-\alpha},} -with \eqn{z_{1-\alpha}} the \eqn{(1-\alpha)^{th}} quantile of the centered -reduced normal law. \cr +with \eqn{z_{1-\alpha}} the \eqn{(1-\alpha)^{th}} quantile of the standard normal distribution. \cr + +The upperbound \eqn{U_{2}(t)} is then defined by: -The upperbound \eqn{U_2(t)} is then defined by: - \deqn{U_2(t)= \bar{Y}_2(t) + z_{1-\alpha}S_2(t).} - - \item C3 is quite different from the two other methods but it is based on C2. 
+ \deqn{U_{2}(t)= \bar{Y}_{2}(t) + z_{1-\alpha}S_{2}(t).} + + \item C3 is quite different from the two other methods, but it is based on C2. Indeed it uses \eqn{C_2(t)} from timepoint t and the two previous timepoints. - This means the baseline are timepoints t-11 to t-3. - The statistic \eqn{C_3(t)} is the sum of discrepancies between observations and + This means the baseline consists of the timepoints \eqn{t-(\code{baseline}+4)} to \eqn{t-3}. + The statistic \eqn{C_3(t)} is the sum of discrepancies between observations and predictions. -\deqn{C_3(t)= \sum_{i=t}^{t-2} \max(0,C_2(i)-1)} +\deqn{C_3(t)= \sum_{i=t}^{t-2} \max(0,C_2(i)-1)} Then under the null hypothesis of no outbreak, \deqn{C_3(t) \mathcal \sim {N}(0,1)} An alarm is raised if \deqn{C_3(t)\ge z_{1-\alpha},} -with \eqn{z_{1-\alpha}} the \eqn{(1-\alpha)^{th}} quantile of the centered -reduced normal law. \cr +with \eqn{z_{1-\alpha}} the \eqn{(1-\alpha)^{th}} quantile of the standard normal distribution. \cr The upperbound \eqn{U_3(t)} is then defined by: \deqn{U_3(t)= \bar{Y}_2(t) + S_2(t)\left(z_{1-\alpha}-\sum_{i=t-1}^{t-2} \max(0,C_2(i)-1)\right).} @@ -124,28 +143,31 @@ disProgObj <- sim.pointSource(p = 0.99, r = 0.5, length = 208, A = 1, alpha = 1, beta = 0, phi = 0, frequency = 1, state = NULL, K = 1.7) -stsObj = disProg2sts( disProgObj) +stsObj <- disProg2sts( disProgObj) -#Call function and show result -res1 <- earsC(stsObj, control = list(range = 20:208,method="C1")) -plot(res1,legend.opts=list(horiz=TRUE,x="topright"),dx.upperbound=0) +# Call earsC function and show result +res1 <- earsC(stsObj, control = list(range = 20:208, method="C1")) +plot(res1, legend.opts=list(horiz=TRUE, x="topright")) -# compare upperbounds depending on alpha +# Compare C3 upperbounds depending on alpha res3 <- earsC(stsObj, control = list(range = 20:208,method="C3",alpha = 0.001)) -plot(res3@upperbound,t='l') +plot(upperbound(res3), type='l') res3 <- earsC(stsObj, control = list(range = 20:208,method="C3")) -lines(res3@upperbound,col='red') - - +lines(upperbound(res3), col='red') } -\author{M. Salmon} +\author{M. Salmon, H. Burkom} \keyword{classif} \source{ Fricker, R.D., Hegler, B.L, and Dunfee, D.A. (2008). Comparing syndromic surveillance detection methods: EARS versus a CUSUM-based methodology, 27:3407-3429, Statistics in medicine. + + Salmon, M., Schumacher, D. and \enc{Höhle}{Hoehle}, M. (2016): + Monitoring count time series in \R: Aberration detection in public + health surveillance. \emph{Journal of Statistical Software}, + \bold{70} (10), 1-35. \doi{10.18637/jss.v070.i10} } diff -Nru r-cran-surveillance-1.12.2/man/hhh4_simulate_plot.Rd r-cran-surveillance-1.13.0/man/hhh4_simulate_plot.Rd --- r-cran-surveillance-1.12.2/man/hhh4_simulate_plot.Rd 2015-10-16 12:33:42.000000000 +0000 +++ r-cran-surveillance-1.13.0/man/hhh4_simulate_plot.Rd 2016-12-20 10:20:18.000000000 +0000 @@ -27,7 +27,8 @@ \method{plot}{hhh4simslist}(x, type = c("size", "time"), ..., groups = NULL, par.settings = list()) -plotHHH4sims_size(x, horizontal = TRUE, trafo = NULL, observed = TRUE, ...) +plotHHH4sims_size(x, horizontal = TRUE, trafo = NULL, observed = TRUE, + names = base::names(x), ...) plotHHH4sims_time(x, average = mean, individual = length(x) == 1, conf.level = if (individual) 0.95 else NULL, @@ -78,6 +79,9 @@ Alternatively, a list with graphical parameters can be specified to modify the default values. } + \item{names}{ + a character vector of names for \code{x}. 
+ } \item{average}{ scalar-valued function to apply to the simulated counts at each time point. @@ -98,7 +102,8 @@ a bar for the initial number of cases is added to the plot. } \item{legend}{ - a logical or a list of parameters for \code{\link{legend}}. + a logical, a character vector (providing names for \code{x}), + or a list of parameters for \code{\link{legend}}. } \item{xlim,ylim}{ vectors of length 2 determining the axis limits. diff -Nru r-cran-surveillance-1.12.2/man/hhh4_validation.Rd r-cran-surveillance-1.13.0/man/hhh4_validation.Rd --- r-cran-surveillance-1.12.2/man/hhh4_validation.Rd 2016-03-30 14:11:07.000000000 +0000 +++ r-cran-surveillance-1.13.0/man/hhh4_validation.Rd 2016-12-13 13:22:41.000000000 +0000 @@ -59,8 +59,10 @@ subsequent predictions, whereas \code{"final"} will just use \code{result} to calculate these. The latter case thus gives nothing else than a subset of - \code{result$fitted.values}, if the \code{tp}'s are part of the - fitted subset \code{result$control$subset}.} + \code{result$fitted.values}. + %% if the \code{tp}'s are part of the + %% fitted subset \code{result$control$subset}. + } \item{which.start}{ Which initial parameter values should be used when successively refitting the model to subsets of the data (up to time point diff -Nru r-cran-surveillance-1.12.2/man/sts_animate.Rd r-cran-surveillance-1.13.0/man/sts_animate.Rd --- r-cran-surveillance-1.12.2/man/sts_animate.Rd 2016-07-27 01:39:40.000000000 +0000 +++ r-cran-surveillance-1.13.0/man/sts_animate.Rd 2016-12-01 14:11:02.000000000 +0000 @@ -1,7 +1,7 @@ \name{sts_animate} \alias{animate.sts} \title{ - Animated Maps and Time Series of Disease Incidence + Animated Maps and Time Series of Disease Counts or Incidence } \description{ @@ -10,8 +10,8 @@ implemented by the function \code{\link{stsplot_spacetime}}. Maps generated by \code{\link{stsplot_space}} are sequentially plotted along time (optionally showing cumulative - counts), with an optional time series chart below the map to track the - epidemic curve. + counts/incidence), with an optional time series chart below the map + to track the epidemic curve. It is worth using functionality of the \pkg{animation} package (e.g., \code{\link[animation]{saveHTML}}) to directly export the animation into a useful format. @@ -39,7 +39,11 @@ The default \code{tps=NULL} means the whole time period \code{1:nrow(object)}. } \item{cumulative}{ - logical specifying if the cumulative counts over time should be plotted. + logical specifying if the cumulative counts/incidence over time + should be plotted. The cumulative incidence is relative to the + population from the first time point \code{tps[1]} throughout the + whole animation, while \code{cumulative=FALSE} computes the + incidence from the current population numbers. } \item{population,at,\dots}{ arguments for \code{\link{stsplot_space}}. @@ -52,10 +56,12 @@ The argument \code{height} gives the relative height of the time series plot (default: 0.3), the logical value \code{fill} indicates whether to make the panel as big as possible (default: FALSE), - and the arguments \code{inactive} and + the arguments \code{inactive} and \code{active} are lists of graphical parameters (e.g., \code{col}) determining the appearance of the bars (e.g., default color is grey - when inactive and black when active). + when inactive and black when active), + and the boolean \code{as.Date} determines whether dates should be + put on the x-axis (instead of the \code{tps} indexes). 
} \item{sleep}{ time to wait (\code{Sys.sleep}) between subsequent snapshots (only if @@ -104,14 +110,21 @@ \examples{ data("measlesWeserEms") -## sequential plot of the counts by region in weeks 12-16 only (for speed) +## animate the weekly counts of measles (during weeks 12-16 only, for speed) if (require("animation")) { oldwd <- setwd(tempdir()) # to not clutter up the current working dir - saveHTML(animate(measlesWeserEms, tps=12:16, cumulative=FALSE), + saveHTML(animate(measlesWeserEms, tps=12:16), title="Evolution of the measles epidemic in the Weser-Ems region", ani.width=500, ani.height=600) setwd(oldwd) } + +## animate the weekly incidence of measles (per 100'000 inhabitants), +## and label the time series plot with dates in a specified format +animate(measlesWeserEms, tps=12:16, + population = measlesWeserEms@map$POPULATION / 100000, + timeplot = list(as.Date = TRUE, + scales = list(x = list(format = "\%G/\%V")))) } \keyword{hplot} diff -Nru r-cran-surveillance-1.12.2/man/stsplot_space.Rd r-cran-surveillance-1.13.0/man/stsplot_space.Rd --- r-cran-surveillance-1.12.2/man/stsplot_space.Rd 2014-12-09 14:46:27.000000000 +0000 +++ r-cran-surveillance-1.13.0/man/stsplot_space.Rd 2016-12-01 14:11:02.000000000 +0000 @@ -1,7 +1,7 @@ \name{stsplot_space} \alias{stsplot_space} \title{ - Map of Disease Incidence During a Given Period + Map of Disease Counts/Incidence accumulated over a Given Period } \description{ @@ -10,7 +10,7 @@ \code{plot(stsObj, type=observed~unit, ...)} calls the function documented below. It produces an \code{\link{spplot}} where regions are color-coded according to disease incidence -(either absolute counts or relative to population) during a given +(either absolute counts or relative to population) over a given time period. } @@ -46,9 +46,25 @@ empty and is not applicable if \code{x} is a matrix of counts). } \item{population}{ - an optional numeric vector of population numbers in the - \code{ncol(x)} regions. If given, incidence values instead of - absolute counts are plotted. + if \code{NULL} (default), the map shows the region-specific numbers + of cases accumulated over \code{tps}. For a disease incidence map, + \code{population} can be specified in three ways: + \itemize{ + \item a numeric vector of population numbers in the + \code{ncol(x)} regions, used to divide the disease counts. + \item a matrix of population counts of dimension \code{dim(x)} + (such as \code{population(x)} in an \code{"sts"} object). + This will produce the cumulative incidence over \code{tps} + relative to the population at the first time point, i.e., only + \code{population[tps[1],]} is used. + \item [if \code{is(x, "sts")}] + a scalar specifying how \code{population(x)} should be scaled for + use as the population matrix, i.e., + \code{population(x)/population} is used. For instance, if + \code{population(x)} contains raw population numbers, + \code{population=1000} would produce the incidence per 1000 + inhabitants. + } } \item{main}{ a main title for the plot. 
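The three accepted forms of \code{population} listed above can be summarized in a short sketch (an editor's illustration using a hypothetical \code{"sts"} object \code{stsObj} and numeric vector \code{popVector}; the scalar form assumes \code{population(stsObj)} holds raw population numbers):

    plot(stsObj, type = observed ~ unit, population = popVector)           # vector: counts divided by popVector
    plot(stsObj, type = observed ~ unit, population = population(stsObj))  # matrix: only population[tps[1], ] is used
    plot(stsObj, type = observed ~ unit, population = 1000)                # scalar: incidence per 1000 inhabitants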
If \code{NULL} and \code{x} is of @@ -61,8 +77,8 @@ } \item{at}{ either a number of levels (default: 10) for the categorization - (color-coding) of counts, or specific break points to use, or a - named list of a number of levels (\code{"n"}), a transformer + (color-coding) of counts/incidence, or specific break points to use, or, + a named list of a number of levels (\code{"n"}), a transformer (\code{"trafo"}) of class \code{"\link[scales]{trans}"} defined by package \pkg{scales}, and optional further arguments for \code{\link{pretty}}. The default is the square root transformation @@ -128,7 +144,7 @@ # compare with old implementation plot(measlesWeserEms, type=observed~1|unit) -# plot incidence with region labels +# plot cumulative incidence (per 100000 inhabitants), with region labels plot(measlesWeserEms, type=observed~unit, population=measlesWeserEms@map$POPULATION / 100000, labels=list(labels="GEN", cex=0.7, font=3)) diff -Nru r-cran-surveillance-1.12.2/MD5 r-cran-surveillance-1.13.0/MD5 --- r-cran-surveillance-1.12.2/MD5 2016-11-17 13:34:37.000000000 +0000 +++ r-cran-surveillance-1.13.0/MD5 2016-12-20 17:31:10.000000000 +0000 @@ -1,5 +1,5 @@ -9ce5e45a56fbaa40da60aae30b464069 *DESCRIPTION -f2187d3c8b1df35b9332d5824b5b488b *NAMESPACE +4b401a2a087f69f46bff5c14e857e8d1 *DESCRIPTION +91f7e41cc5c5664e8973c52cab98119b *NAMESPACE 0d71ebe8822996bb5c9c8521c2a12804 *R/AllClass.R ee2dc6e882782a1dc60a5c44fb31c5be *R/AllGeneric.R c18cc82eec3698f76db1ceb2dcf157bf *R/LRCUSUM.runlength.R @@ -24,7 +24,7 @@ 51d65e4566c1924aa1f309a9b911eb96 *R/calibration_null.R cb17c08401b9b954f0c0a2f0b51f613c *R/catCUSUM.R 09b2e1ae80408fc68c5f9d6b7e87de71 *R/checkDerivatives.R -c65e9c1ed5054d3ab72e8e96d4151eae *R/earsC.R +2b42bffe99b6544f9cb964972507b606 *R/earsC.R f09e759e6b523b146a8b43dd38aa2a6a *R/epidata.R 2a1d84a4aefde7ad692ac7515b19b1f7 *R/epidataCS.R 117f82f6e97605811922d7c8644b98c8 *R/epidataCS_aggregate.R @@ -39,16 +39,16 @@ 3ca2a3350552438778d972bdf9f4e336 *R/glm_epidataCS.R 5a9797f6921b53e03e4368b3da08c82a *R/gpc.poly-methods.R a5870a6bcd798a89c834058c40e34bb0 *R/graphs.R -a8c991684608634f36ee83b75ff2ed15 *R/hhh4.R +68e06449c428e9d37db2ca802372b6d4 *R/hhh4.R 3bb545ebfc4c2c2bb61e76e443c17214 *R/hhh4_W.R 2e29adf62fc0fb8d436917580b4a2a43 *R/hhh4_W_np.R 0bd5cf8af546c99f5f922c86aee073b2 *R/hhh4_W_powerlaw.R 53d08453cc08c52279a3d44e12c6f5bb *R/hhh4_calibration.R -f2540d3e0e136411b8d063deb3eb9177 *R/hhh4_methods.R -359036cfbbffc28cc9aa66943d177664 *R/hhh4_oneStepAhead.R +d5db2bc0ff166da68078b86c0164d781 *R/hhh4_methods.R +2c611a887502be0a7d961bc00acc2431 *R/hhh4_oneStepAhead.R 4accb71a56b228658f5b946f9f33f0b2 *R/hhh4_plot.R -4bb22d2df33f6b10cee9cdfb669fb0d6 *R/hhh4_simulate.R -a4741bd69b7b708cf4258ab49e58cd97 *R/hhh4_simulate_plot.R +6278c4cdabc32d203be9aa690f9b7f5e *R/hhh4_simulate.R +d232570c5b76c244ab146340f16de2fe *R/hhh4_simulate_plot.R 06b3c6c38ad2153300e43b14882dc00d *R/hhh4_simulate_scores.R 13a43831746902664c4e2c6507b7c671 *R/intersectPolyCircle.R a88f139b35ea30b5c3239aa02fbcef24 *R/isoWeekYear.R @@ -79,12 +79,12 @@ 51c8d382619c1f5398f98582f1a0cbaa *R/stsBP.R 92650e87fbe44030becb5b45777cd2dd *R/stsNC.R 84c91795860f65013a2e0dcb7348df80 *R/stsNClist_animate.R -fbf201b429ac5ac85ad76b5f5ee791e3 *R/sts_animate.R -6613f8f2827611775049bdcd715b033d *R/sts_coerce.R +5fd0c4f0f72c5dae7ae6063a0bbeb9ed *R/sts_animate.R +2a59fda60422f1505e44059fa2ecf6d0 *R/sts_coerce.R c777f4369f689861980f40d8554657fd *R/sts_creation.R 0e4fcbe1170e878d419515cd1b0f8ef2 *R/sts_observation.R e3fc5471a0507efecbd96b6abf8d69b3 
*R/sts_toLatex.R -d1276ac5f133c7c5b7d90df1ce5f18b6 *R/stsplot_space.R +7f710053aec12b86565240ff7a63372e *R/stsplot_space.R a6230bb9d9de5a65d2842103aa735893 *R/stsplot_spacetime.R 99a0ff793281139487d8df9bfb1bf829 *R/stsplot_time.R 837eefa65010f2dd43829ab5f773c867 *R/sysdata.rda @@ -115,7 +115,7 @@ e21d2f4625b12923af2f563af1cc824f *R/untie.R 339a9b5872a2d937bb4404e1c4b5472b *R/wrap_univariate.R fe53ed74b86cc4b99a587eabc4048bc6 *R/zzz.R -00f7d47fa0c4a2f5be38515a2d7a1495 *build/partial.rdb +1ab20b26ae800daa3170979532f61a52 *build/partial.rdb 368af961b750a908810233c9aa6a8cc2 *build/vignette.rds 39cd4adbe3c05e3bed5a29e962a30724 *data/MMRcoverageDE.RData bc57ed2de6c59d625e8ff1dc4bcc534d *data/abattoir.RData @@ -162,31 +162,31 @@ 7f2f2d9f03cffe1bbe7c004863e62583 *demo/00Index 150489e6ef221568c2edb8ebc24ea661 *demo/biosurvbook.R 9a0f5af9df8d008ee9d5a3e5d2da784f *demo/cost.R -fa59424a487546d722957a3360bdddf1 *demo/fluBYBW.R +99018ac2ab271c6bdaf6af57bcb03045 *demo/fluBYBW.R b2902a1e266db7088cec5a16d5f3d6ee *inst/CITATION -b961fa63e06233fa9f2d78a0c2a66ac3 *inst/NEWS.Rd +46c0198a300f95e077fe024200cb9794 *inst/NEWS.Rd 9eee30ebd03c7f6f85368b82345d7be3 *inst/THANKS c5ffc7c2c2d10b15a8ae830709f58bf3 *inst/doc/glrnb.R 2d08c0b2c87b001ea42eff9905796f98 *inst/doc/glrnb.Rnw -7daf6dd8238faa80c8c924bd20d324ce *inst/doc/glrnb.pdf +da5a058824f37481ee249f02f3dc4a79 *inst/doc/glrnb.pdf 1f78473db118c23e4b15665a2e017602 *inst/doc/hhh4.R 93eb5958fc0f87ed3cef8337d738e329 *inst/doc/hhh4.Rnw -1dd5c476dda04a9dcc3d98d81a3eafe0 *inst/doc/hhh4.pdf +d601fc04c613367c13662ac5113094bd *inst/doc/hhh4.pdf df32d43821169d5ec9ecf5213dd134b2 *inst/doc/hhh4_spacetime.R 3859b16febc32d8fea783130b46b612d *inst/doc/hhh4_spacetime.Rnw -b93b43f24495feb5ae965003bc76da3c *inst/doc/hhh4_spacetime.pdf -4c566d5bfec49a869fb4fb9150513610 *inst/doc/monitoringCounts.R -3067d17f05f80fddbce649ea0f088813 *inst/doc/monitoringCounts.Rnw -c9802ca6c6cfde7e06ac10bd45aeb3b0 *inst/doc/monitoringCounts.pdf +8ce0b7fa759815329cef2f861affc92d *inst/doc/hhh4_spacetime.pdf +fe06b714006d430d35b2172ad03b0d27 *inst/doc/monitoringCounts.R +733ec2308167dd712579a8cbd82a0ce4 *inst/doc/monitoringCounts.Rnw +cc8b3c03e6e41cc602758df2310d1087 *inst/doc/monitoringCounts.pdf e4723096379b7f3398e761e578ae0fe4 *inst/doc/surveillance.R df303888be863f9a8995fb32d79fff8e *inst/doc/surveillance.Rnw -f5169861e9a95e2a276b266792acbcfc *inst/doc/surveillance.pdf +6432f9fe01263d8a45c9fca1ce7a7abe *inst/doc/surveillance.pdf fe0cf08a224669c084910c5c19f6ff4a *inst/doc/twinSIR.R 0f0a09be6f26656078bc248e4461cf88 *inst/doc/twinSIR.Rnw -dada12e7b8d455e21a1ffa3e16b3a9f9 *inst/doc/twinSIR.pdf +522f0e1708c51eb9cd2e157ae0490e3e *inst/doc/twinSIR.pdf 325b2f355993f2d3cbc40e2f3ae3e1cb *inst/doc/twinstim.R 8cf84064516f7b0af4ef05ed21439b1f *inst/doc/twinstim.Rnw -99bad13e09fca06180bcd021cd778c38 *inst/doc/twinstim.pdf +3965000d0f75e94f7f0b19bcf842cbb5 *inst/doc/twinstim.pdf 01e880f0dcb85b78a1c2be29862d464f *inst/extdata/counts_flu_BYBW.txt 7475135b03ccdf173429ee21b85ca4f8 *inst/extdata/h1_nrwrp.txt 6b1da7976e5a3a83fa4aef34a85fbc3b *inst/extdata/k1.txt @@ -228,11 +228,11 @@ 43b5e5c1d6ec4e0cc7efa582d4a1f047 *man/algo.farrington.assign.weights.Rd 5bc4e2f69607461bce07378ae4cd1e71 *man/algo.farrington.fitGLM.Rd 5c26bb44dd9f7b4ba6ea36aa12c223d4 *man/algo.farrington.threshold.Rd -cff3f8f8e771d1b6d6a865ff9be96766 *man/algo.glrnb.Rd +09314090be9985980c742a5abad296cd *man/algo.glrnb.Rd 328f9c86d5ec8b9f03a044ebdde890d5 *man/algo.hhh.Rd b026ec58b86d0c9cf15262ba13f5149e *man/algo.hhh.grid.Rd 
1fbddc2cefa403c2a745bb25594a93a0 *man/algo.hmm.Rd -d99fa49529f7dd1c2d05eb8b229dcc09 *man/algo.outbreakP.Rd +c62fdc5bc3aefdde79704119341edfa2 *man/algo.outbreakP.Rd d4382bd4c2510a27374d4cf9eb9bd789 *man/algo.quality.Rd a8d32e124ec656138b4d95b18ba01864 *man/algo.rki.Rd 7b4d8231127a12bad082bb9e978ba51a *man/algo.rogerson.Rd @@ -248,7 +248,7 @@ 8d16a3531ecbb9f8e3abbb9a664c1236 *man/bodaDelay.Rd 55c1c52dbe2f9227b76df24d43f590b2 *man/calibration.Rd e8c2d163f664e9fb251e4c501cf9f4b9 *man/campyDE.Rd -9fc47ea97ded0101279543dbafdc8de2 *man/categoricalCUSUM.Rd +6293efa013d93ce611b93e5c575b2d03 *man/categoricalCUSUM.Rd df6455aa6be9dfc476040bf1b9fb999c *man/checkResidualProcess.Rd 27ef79270ab1c247cc00247a25d96b65 *man/coeflist.Rd 2aa23e12ccfff25d1f3fa7ea1a061503 *man/compMatrix.writeTable.Rd @@ -258,7 +258,7 @@ a09a1f8d94afb6f99365294cb3c59390 *man/deleval.Rd c629a963f9544c4a04d5317179f49c89 *man/disProg2sts.Rd 80329fe7efe094d8833545c66af03bae *man/discpoly.Rd -ff1f4ba162982f12db8d92af84e84289 *man/earsC.Rd +7e557bf7fb1f3dbe5a489e40d68a325c *man/earsC.Rd e27fcca6f14c93e3e26d98ffaa5363a1 *man/enlargeData.Rd 8029404827ac69012ed8fe88aea30683 *man/epidata.Rd 3de853e91076ed11610f8f39a45cba54 *man/epidataCS.Rd @@ -290,9 +290,9 @@ cbeaa3fefe6277c82fccce76dfda642f *man/hhh4_plot.Rd 4a4fa202f05de5855e78a9fa2269ade1 *man/hhh4_predict.Rd 62f5c7765bd90e5f1fa36de3f81bca82 *man/hhh4_simulate.Rd -7965b7cc2cde43404367962040693916 *man/hhh4_simulate_plot.Rd +8e250a701bcd87fa9fbc773e475189ed *man/hhh4_simulate_plot.Rd 8445683f04a3f6f9298f40fcfa826cad *man/hhh4_update.Rd -f637569220d0b1f8732bc9ffd7986d99 *man/hhh4_validation.Rd +24cc3a053fc04a795124e41ca4d1955b *man/hhh4_validation.Rd d0d32f0fb5ce379ec1f73f8863ba449a *man/husO104Hosp.Rd 9214d9944521f0535cb901bba88d544e *man/imdepi.Rd 7c6c7d7f79f9570fbc7acdf91af9462a *man/influMen.Rd @@ -359,11 +359,11 @@ 0c14b85d6e51bb29a01955783b8edf9a *man/stsNewport.Rd 12f381b8971e40942556064737297adb *man/stsSlots.Rd bd422b571337f471e6277b897c6251cb *man/stsXtrct.Rd -37be54e6d882420838c38439d25079dd *man/sts_animate.Rd +14d0070b6ce87409feed6d4aa4ef6e3e *man/sts_animate.Rd 157630a0b5e470b8b4f2050223909ef8 *man/sts_creation.Rd 1edb3a0998d110941339fda0c0e17c94 *man/sts_observation.Rd 3281c5636e2cbf792caeb084dd52a580 *man/stsplot.Rd -d16b7f11082f8f9c2264091daa05779a *man/stsplot_space.Rd +3210146e5e0abf7f228a4bc0023ed1d9 *man/stsplot_space.Rd 53ff4e9c5cb0828295d4691c9eac102f *man/stsplot_spacetime.Rd 78b2ef1aa112ed986abbdcd930fe3136 *man/stsplot_time.Rd 7a51052cfe961bb040b176c81339441b *man/sumNeighbours.Rd @@ -413,12 +413,13 @@ 60ef91ae1644f77a78421bc167836b0a *tests/testthat/test-bodaDelay.R 65fe94672dc5a141562bf5800b112adf *tests/testthat/test-calibration.R 220d17007747ef51c8e3d04f04bceee3 *tests/testthat/test-determineSources.R +192d042f92c8f67fa4a4ab3c1153044a *tests/testthat/test-earsc.R 50d327d505769853ce3efe712b7a4471 *tests/testthat/test-farringtonFlexible.R 9815b459b6e7a451cc98e4fedf2d7bbd *tests/testthat/test-formatDate.R 54f7e2d4384b24dafa6f3fb1b42cdf27 *tests/testthat/test-hhh4+algo.hhh.R b395815d92dd6ad1c112ef1cab023447 *tests/testthat/test-hhh4+derivatives.R e14fb292c7c8d198cdb4ba419ca1ec0b *tests/testthat/test-hhh4_ARasNE.R -ad1bddd36f44add44f9ccb0219ab45ce *tests/testthat/test-hhh4_NegBinGrouped.R +16afff9a2c16f9c280568fdc7ca7ea28 *tests/testthat/test-hhh4_NegBinGrouped.R 96ca6abb7df9823cf14ca582882c5db8 *tests/testthat/test-nbOrder.R 2859de753c985ea8a85bfc1ee268c804 *tests/testthat/test-plapply.R ce563d4a669128bff28f05673f6fd96b *tests/testthat/test-siafs.R @@ -436,11 
+437,11 @@ e96ace90d8c7da6ca841b64e38c6fb05 *vignettes/monitoringCounts-cache/pMC.RData 4a4e3a07341972e333a16615c1687f1e *vignettes/monitoringCounts-cache/pMarkovChain.RData b17c5546c383d603c5a9824db248b1d5 *vignettes/monitoringCounts-cache/rlsims-multinom.RData -3067d17f05f80fddbce649ea0f088813 *vignettes/monitoringCounts.Rnw +733ec2308167dd712579a8cbd82a0ce4 *vignettes/monitoringCounts.Rnw aafa3c727f8d58b7cc14d84c808f3330 *vignettes/monitoringCounts.bib e9c380be86b55b9fb5ffabd1c3548a9d *vignettes/references.bib 3e456dc01a1189e7f6f2005b0cc4bd54 *vignettes/surveillance-cache.RData -41a8e134f266d6b3111fa1ccbd2d68af *vignettes/surveillance-hmm.pdf +4e5dcc73c40f6360fed4fbac7ae98d6f *vignettes/surveillance-hmm.pdf df303888be863f9a8995fb32d79fff8e *vignettes/surveillance.Rnw 01099bf54aeaf8a031440283443e994a *vignettes/twinSIR-cache.RData 0f0a09be6f26656078bc248e4461cf88 *vignettes/twinSIR.Rnw diff -Nru r-cran-surveillance-1.12.2/NAMESPACE r-cran-surveillance-1.13.0/NAMESPACE --- r-cran-surveillance-1.12.2/NAMESPACE 2016-07-18 22:04:40.000000000 +0000 +++ r-cran-surveillance-1.13.0/NAMESPACE 2016-11-29 12:09:45.000000000 +0000 @@ -1,490 +1,490 @@ -### Load C code - -useDynLib(surveillance) - -importFrom(Rcpp, evalCpp) # see vignette("Rcpp-package", package="Rcpp") -## although Rcpp is only used on C-level we need to "ensure that Rcpp is loaded -## so any dynamic linking to its code can be resolved. (There may be none, but -## there could be, now or in future.)" (B. Ripley, 2013-09-08) - -############### -### IMPORTS ### -############### - -### Import all packages listed as Depends -### (for utils and polyCub: only selected methods are imported) - -import(methods, grDevices, graphics, stats) - -## sp classes & utilities (bbox, coordinates, dimensions, overlay, plot, ...) -## (we "Depend" on package sp since it defines essential data classes & methods) -import(sp) - -## we define own methods for generating xtable()'s, which we want to be useable -import(xtable) - - -### required generics for own methods (that's why we "Depend" on these packages) - -## importFrom(stats, coef, vcov, logLik, nobs, residuals, confint, AIC, extractAIC, -## profile, simulate, update, terms, add1, drop1, predict, as.stepfun) -importFrom(utils, head, tail, toLatex) - - -### required functions from utils and stats - -## importFrom(stats, pnorm, cov2cor, ks.test, formula, rnorm, runif, step, dist, -## update.formula, terms.formula, rpois, rnbinom, setNames, -## na.omit, as.formula, pnbinom, qnbinom, qnorm, sd, glm, optim, -## poisson, ppois, qpois, predict.glm, summary.glm, quasipoisson, -## glm.fit) ## and many more... 
-importFrom(utils, packageVersion, modifyList, capture.output, read.table, data, - setTxtProgressBar, txtProgressBar, sessionInfo, head.matrix, - str, flush.console, write.table, as.roman, tail.matrix, - methods) - - -### sampling from mv.Gausian for OSAIC weights (twinSIR) and iafplot (twinstim) - -importFrom(MASS, mvrnorm) - - -### disProg-specific - -importFrom(MASS, glm.nb) # for algo.glrnb -##importFrom(msm, msm, hmmPois, viterbi.msm) # for algo.hmm() -##importFrom(spc, xcusum.arl, xcusum.crit) # for find.kh() -## (packages msm and spc are now "suggested", not imported) - - -### hhh4-specific - -importFrom(MASS, ginv, negative.binomial) -importFrom(Matrix, Matrix) -importClassesFrom(Matrix, ddiMatrix) -importMethodsFrom(Matrix, coerce, forceSymmetric) -## sparse matrix methods provide a significant speed-up in marFisher -importFrom(nlme, fixef, ranef) -export(fixef, ranef) # we define corresponding methods for "hhh4" models - - -### twinSIR-specific - -# for use in computing OSAIC weights by simulation -#importFrom(quadprog, solve.QP) # moved to "Suggests" - - -### twinstim-specific - -importFrom(spatstat, area.owin, as.im.function, coords.ppp, diameter, - diameter.owin, disc, edges, inside.owin, intersect.owin, - is.polygonal, as.polygonal, nncross.ppp, ppp, runifpoint, - shift.owin, spatstat.options, vertices) -importFrom(spatstat, marks) -export(marks) # we define an epidataCS-method -importFrom(spatstat, multiplicity) -export(multiplicity) # we define a Spatial-method - -## Note: we depend on instead of import package polyCub to avoid "::" references -## in some of the siaf.* generators which create functions in .GlobalEnv -importFrom(polyCub, polyCub, .polyCub.iso, polyCub.SV, polyCub.midpoint, xylist) -importMethodsFrom(polyCub, coerce) - -importFrom(MASS, kde2d, truehist) - - - -############### -### EXPORTS ### -############### - - -### general exports - -export(surveillance.options, reset.surveillance.options) -export(animate) # new S3-generic -export(R0) # new S3-generic -export(intensityplot) # new S3-generic -export(formatPval) # yapf -- yet another p-value formatter -export(anscombe.residuals) -export(magic.dim, primeFactors, bestCombination) # similar to n2mfrow -export(isoWeekYear) #extract ISO 8601 date -export(formatDate) #ISO 8601 compatible %G and %V format( ) function. 
-export(refvalIdxByDate) -export(ks.plot.unif) -export(checkResidualProcess) # for twinstim and twinSIR -export(qlomax) # quantile function of the Lomax distribution -export(plapply) - -# spatial utilities -export(discpoly) -#export(runifdisc) # CAVE: spatstat has similar function of same name -export(unionSpatialPolygons) -export(inside.gpc.poly) -S3method(scale, gpc.poly) # redefined method for gpc.poly in spatial_stuff.R -S3method(diameter, gpc.poly) -export(nbOrder) -export(poly2adjmat) -export(polyAtBorder) -export(layout.labels) -export(layout.scalebar) - -# randomly break tied event times or coordinates -export(untie) # new S3-generic -#export(untie.default, untie.matrix, untie.epidataCS) -S3method(untie, default) -S3method(untie, matrix) -S3method(untie, epidataCS) - -# intersection of a polygonal and a circular domain -export(intersectPolyCircle) -S3method(intersectPolyCircle, owin) -S3method(intersectPolyCircle, SpatialPolygons) -S3method(intersectPolyCircle, gpc.poly) - -# little helper: multiplicity of points -S3method(multiplicity, Spatial) - -# list coefficients by model component -export(coeflist) -S3method(coeflist, default) -S3method(coeflist, twinstim) -S3method(coeflist, simEpidataCS) -S3method(coeflist, hhh4) - -# Spatio-temporal cluster detection -export(stcd) - -# tests for space-time interaction -export(knox) -S3method(print, knox) -S3method(plot, knox) -S3method(xtable, knox) -S3method(toLatex, knox) -export(stKtest) -S3method(plot, stKtest) - -# PIT histograms -export(pit) -export(pit.default) -S3method(pit, default) -S3method(pit, oneStepAhead) -S3method(pit, hhh4) -S3method(plot, pit) - -# calibration test for Poisson or NegBin predictions -export(calibrationTest) -S3method(calibrationTest, default) -export(calibrationTest.default) -export(dss, logs, rps) # ses, nses - - -### sts(BP|NC)-specific - -export(sts) -exportClasses(sts, stsBP) -export(linelist2sts) -export(animate_nowcasts) - -# conversion of "sts" objects -S3method(as.ts, sts) -export(as.xts.sts) # no registered S3-method since we only suggest "xts" - -# generics for sts class defined in sts.R -exportMethods("[", plot) -exportMethods(toLatex) -exportMethods(dim, dimnames, epochInYear, year) -exportMethods(aggregate) -exportMethods(as.data.frame) - -# methods for accessing/replacing slots of an sts object (cf. AllGeneric.R) -exportMethods(epoch,observed,alarms,upperbound,population,control,multinomialTS,neighbourhood) -exportMethods("epoch<-","observed<-","alarms<-","upperbound<-","population<-","control<-","multinomialTS<-","neighbourhood<-") -# methods for accessing/replacing slots of an stsNC object -exportMethods(reportingTriangle,delayCDF,score,predint) - -# plot variants -export(stsplot_space) -export(stsplot_time, stsplot_time1, stsplot_alarm) -export(addFormattedXAxis, atChange, at2ndChange, atMedian) #for time axis formatting -export(stsplot_spacetime) # old implementation of (animated) map -S3method(animate, sts) # S3-method for an S4 class, see ?Methods - -# outbreak detection algorithms (sts-interfaces) -export(wrap.algo, farrington, bayes, rki, cusum, glrpois, glrnb, outbreakP, boda) # FIXME: rogerson, hmm ?? 
-export(earsC) -export(farringtonFlexible) -export(categoricalCUSUM, pairedbinCUSUM, pairedbinCUSUM.runlength) -export(nowcast, backprojNP) -export(bodaDelay) - - -# sts creation functions -export(sts_creation) -export(sts_observation) - -### disProg-specific - -export(create.disProg, readData, toFileDisProg) -S3method(print, disProg) -S3method(plot, disProg) -S3method(plot, disProg.one) -S3method(aggregate, disProg) - -export(sim.pointSource, sim.seasonalNoise) -export(LRCUSUM.runlength, arlCusum, find.kh, findH, hValues, findK) -export(compMatrix.writeTable, correct53to52, enlargeData) -export(makePlot) -export(estimateGLRNbHook) -export(algo.compare, algo.quality, algo.summary) - -## outbreak detection algorithms (old disProg implementations) -export(algo.bayes, algo.bayes1, algo.bayes2, algo.bayes3, - algo.bayesLatestTimepoint, - algo.call, - algo.cdc, algo.cdcLatestTimepoint, - algo.cusum, - algo.farrington, - algo.glrnb, algo.glrpois, - algo.hhh, algo.hhh.grid, - algo.hmm, - algo.outbreakP, - algo.rki, algo.rki1, algo.rki2, algo.rki3, algo.rkiLatestTimepoint, - algo.rogerson, - algo.twins) - -## auxiliary functions for algo.farrington (FIXME: why export these internals?) -export(algo.farrington.assign.weights, - algo.farrington.fitGLM, algo.farrington.fitGLM.fast, - algo.farrington.fitGLM.populationOffset, algo.farrington.threshold) - - -S3method(plot, atwins) -S3method(plot, survRes) -S3method(plot, survRes.one) -S3method(print, algoQV) -S3method(xtable, algoQV) - -export(test, testSim) # FIXME: remove these test functions? -> Demo? - - -### conversion between old disProg and new sts classes - -export(disProg2sts) -export(sts2disProg) - - -### twinSIR-specific - -export(cox) -export(as.epidata) -S3method(as.epidata, data.frame) -export(as.epidata.data.frame) -S3method(as.epidata, default) -export(as.epidata.default) -export(intersperse) -export(twinSIR) -export(stateplot) -export(simEpidata) - -S3method(update, epidata) -S3method("[", epidata) -S3method(print, epidata) -S3method(summary, epidata) -S3method(print, summary.epidata) -S3method(plot, epidata) -S3method(animate, epidata) -S3method(plot, summary.epidata) -S3method(animate, summary.epidata) - -S3method(print, twinSIR) -S3method(summary, twinSIR) -S3method(print, summary.twinSIR) -S3method(plot, twinSIR) -S3method(intensityplot, twinSIR) -export(intensityplot.twinSIR) # for convenience -S3method(profile, twinSIR) -S3method(plot, profile.twinSIR) -S3method(vcov, twinSIR) -S3method(logLik, twinSIR) -S3method(AIC, twinSIR) -S3method(extractAIC, twinSIR) -S3method(simulate, twinSIR) -export(simulate.twinSIR) # for convenience -S3method(residuals, twinSIR) - -S3method(intensityplot, simEpidata) -export(intensityplot.simEpidata) # for convenience - - -### twinstim-specific - -export(as.epidataCS) -export(glm_epidataCS) -export(twinstim) -export(simEpidataCS) -export(siaf, - siaf.constant, siaf.step, - siaf.gaussian, - siaf.powerlaw, siaf.powerlawL, siaf.student) -export(tiaf, - tiaf.constant, tiaf.step, - tiaf.exponential) -export(epidataCS2sts) -export(epitest) -S3method(coef, epitest) -S3method(plot, epitest) -export(getSourceDists) - -S3method(nobs, epidataCS) -S3method("[", epidataCS) -S3method(update, epidataCS) -export(update.epidataCS) # for convenience -export(permute.epidataCS) -S3method(head, epidataCS) -S3method(tail, epidataCS) -S3method(print, epidataCS) -S3method(subset, epidataCS) -S3method(summary, epidataCS) -S3method(print, summary.epidataCS) -S3method(as.stepfun, epidataCS) -S3method(animate, epidataCS) 
-export(animate.epidataCS) # for convenience -S3method(marks, epidataCS) -export(marks.epidataCS) # for convenience since its a foreign generic -S3method(plot, epidataCS) -export(epidataCSplot_time, epidataCSplot_space) -S3method(as.epidata, epidataCS) -export(as.epidata.epidataCS) # for convenience - -S3method(print, twinstim) -S3method(summary, twinstim) -export(summary.twinstim) # for convenience -S3method(print, summary.twinstim) -S3method(toLatex, summary.twinstim) -S3method(xtable, summary.twinstim) -export(xtable.summary.twinstim) # for xtable.twinstim -S3method(xtable, twinstim) -S3method(plot, twinstim) -export(iafplot) -export(intensity.twinstim) -S3method(intensityplot, twinstim) -export(intensityplot.twinstim) # for convenience -S3method(profile, twinstim) -S3method(coef, summary.twinstim) -S3method(vcov, twinstim) -S3method(vcov, summary.twinstim) -S3method(logLik, twinstim) -S3method(extractAIC, twinstim) -S3method(nobs, twinstim) -S3method(simulate, twinstim) -export(simulate.twinstim) # for convenience -S3method(R0, twinstim) -export(simpleR0) -S3method(residuals, twinstim) -S3method(update, twinstim) -export(update.twinstim) # for convenience -S3method(terms, twinstim) -S3method(all.equal, twinstim) - -export(stepComponent) -S3method(terms, twinstim_stependemic) -S3method(terms, twinstim_stepepidemic) -S3method(update, twinstim_stependemic) -S3method(update, twinstim_stepepidemic) -S3method(add1, twinstim) -S3method(add1, twinstim_stependemic) -S3method(add1, twinstim_stepepidemic) -S3method(drop1, twinstim) -S3method(drop1, twinstim_stependemic) -S3method(drop1, twinstim_stepepidemic) - -S3method(residuals, simEpidataCS) -S3method(R0, simEpidataCS) -S3method(intensityplot, simEpidataCS) -export(intensityplot.simEpidataCS) # for convenience -S3method(print, simEpidataCSlist) -S3method("[[", simEpidataCSlist) -S3method(plot, simEpidataCSlist) - - -### algo.hhh-specific - -export(algo.hhh) -export(algo.hhh.grid) -export(create.grid) - -S3method(print, ah) -S3method(coef, ah) -S3method(predict, ah) -S3method(residuals, ah) -S3method(logLik, ah) - -S3method(print, ahg) -S3method(coef, ahg) -S3method(predict, ahg) -S3method(residuals, ahg) -S3method(logLik, ahg) - -export(simHHH, simHHH.default) -S3method(simHHH, default) -S3method(simHHH, ah) - - -### hhh4-specific - -## main functions -export(hhh4) -export(addSeason2formula) -export(zetaweights, W_powerlaw) -export(W_np) -export(oneStepAhead) -export(scores) -export(permutationTest) - -## S3-methods -S3method(print, hhh4) -S3method(summary, hhh4) -S3method(print, summary.hhh4) -S3method(nobs, hhh4) -S3method(logLik, hhh4) -S3method(formula, hhh4) -S3method(terms, hhh4) -S3method(coef, hhh4) -S3method(vcov, hhh4) -S3method(fixef, hhh4) -S3method(ranef, hhh4) -S3method(confint, hhh4) -S3method(residuals, hhh4) -S3method(predict, hhh4) -S3method(update, hhh4) -S3method(all.equal, hhh4) -S3method(simulate, hhh4) -S3method(plot, hhh4) -export(plotHHH4_fitted, plotHHH4_fitted1, - plotHHH4_season, getMaxEV_season, - plotHHH4_maxEV, getMaxEV, - plotHHH4_maps, plotHHH4_ri, - plotHHH4_neweights) - -#S3method(scores, default) # rather an internal auxiliary function, undocumented -S3method(scores, hhh4) -S3method(scores, oneStepAhead) -S3method(calibrationTest, hhh4) -S3method(calibrationTest, oneStepAhead) - -## methods for simulations from hhh4 fits -S3method(aggregate, hhh4sims) -S3method(plot, hhh4sims) -export(as.hhh4simslist) -S3method(as.hhh4simslist, hhh4sims) -S3method(as.hhh4simslist, list) -S3method(as.hhh4simslist, 
hhh4simslist) -S3method("[", hhh4simslist) -S3method("[[", hhh4simslist) -S3method(aggregate, hhh4simslist) -S3method(plot, hhh4simslist) -export(plotHHH4sims_size) -export(plotHHH4sims_time) -S3method(scores, hhh4sims) -S3method(scores, hhh4simslist) +### Load C code + +useDynLib(surveillance) + +importFrom(Rcpp, evalCpp) # see vignette("Rcpp-package", package="Rcpp") +## although Rcpp is only used on C-level we need to "ensure that Rcpp is loaded +## so any dynamic linking to its code can be resolved. (There may be none, but +## there could be, now or in future.)" (B. Ripley, 2013-09-08) + +############### +### IMPORTS ### +############### + +### Import all packages listed as Depends +### (for utils and polyCub: only selected methods are imported) + +import(methods, grDevices, graphics, stats) + +## sp classes & utilities (bbox, coordinates, dimensions, overlay, plot, ...) +## (we "Depend" on package sp since it defines essential data classes & methods) +import(sp) + +## we define own methods for generating xtable()'s, which we want to be useable +import(xtable) + + +### required generics for own methods (that's why we "Depend" on these packages) + +## importFrom(stats, coef, vcov, logLik, nobs, residuals, confint, AIC, extractAIC, +## profile, simulate, update, terms, add1, drop1, predict, as.stepfun) +importFrom(utils, head, tail, toLatex) + + +### required functions from utils and stats + +## importFrom(stats, pnorm, cov2cor, ks.test, formula, rnorm, runif, step, dist, +## update.formula, terms.formula, rpois, rnbinom, setNames, +## na.omit, as.formula, pnbinom, qnbinom, qnorm, sd, glm, optim, +## poisson, ppois, qpois, predict.glm, summary.glm, quasipoisson, +## glm.fit) ## and many more... +importFrom(utils, packageVersion, modifyList, capture.output, read.table, data, + setTxtProgressBar, txtProgressBar, sessionInfo, head.matrix, + str, flush.console, write.table, as.roman, tail.matrix, + methods) + + +### sampling from mv.Gausian for OSAIC weights (twinSIR) and iafplot (twinstim) + +importFrom(MASS, mvrnorm) + + +### disProg-specific + +importFrom(MASS, glm.nb) # for algo.glrnb +##importFrom(msm, msm, hmmPois, viterbi.msm) # for algo.hmm() +##importFrom(spc, xcusum.arl, xcusum.crit) # for find.kh() +## (packages msm and spc are now "suggested", not imported) + + +### hhh4-specific + +importFrom(MASS, ginv, negative.binomial) +importFrom(Matrix, Matrix) +importClassesFrom(Matrix, ddiMatrix) +importMethodsFrom(Matrix, coerce, forceSymmetric) +## sparse matrix methods provide a significant speed-up in marFisher +importFrom(nlme, fixef, ranef) +export(fixef, ranef) # we define corresponding methods for "hhh4" models + + +### twinSIR-specific + +# for use in computing OSAIC weights by simulation +#importFrom(quadprog, solve.QP) # moved to "Suggests" + + +### twinstim-specific + +importFrom(spatstat, area.owin, as.im.function, coords.ppp, diameter, + diameter.owin, disc, edges, inside.owin, intersect.owin, + is.polygonal, as.polygonal, nncross.ppp, ppp, runifpoint, + shift.owin, spatstat.options, vertices) +importFrom(spatstat, marks) +export(marks) # we define an epidataCS-method +importFrom(spatstat, multiplicity) +export(multiplicity) # we define a Spatial-method + +## Note: we depend on instead of import package polyCub to avoid "::" references +## in some of the siaf.* generators which create functions in .GlobalEnv +importFrom(polyCub, polyCub, .polyCub.iso, polyCub.SV, polyCub.midpoint, xylist) +importMethodsFrom(polyCub, coerce) + +importFrom(MASS, kde2d, truehist) + + + 
+############### +### EXPORTS ### +############### + + +### general exports + +export(surveillance.options, reset.surveillance.options) +export(animate) # new S3-generic +export(R0) # new S3-generic +export(intensityplot) # new S3-generic +export(formatPval) # yapf -- yet another p-value formatter +export(anscombe.residuals) +export(magic.dim, primeFactors, bestCombination) # similar to n2mfrow +export(isoWeekYear) #extract ISO 8601 date +export(formatDate) #ISO 8601 compatible %G and %V format( ) function. +export(refvalIdxByDate) +export(ks.plot.unif) +export(checkResidualProcess) # for twinstim and twinSIR +export(qlomax) # quantile function of the Lomax distribution +export(plapply) + +# spatial utilities +export(discpoly) +#export(runifdisc) # CAVE: spatstat has similar function of same name +export(unionSpatialPolygons) +export(inside.gpc.poly) +S3method(scale, gpc.poly) # redefined method for gpc.poly in spatial_stuff.R +S3method(diameter, gpc.poly) +export(nbOrder) +export(poly2adjmat) +export(polyAtBorder) +export(layout.labels) +export(layout.scalebar) + +# randomly break tied event times or coordinates +export(untie) # new S3-generic +#export(untie.default, untie.matrix, untie.epidataCS) +S3method(untie, default) +S3method(untie, matrix) +S3method(untie, epidataCS) + +# intersection of a polygonal and a circular domain +export(intersectPolyCircle) +S3method(intersectPolyCircle, owin) +S3method(intersectPolyCircle, SpatialPolygons) +S3method(intersectPolyCircle, gpc.poly) + +# little helper: multiplicity of points +S3method(multiplicity, Spatial) + +# list coefficients by model component +export(coeflist) +S3method(coeflist, default) +S3method(coeflist, twinstim) +S3method(coeflist, simEpidataCS) +S3method(coeflist, hhh4) + +# Spatio-temporal cluster detection +export(stcd) + +# tests for space-time interaction +export(knox) +S3method(print, knox) +S3method(plot, knox) +S3method(xtable, knox) +S3method(toLatex, knox) +export(stKtest) +S3method(plot, stKtest) + +# PIT histograms +export(pit) +export(pit.default) +S3method(pit, default) +S3method(pit, oneStepAhead) +S3method(pit, hhh4) +S3method(plot, pit) + +# calibration test for Poisson or NegBin predictions +export(calibrationTest) +S3method(calibrationTest, default) +export(calibrationTest.default) +export(dss, logs, rps) # ses, nses + + +### sts(BP|NC)-specific + +export(sts) +exportClasses(sts, stsBP) +export(linelist2sts) +export(animate_nowcasts) + +# conversion of "sts" objects +S3method(as.ts, sts) +export(as.xts.sts) # no registered S3-method since we only suggest "xts" + +# generics for sts class defined in sts.R +exportMethods("[", plot) +exportMethods(toLatex) +exportMethods(dim, dimnames, epochInYear, year) +exportMethods(aggregate) +exportMethods(as.data.frame) + +# methods for accessing/replacing slots of an sts object (cf. 
AllGeneric.R) +exportMethods(epoch,observed,alarms,upperbound,population,control,multinomialTS,neighbourhood) +exportMethods("epoch<-","observed<-","alarms<-","upperbound<-","population<-","control<-","multinomialTS<-","neighbourhood<-") +# methods for accessing/replacing slots of an stsNC object +exportMethods(reportingTriangle,delayCDF,score,predint) + +# plot variants +export(stsplot_space) +export(stsplot_time, stsplot_time1, stsplot_alarm) +export(addFormattedXAxis, atChange, at2ndChange, atMedian) #for time axis formatting +export(stsplot_spacetime) # old implementation of (animated) map +S3method(animate, sts) # S3-method for an S4 class, see ?Methods + +# outbreak detection algorithms (sts-interfaces) +export(wrap.algo, farrington, bayes, rki, cusum, glrpois, glrnb, outbreakP, boda) # FIXME: rogerson, hmm ?? +export(earsC) +export(farringtonFlexible) +export(categoricalCUSUM, pairedbinCUSUM, pairedbinCUSUM.runlength) +export(nowcast, backprojNP) +export(bodaDelay) + + +# sts creation functions +export(sts_creation) +export(sts_observation) + +### disProg-specific + +export(create.disProg, readData, toFileDisProg) +S3method(print, disProg) +S3method(plot, disProg) +S3method(plot, disProg.one) +S3method(aggregate, disProg) + +export(sim.pointSource, sim.seasonalNoise) +export(LRCUSUM.runlength, arlCusum, find.kh, findH, hValues, findK) +export(compMatrix.writeTable, correct53to52, enlargeData) +export(makePlot) +export(estimateGLRNbHook) +export(algo.compare, algo.quality, algo.summary) + +## outbreak detection algorithms (old disProg implementations) +export(algo.bayes, algo.bayes1, algo.bayes2, algo.bayes3, + algo.bayesLatestTimepoint, + algo.call, + algo.cdc, algo.cdcLatestTimepoint, + algo.cusum, + algo.farrington, + algo.glrnb, algo.glrpois, + algo.hhh, algo.hhh.grid, + algo.hmm, + algo.outbreakP, + algo.rki, algo.rki1, algo.rki2, algo.rki3, algo.rkiLatestTimepoint, + algo.rogerson, + algo.twins) + +## auxiliary functions for algo.farrington (FIXME: why export these internals?) +export(algo.farrington.assign.weights, + algo.farrington.fitGLM, algo.farrington.fitGLM.fast, + algo.farrington.fitGLM.populationOffset, algo.farrington.threshold) + + +S3method(plot, atwins) +S3method(plot, survRes) +S3method(plot, survRes.one) +S3method(print, algoQV) +S3method(xtable, algoQV) + +export(test, testSim) # FIXME: remove these test functions? -> Demo? 
+ + +### conversion between old disProg and new sts classes + +export(disProg2sts) +export(sts2disProg) + + +### twinSIR-specific + +export(cox) +export(as.epidata) +S3method(as.epidata, data.frame) +export(as.epidata.data.frame) +S3method(as.epidata, default) +export(as.epidata.default) +export(intersperse) +export(twinSIR) +export(stateplot) +export(simEpidata) + +S3method(update, epidata) +S3method("[", epidata) +S3method(print, epidata) +S3method(summary, epidata) +S3method(print, summary.epidata) +S3method(plot, epidata) +S3method(animate, epidata) +S3method(plot, summary.epidata) +S3method(animate, summary.epidata) + +S3method(print, twinSIR) +S3method(summary, twinSIR) +S3method(print, summary.twinSIR) +S3method(plot, twinSIR) +S3method(intensityplot, twinSIR) +export(intensityplot.twinSIR) # for convenience +S3method(profile, twinSIR) +S3method(plot, profile.twinSIR) +S3method(vcov, twinSIR) +S3method(logLik, twinSIR) +S3method(AIC, twinSIR) +S3method(extractAIC, twinSIR) +S3method(simulate, twinSIR) +export(simulate.twinSIR) # for convenience +S3method(residuals, twinSIR) + +S3method(intensityplot, simEpidata) +export(intensityplot.simEpidata) # for convenience + + +### twinstim-specific + +export(as.epidataCS) +export(glm_epidataCS) +export(twinstim) +export(simEpidataCS) +export(siaf, + siaf.constant, siaf.step, + siaf.gaussian, + siaf.powerlaw, siaf.powerlawL, siaf.student) +export(tiaf, + tiaf.constant, tiaf.step, + tiaf.exponential) +export(epidataCS2sts) +export(epitest) +S3method(coef, epitest) +S3method(plot, epitest) +export(getSourceDists) + +S3method(nobs, epidataCS) +S3method("[", epidataCS) +S3method(update, epidataCS) +export(update.epidataCS) # for convenience +export(permute.epidataCS) +S3method(head, epidataCS) +S3method(tail, epidataCS) +S3method(print, epidataCS) +S3method(subset, epidataCS) +S3method(summary, epidataCS) +S3method(print, summary.epidataCS) +S3method(as.stepfun, epidataCS) +S3method(animate, epidataCS) +export(animate.epidataCS) # for convenience +S3method(marks, epidataCS) +export(marks.epidataCS) # for convenience since its a foreign generic +S3method(plot, epidataCS) +export(epidataCSplot_time, epidataCSplot_space) +S3method(as.epidata, epidataCS) +export(as.epidata.epidataCS) # for convenience + +S3method(print, twinstim) +S3method(summary, twinstim) +export(summary.twinstim) # for convenience +S3method(print, summary.twinstim) +S3method(toLatex, summary.twinstim) +S3method(xtable, summary.twinstim) +export(xtable.summary.twinstim) # for xtable.twinstim +S3method(xtable, twinstim) +S3method(plot, twinstim) +export(iafplot) +export(intensity.twinstim) +S3method(intensityplot, twinstim) +export(intensityplot.twinstim) # for convenience +S3method(profile, twinstim) +S3method(coef, summary.twinstim) +S3method(vcov, twinstim) +S3method(vcov, summary.twinstim) +S3method(logLik, twinstim) +S3method(extractAIC, twinstim) +S3method(nobs, twinstim) +S3method(simulate, twinstim) +export(simulate.twinstim) # for convenience +S3method(R0, twinstim) +export(simpleR0) +S3method(residuals, twinstim) +S3method(update, twinstim) +export(update.twinstim) # for convenience +S3method(terms, twinstim) +S3method(all.equal, twinstim) + +export(stepComponent) +S3method(terms, twinstim_stependemic) +S3method(terms, twinstim_stepepidemic) +S3method(update, twinstim_stependemic) +S3method(update, twinstim_stepepidemic) +S3method(add1, twinstim) +S3method(add1, twinstim_stependemic) +S3method(add1, twinstim_stepepidemic) +S3method(drop1, twinstim) +S3method(drop1, 
twinstim_stependemic) +S3method(drop1, twinstim_stepepidemic) + +S3method(residuals, simEpidataCS) +S3method(R0, simEpidataCS) +S3method(intensityplot, simEpidataCS) +export(intensityplot.simEpidataCS) # for convenience +S3method(print, simEpidataCSlist) +S3method("[[", simEpidataCSlist) +S3method(plot, simEpidataCSlist) + + +### algo.hhh-specific + +export(algo.hhh) +export(algo.hhh.grid) +export(create.grid) + +S3method(print, ah) +S3method(coef, ah) +S3method(predict, ah) +S3method(residuals, ah) +S3method(logLik, ah) + +S3method(print, ahg) +S3method(coef, ahg) +S3method(predict, ahg) +S3method(residuals, ahg) +S3method(logLik, ahg) + +export(simHHH, simHHH.default) +S3method(simHHH, default) +S3method(simHHH, ah) + + +### hhh4-specific + +## main functions +export(hhh4) +export(addSeason2formula) +export(zetaweights, W_powerlaw) +export(W_np) +export(oneStepAhead) +export(scores) +export(permutationTest) + +## S3-methods +S3method(print, hhh4) +S3method(summary, hhh4) +S3method(print, summary.hhh4) +S3method(nobs, hhh4) +S3method(logLik, hhh4) +S3method(formula, hhh4) +S3method(terms, hhh4) +S3method(coef, hhh4) +S3method(vcov, hhh4) +S3method(fixef, hhh4) +S3method(ranef, hhh4) +S3method(confint, hhh4) +S3method(residuals, hhh4) +S3method(predict, hhh4) +S3method(update, hhh4) +S3method(all.equal, hhh4) +S3method(simulate, hhh4) +S3method(plot, hhh4) +export(plotHHH4_fitted, plotHHH4_fitted1, + plotHHH4_season, getMaxEV_season, + plotHHH4_maxEV, getMaxEV, + plotHHH4_maps, plotHHH4_ri, + plotHHH4_neweights) + +#S3method(scores, default) # rather an internal auxiliary function, undocumented +S3method(scores, hhh4) +S3method(scores, oneStepAhead) +S3method(calibrationTest, hhh4) +S3method(calibrationTest, oneStepAhead) + +## methods for simulations from hhh4 fits +S3method(aggregate, hhh4sims) +S3method(plot, hhh4sims) +export(as.hhh4simslist) +S3method(as.hhh4simslist, hhh4sims) +S3method(as.hhh4simslist, list) +S3method(as.hhh4simslist, hhh4simslist) +S3method("[", hhh4simslist) +S3method("[[", hhh4simslist) +S3method(aggregate, hhh4simslist) +S3method(plot, hhh4simslist) +export(plotHHH4sims_size) +export(plotHHH4sims_time) +S3method(scores, hhh4sims) +S3method(scores, hhh4simslist) diff -Nru r-cran-surveillance-1.12.2/R/earsC.R r-cran-surveillance-1.13.0/R/earsC.R --- r-cran-surveillance-1.12.2/R/earsC.R 2013-06-26 14:15:21.000000000 +0000 +++ r-cran-surveillance-1.13.0/R/earsC.R 2016-12-02 20:38:39.000000000 +0000 @@ -22,10 +22,10 @@ # it defines an upperbound based on this value and on the variability # of past values # and then it compares the observed value with the upperbound. -# If the observed value is greater than the upperbound +# If the observed value is greater than the upperbound # then an alert is flagged. # Three methods are implemented. -# They do not use the same amount of past data +# They do not use the same amount of past data # and are expected to have different specificity and sensibility # from C1 to C3 # the amount of past data used increases, @@ -50,37 +50,53 @@ ###################################################################### earsC <- function(sts, control = list(range = NULL, method = "C1", - alpha = 0.001)) { + baseline = 7, minSigma = 0, + alpha = 0.001)) { ###################################################################### #Handle I/O ###################################################################### - #If list elements are empty fill them! 
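## [Editor's note, not part of the patch] A hedged sketch of calling earsC()
## with the new control options 'baseline' and 'minSigma' introduced in the
## signature above; 'stsObj' denotes a hypothetical "sts" object.
surv <- earsC(stsObj, control = list(method = "C1", baseline = 14,
                                     minSigma = 0.5, alpha = 0.001))
plot(surv)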
+ if (is.null(control[["baseline", exact = TRUE]])) { + control$baseline <- 7 + } + + if (is.null(control[["minSigma", exact = TRUE]])) { + control$minSigma <- 0 + } + baseline <- control$baseline + minSigma <- control$minSigma + + if(minSigma < 0) { + stop("The minimum sigma parameter (minSigma) needs to be positive") + } + if (baseline < 3) { + stop("Minimum baseline to use is 3.") + } # Method - if (is.null(control[["method",exact=TRUE]])) { + if (is.null(control[["method", exact = TRUE]])) { control$method <- "C1" } - + # Extracting the method method <- match.arg( control$method, c("C1","C2","C3"),several.ok=FALSE) - + # Range # By default it will take all possible weeks # which is not the same depending on the method if (is.null(control[["range",exact=TRUE]])) { if (method == "C1"){ - control$range <- c(8:dim(sts@observed)[1]) + control$range <- seq(from=baseline+1, to=dim(sts@observed)[1],by=1) } if (method == "C2"){ - control$range <- c(10:dim(sts@observed)[1]) + control$range <- seq(from=baseline+3, to=dim(sts@observed)[1],by=1) } if (method == "C3"){ - control$range <- c(12:dim(sts@observed)[1]) - } + control$range <- seq(from=baseline+5, to=dim(sts@observed)[1],by=1) + } } - + # zAlpha if (is.null(control[["alpha",exact=TRUE]])) { # C1 and C2: Risk of 1st type error of 10-3 @@ -97,13 +113,13 @@ # Calculating the threshold zAlpha zAlpha <- qnorm((1-control$alpha)) - - + + #Deduce necessary amount of data from method - maxLag <- switch(method, C1=7, C2=9, C3=11) - + maxLag <- switch(method, C1 = baseline, C2 = baseline+2, C3 = baseline+4) + # Order range in case it was not given in the right order - control$range = sort (control$range) + control$range = sort(control$range) ###################################################################### #Loop over all columns in the sts object @@ -112,58 +128,63 @@ for (j in 1:ncol(sts)) { # check if the vector observed includes all necessary data: maxLag values. 
- if((control$range[1] - maxLag) < 1) { + if((control$range[1] - maxLag) < 1) { stop("The vector of observed is too short!") - } + } ###################################################################### # Method C1 or C2 ###################################################################### - if (method %in% c("C1","C2")) { + if(method == "C1"){ + # construct the matrix for calculations + ndx <- as.vector(outer(control$range, + baseline:1, FUN = "-")) + refVals <- matrix(observed(sts)[,j][ndx], ncol = baseline) + + sts@upperbound[control$range, j] <- apply(refVals,1, mean) + + zAlpha * pmax(apply(refVals, 1, sd), minSigma) + } + + if (method == "C2") { + # construct the matrix for calculations + ndx <- as.vector(outer(control$range, + (baseline + 2):3, FUN = "-")) + refVals <- matrix(observed(sts)[,j][ndx], ncol = baseline) - # Create a matrix with time-lagged vectors - refVals <- NULL - for (lag in maxLag:(maxLag-6)) { - refVals <- cbind(refVals, observed(sts)[(control$range-lag),j]) - } - - # calculate the upperbound - sts@upperbound[control$range,j] <- apply(refVals,1,mean)+ - zAlpha*apply(refVals,1,sd) + sts@upperbound[control$range, j] <- apply(refVals,1, mean) + + zAlpha * pmax(apply(refVals, 1, sd), minSigma) } - - if (method=="C3") { - # Create a matrix with time-lagged vectors - refVals <- NULL - rangeC2 = ((min(control$range) - 2) : max(control$range)) - for (lag in 9:3) { - refVals <- cbind(refVals, observed(sts)[(rangeC2-lag),j]) - } - - # Calculate C2 - C2 <- (observed(sts)[rangeC2,j] - apply(refVals,1,mean)) / apply(refVals,1,sd) - # Calculate the upperbound - # first calculate the parts of the formula with the maximum of C2 and 0 for # two time lags. - partUpperboundLag2 = pmax(rep(0,length=length(C2)-2),C2[1:(length(C2)-2)]-1) - partUpperboundLag1 = pmax(rep(0,length=length(C2)-2),C2[2:(length(C2)-1)]-1) - - sts@upperbound[control$range,j] <- observed(sts)[control$range,j] + - apply(as.matrix(refVals[3:length(C2),]),1,sd)*(zAlpha - (partUpperboundLag2 + partUpperboundLag1)) - - # Upperbound must be superior to 0 which is not always the case - #with this formula - sts@upperbound[control$range,j] = pmax(rep(0,length(control$range)),sts@upperbound[control$range,j]) - } # end of loop over j - } #end of loop over cols in sts - #Make sts return object - control$name <- paste("EARS_",method,sep="") + if (method == "C3") { + # refVals <- NULL + rangeC2 = ((min(control$range) - 2):max(control$range)) + ##HB replacing loop: + ndx <- as.vector(outer(rangeC2, (baseline + 2):3, FUN = "-")) + refVals <- matrix(observed(sts)[,j][ndx], ncol = baseline) + + ##HB using argument 'minSigma' to avoid dividing by zero, huge zscores: + C2 <- (observed(sts)[rangeC2, j] - + apply(refVals, 1, mean))/ + pmax(apply(refVals, 1, sd), minSigma) + + partUpperboundLag2 <- pmax(rep(0, length = length(C2) - 2), + C2[1:(length(C2) - 2)] - 1) + + partUpperboundLag1 <- pmax(rep(0, length = length(C2) - 2), + C2[2:(length(C2) - 1)] - 1) + ##HB using argument 'minSigma' to avoid alerting threshold that is zero or too small + sts@upperbound[control$range, j] <- observed(sts)[control$range, j] + + pmax(apply(as.matrix(refVals[3:length(C2), ]),1, sd),minSigma) * + (zAlpha - (partUpperboundLag2 + partUpperboundLag1)) + sts@upperbound[control$range, j] = pmax(rep(0, length(control$range)), + sts@upperbound[control$range, j]) + } + } + + #Copy administrative information + control$name <- paste("EARS_", method, sep = "") control$data <- paste(deparse(substitute(sts))) sts@control <- control - #Where are the alarms? 
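## [Editor's note, not part of the patch] Toy illustration of the 'minSigma'
## floor used above: a constant baseline has standard deviation 0, so without
## the floor the C2/C3 z-scores would be infinite and the C1/C2 threshold
## would collapse to the baseline mean.
ref <- rep(5, 7)                               # 7 identical baseline counts
(8 - mean(ref)) / sd(ref)                      # Inf (division by zero)
(8 - mean(ref)) / max(sd(ref), 0.5)            # finite when minSigma = 0.5
mean(ref) + qnorm(0.999) * max(sd(ref), 0.5)   # C1-type upperbound with the floor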
- sts@alarm[control$range,] <- matrix(observed(sts)[control$range,]>upperbound(sts)[control$range,] ) - - #Done - return(sts[control$range,]) + sts@alarm[control$range, ] <- matrix(observed(sts)[control$range, ] > upperbound(sts)[control$range, ]) + return(sts[control$range, ]) } - diff -Nru r-cran-surveillance-1.12.2/R/hhh4_methods.R r-cran-surveillance-1.13.0/R/hhh4_methods.R --- r-cran-surveillance-1.12.2/R/hhh4_methods.R 2016-04-06 12:21:54.000000000 +0000 +++ r-cran-surveillance-1.13.0/R/hhh4_methods.R 2016-12-13 13:28:26.000000000 +0000 @@ -6,8 +6,8 @@ ### Standard methods for hhh4-fits ### ### Copyright (C) 2010-2012 Michaela Paul, 2012-2016 Sebastian Meyer -### $Revision: 1697 $ -### $Date: 2016-04-06 14:21:54 +0200 (Wed, 06. Apr 2016) $ +### $Revision: 1814 $ +### $Date: 2016-12-13 14:28:26 +0100 (Tue, 13. Dec 2016) $ ################################################################################ ## NOTE: we also apply print.hhh4 in print.summary.hhh4() @@ -374,9 +374,14 @@ } ## restrict fit to those epochs of control$subset which are <=subset.upper - if (isScalar(subset.upper)) + if (isScalar(subset.upper)) { + if (subset.upper > max(control$subset)) # potentially unintended usage + warning("using the original subset since 'subset.upper' is beyond") control$subset <- control$subset[control$subset <= subset.upper] - + if (length(control$subset) == 0) + stop("'subset.upper' is smaller than the lower bound ", + "of the original subset") + } ## fit the updated model or just return the modified control list if (evaluate) { diff -Nru r-cran-surveillance-1.12.2/R/hhh4_oneStepAhead.R r-cran-surveillance-1.13.0/R/hhh4_oneStepAhead.R --- r-cran-surveillance-1.12.2/R/hhh4_oneStepAhead.R 2016-04-01 19:40:25.000000000 +0000 +++ r-cran-surveillance-1.13.0/R/hhh4_oneStepAhead.R 2016-12-13 15:53:46.000000000 +0000 @@ -6,8 +6,8 @@ ### Compute one-step-ahead predictions (means) at a series of time points ### ### Copyright (C) 2011-2012 Michaela Paul, 2012-2016 Sebastian Meyer -### $Revision: 1687 $ -### $Date: 2016-04-01 21:40:25 +0200 (Fri, 01. Apr 2016) $ +### $Revision: 1816 $ +### $Date: 2016-12-13 16:53:46 +0100 (Tue, 13. 
Dec 2016) $ ################################################################################ @@ -37,18 +37,23 @@ model <- result[["terms"]] if (is.null(model)) model <- result$terms <- with(result, interpretControl(control, stsObj)) - nTime <- model$nTime - nUnits <- model$nUnits + nTime <- model$nTime # = nrow(result$stsObj) + nUnits <- model$nUnits # = ncol(result$stsObj) dimPsi <- model$nOverdisp withPsi <- dimPsi > 0L psiIdx <- model$nFE + model$nd + seq_len(dimPsi) ## check that tp is within the time period of the data - maxlag <- if (is.null(result$lags) || all(is.na(result$lags))) - 1L else max(result$lags, na.rm=TRUE) - stopifnot(tp %in% seq.int(maxlag,nTime-1L), length(tp) %in% 1:2) - if (length(tp) == 1) tp <- c(tp, max(model$subset)-1) - tps <- tp[1]:tp[2] + stopifnot(length(tp) %in% 1:2) + tpRange <- c(min(model$subset), max(model$subset)-1L) # supported range + if (type == "final") { # no re-fitting necessary + stopifnot(tp >= 0, tp <= nTime-1L) + } else if (any(tp < tpRange[1L] | tp > tpRange[2L])) { + stop("the time range defined by 'tp' must be a subset of ", + tpRange[1L], ":", tpRange[2L]) # because of how subset.upper works + } + if (length(tp) == 1) tp <- c(tp, tpRange[2L]) + tps <- tp[1L]:tp[2L] ntps <- length(tps) observed <- model$response[tps+1,,drop=FALSE] rownames(observed) <- tps+1 @@ -178,7 +183,7 @@ ## with shared overdispersion parameters we need to expand psi to ncol(pred) if (dimPsi > 1L && dimPsi != nUnits) { - psi <- psi[,model$indexPsi] + psi <- psi[,model$indexPsi,drop=FALSE] } ## done diff -Nru r-cran-surveillance-1.12.2/R/hhh4.R r-cran-surveillance-1.13.0/R/hhh4.R --- r-cran-surveillance-1.12.2/R/hhh4.R 2016-06-23 13:12:40.000000000 +0000 +++ r-cran-surveillance-1.13.0/R/hhh4.R 2016-12-12 10:07:39.000000000 +0000 @@ -7,8 +7,8 @@ ### The function allows the incorporation of random effects and covariates. ### ### Copyright (C) 2010-2012 Michaela Paul, 2012-2016 Sebastian Meyer -### $Revision: 1752 $ -### $Date: 2016-06-23 15:12:40 +0200 (Thu, 23. Jun 2016) $ +### $Revision: 1809 $ +### $Date: 2016-12-12 11:07:39 +0100 (Mon, 12. 
Dec 2016) $ ################################################################################ ## Error message issued in loglik, score and fisher functions upon NA parameters @@ -279,8 +279,16 @@ if (is.factor(control$family)) { stopifnot(length(control$family) == nUnit) - control$family <- droplevels(control$family) - names(control$family) <- colnames(stsObj) + ## guard against misuse as family = factor("Poisson"), e.g., if taken + ## from a data.frame of control options with "stringsAsFactors" + if (nUnit == 1 && as.character(control$family) %in% defaultControl$family) { + control$family <- as.character(control$family) + warning("'family = factor(\"", control$family, "\")' is interpreted ", + "as 'family = \"", control$family, "\"'") + } else { + control$family <- droplevels(control$family) + names(control$family) <- colnames(stsObj) + } } else { control$family <- match.arg(control$family, defaultControl$family) } @@ -891,16 +899,17 @@ ## auxiliary function used in penScore and penFisher ## it sums colSums(x) within the groups defined by f (of length ncol(x)) +## and returns these sums in the order of levels(f) .colSumsGrouped <- function (x, f, na.rm = TRUE) { nlev <- nlevels(f) - if (nlev == 1L) { # all columns belong to the same group + if (nlev == 1L) { # all columns belong to the same group ("NegBin1") sum(x, na.rm = na.rm) } else { dimx <- dim(x) colsums <- .colSums(x, dimx[1L], dimx[2L], na.rm = na.rm) - if (nlev == dimx[2L]) { # each column is its own group - colsums[order(f)] + if (nlev == dimx[2L]) { # each column separately ("NegBinM" or factor) + colsums[order(f)] # for NegBinM, order(f)==1:nlev, not in general } else { # sum colsums within groups unlist(lapply( X = split.default(colsums, f, drop = FALSE), diff -Nru r-cran-surveillance-1.12.2/R/hhh4_simulate_plot.R r-cran-surveillance-1.13.0/R/hhh4_simulate_plot.R --- r-cran-surveillance-1.12.2/R/hhh4_simulate_plot.R 2015-10-16 12:47:57.000000000 +0000 +++ r-cran-surveillance-1.13.0/R/hhh4_simulate_plot.R 2016-12-20 10:25:32.000000000 +0000 @@ -6,9 +6,9 @@ ### Plots for an array "hhh4sims" of simulated counts from an "hhh4" model, ### or a list thereof as produced by different "hhh4" models (same period!) ### -### Copyright (C) 2013-2015 Sebastian Meyer -### $Revision: 1493 $ -### $Date: 2015-10-16 14:47:57 +0200 (Fri, 16. Oct 2015) $ +### Copyright (C) 2013-2016 Sebastian Meyer +### $Revision: 1820 $ +### $Date: 2016-12-20 11:25:32 +0100 (Tue, 20. Dec 2016) $ ################################################################################ plot.hhh4sims <- function (x, ...) 
{ @@ -132,7 +132,9 @@ check_groups <- function (groups, units) { - if (isTRUE(groups)) { + if (is.null(groups)) { + factor(rep.int("overall", length(units))) + } else if (isTRUE(groups)) { factor(units, levels = units) } else { stopifnot(length(groups) == length(units)) @@ -144,37 +146,41 @@ groups = NULL, par.settings = list()) { FUN <- paste("plotHHH4sims", match.arg(type), sep = "_") - if (is.null(groups)) - return(do.call(FUN, list(quote(x), ...))) - - ## stratified plots by groups of units groups <- check_groups(groups, colnames(attr(x, "stsObserved"))) - + ngroups <- nlevels(groups) if (is.list(par.settings)) { - par.defaults <- list(mfrow = sort(n2mfrow(nlevels(groups))), - mar = c(4,4,2,0.5)+.1, las = 1) + par.defaults <- list(mar = c(4,4,2,0.5)+.1, las = 1) + if (ngroups > 1) + par.defaults$mfrow <- sort(n2mfrow(ngroups)) par.settings <- modifyList(par.defaults, par.settings) opar <- do.call("par", par.settings) on.exit(par(opar)) } - - invisible(sapply( - X = levels(groups), - FUN = function (group) { - x_group <- x[, which(group == groups) , ] - do.call(FUN, list(quote(x_group), ..., main = group)) - }, - simplify = FALSE, USE.NAMES = TRUE)) + + if (ngroups == 1) { + do.call(FUN, list(quote(x), ...)) + } else { # stratified plots by groups of units + invisible(sapply( + X = levels(groups), + FUN = function (group) { + x_group <- x[, which(group == groups) , ] # [-method has drop=F + do.call(FUN, list(quote(x_group), ..., main = group)) + }, + simplify = FALSE, USE.NAMES = TRUE)) + } } ### simulated final size distribution as boxplots aggregated over all units plotHHH4sims_size <- function (x, horizontal = TRUE, trafo = NULL, - observed = TRUE, ...) + observed = TRUE, names = base::names(x), ...) { x <- as.hhh4simslist(x) - if (horizontal) x <- rev(x) + if (horizontal) { + names <- rev(names) + x <- rev(x) + } if (is.null(trafo)) #trafo <- scales::identity_trans() trafo <- list(name = "identity", transform = identity) if (isTRUE(observed)) observed <- list() @@ -200,7 +206,8 @@ ## generate boxplots boxplot.args <- modifyList(defaultArgs, list(...)) boxplot.args$horizontal <- horizontal - do.call("boxplot", c(list(nsimstrafo), boxplot.args)) + boxplot.args$names <- names + do.call("boxplot", c(list(x=nsimstrafo), boxplot.args)) ## add means if (horizontal) { @@ -321,7 +328,14 @@ ## add legend if (!identical(FALSE, legend)) { - legendArgs <- list(x="topright", legend=names(x), bty="n", + xnames <- if (is.vector(legend, mode = "character")) { + if (length(legend) != length(x)) + warning("'length(legend)' should be ", length(x)) + legend + } else { + names(x) + } + legendArgs <- list(x="topright", legend=xnames, bty="n", col=col, lwd=matplot.args$lwd, lty=matplot.args$lty) if (is.list(legend)) legendArgs <- modifyList(legendArgs, legend) diff -Nru r-cran-surveillance-1.12.2/R/hhh4_simulate.R r-cran-surveillance-1.13.0/R/hhh4_simulate.R --- r-cran-surveillance-1.12.2/R/hhh4_simulate.R 2016-03-16 11:11:48.000000000 +0000 +++ r-cran-surveillance-1.13.0/R/hhh4_simulate.R 2016-12-12 11:31:07.000000000 +0000 @@ -6,8 +6,8 @@ ### Simulate from a HHH4 model ### ### Copyright (C) 2012 Michaela Paul, 2013-2016 Sebastian Meyer -### $Revision: 1635 $ -### $Date: 2016-03-16 12:11:48 +0100 (Wed, 16. Mar 2016) $ +### $Revision: 1810 $ +### $Date: 2016-12-12 12:31:07 +0100 (Mon, 12. 
Dec 2016) $ ################################################################################ @@ -61,7 +61,11 @@ ## get fitted components nu_it (with offset), phi_it, lambda_it, t in subset model <- terms.hhh4(object) means <- meanHHH(theta, model, subset=subset) + + ## extract overdispersion parameters (simHHH4 assumes psi->0 means Poisson) psi <- splitParams(theta,model)$overdisp + if (length(psi) > 1) # "NegBinM" or shared overdispersion parameters + psi <- psi[model$indexPsi] ## weight matrix/array of the ne component neweights <- getNEweights(object, coefW(theta)) @@ -124,20 +128,31 @@ nTime <- nrow(end) nUnits <- ncol(end) + ## check and invert psi since rnbinom() uses different parametrization + size <- if (length(psi) == 0 || + isTRUE(all.equal(psi, 0, check.attributes=FALSE))) { + NULL # Poisson + } else { + if (!length(psi) %in% c(1, nUnits)) + stop("'length(psi)' must be ", + paste(unique(c(1, nUnits)), collapse = " or "), + " (number of units)") + 1/psi + } + ## simulate from Poisson or NegBin model - rdistr <- if (length(psi)==0 || - isTRUE(all.equal(psi, 0, check.attributes=FALSE))) { + rdistr <- if (is.null(size)) { rpois } else { - psi.inv <- 1/psi # since R uses different parametrization - ## draw 'n' samples from NegBin with mean vector 'mean' (length=nUnits) - ## and overdispersion psi such that Variance = mean + psi*mean^2 + ## unit-specific 'mean's and variance = mean + psi*mean^2 ## where 'size'=1/psi and length(psi) == 1 or length(mean) - function(n, mean) rnbinom(n, mu = mean, size = psi.inv) + function(n, mean) rnbinom(n, mu = mean, size = size) } ## if only endemic component -> simulate independently if (all(ar + ne == 0)) { + if (!is.null(size)) + size <- matrix(size, nTime, nUnits, byrow = TRUE) return(matrix(rdistr(length(end), end), nTime, nUnits)) } @@ -176,7 +191,7 @@ checkCoefs <- function (object, coefs, reparamPsi=TRUE) { - theta <- coef(object, reparamPsi=reparamPsi) #-> computes 1/exp(logpsi) + theta <- coef(object, reparamPsi=reparamPsi) if (length(coefs) != length(theta)) stop(sQuote("coefs"), " must be of length ", length(theta)) names(coefs) <- names(theta) diff -Nru r-cran-surveillance-1.12.2/R/sts_animate.R r-cran-surveillance-1.13.0/R/sts_animate.R --- r-cran-surveillance-1.12.2/R/sts_animate.R 2016-07-27 02:16:06.000000000 +0000 +++ r-cran-surveillance-1.13.0/R/sts_animate.R 2016-12-01 14:11:02.000000000 +0000 @@ -6,8 +6,8 @@ ### Animated map (and time series chart) of an sts-object (or matrix of counts) ### ### Copyright (C) 2013-2016 Sebastian Meyer -### $Revision: 1761 $ -### $Date: 2016-07-27 04:16:06 +0200 (Wed, 27. Jul 2016) $ +### $Revision: 1802 $ +### $Date: 2016-12-01 15:11:02 +0100 (Thu, 01. 
Dec 2016) $ ################################################################################ @@ -25,8 +25,15 @@ message("Advice: use facilities of the \"animation\" package, e.g.,\n", " saveHTML() to view the animation in a web browser.") + if (is.null(tps)) + tps <- seq_len(nrow(object)) + if (!is.null(population)) { # get population matrix + population <- parse_population_argument(population, object) + } + ## determine color breaks (checkat() is defined in stsplot_space.R) - at <- checkat(at, data=.rangeOfDataToPlot(object, tps, cumulative, population)) + at <- checkat(at, data=.rangeOfDataToPlot(object, tps, cumulative, population), + counts=is.null(population)) ## style of the additional temporal plot if (is.list(timeplot)) { @@ -37,8 +44,6 @@ stopifnot(timeplot_height > 0, timeplot_height < 1) } - if (is.null(tps)) - tps <- seq_len(nrow(object)) if (verbose) pb <- txtProgressBar(min=0, max=length(tps), initial=0, style=3) grobs <- vector(mode = "list", length = length(tps)) @@ -100,7 +105,8 @@ stsplot_timeSimple <- function (x, tps = NULL, highlight = integer(0), inactive = list(col="gray", lwd=1), - active = list(col=1, lwd=4), ...) + active = list(col=1, lwd=4), + as.Date = x@epochAsDate, ...) { observed <- if (inherits(x, "sts")) observed(x) else x if (is.null(tps)) { @@ -108,6 +114,10 @@ } else { observed <- observed[tps,,drop=FALSE] } + epoch <- if (inherits(x, "sts")) epoch(x, as.Date = as.Date)[tps] else tps + + if (anyNA(observed)) + warning("ignoring NA counts in time series plot") ## build highlight-specific style vectors (col, lwd, ...) stopifnot(is.list(inactive), is.list(active)) @@ -124,7 +134,8 @@ key.axis.padding = 0) ) xyplot.args <- modifyList( - c(list(x=rowSums(observed) ~ tps, type="h", ylab="", xlab="", + c(list(x = rowSums(observed, na.rm = TRUE) ~ epoch, + type = "h", ylab = "", xlab = "", par.settings = par_no_top_padding), styleargs), list(...)) @@ -134,17 +145,18 @@ ### determine data range for automatic color breaks 'at' -.rangeOfDataToPlot <- function (object, tps = NULL, cumulative = FALSE, +.rangeOfDataToPlot <- function (object, tps, cumulative = FALSE, population = NULL) { observed <- if (inherits(object, "sts")) observed(object) else object - if (!is.null(tps)) { - observed <- observed[tps,,drop=FALSE] - } - if (!is.null(population)) { # compute prevalence - stopifnot(is.vector(population, mode="numeric"), - length(population) == ncol(object)) - observed <- observed / rep(population, each=nrow(observed)) + observed <- observed[tps,,drop=FALSE] + if (!is.null(population)) { # compute (cumulative) incidence + observed <- if (cumulative) { + observed / rep(population[tps[1L],], each = nrow(observed)) + } else { + observed / population[tps,,drop=FALSE] + } } - range(if (cumulative) c(observed[1L,], colSums(observed)) else observed) + range(if (cumulative) c(observed[1L,], colSums(observed)) else observed, + na.rm = TRUE) } diff -Nru r-cran-surveillance-1.12.2/R/sts_coerce.R r-cran-surveillance-1.13.0/R/sts_coerce.R --- r-cran-surveillance-1.12.2/R/sts_coerce.R 2016-04-01 19:06:11.000000000 +0000 +++ r-cran-surveillance-1.13.0/R/sts_coerce.R 2016-11-29 11:46:00.000000000 +0000 @@ -10,13 +10,13 @@ start <- start(from) if (length(start) == 1) stop("could not convert time series start() to (year, index) form") - + ## Remove "tsp" attribute and "ts"/"mts" class tsp(from) <- NULL ## "tsp<-"(x,NULL) is documented to also remove "ts" and "mts" classes ## but in R < 3.3.0, it did not remove "mts" (see PR#16769) from <- unclass(from) - + ## Create the sts object 
.sts(observed = from, start = start, freq = freq) }) @@ -53,41 +53,38 @@ # to freq (e.g. used for regression model) ###################################################################### -setMethod("as.data.frame", signature(x="sts"), function(x,row.names = NULL, optional = FALSE, ...) { +setMethod("as.data.frame", signature(x="sts"), function(x,row.names = NULL, optional = FALSE, as.Date=x@epochAsDate, ...) { #Convert object to data frame and give names - res <- data.frame("observed"=x@observed, "epoch"=x@epoch, "state"=x@state, "alarm"=x@alarm,"population"=x@populationFrac) + res <- data.frame("observed"=x@observed, "epoch"=epoch(x,as.Date=as.Date), "state"=x@state, "alarm"=x@alarm,"upperbound"=x@upperbound,"population"=x@populationFrac) if (ncol(x) > 1) { colnames(res) <- c(paste("observed.",colnames(x@observed),sep=""),"epoch", paste("state.",colnames(x@observed),sep=""), paste("alarm.",colnames(x@observed),sep=""), + paste("upperbound.",colnames(x@upperbound),sep=""), paste("population.",colnames(x@observed),sep="")) } else { - colnames(res) <- c("observed","epoch","state","alarm","population") + colnames(res) <- c("observed","epoch","state","alarm","upperbound","population") } - - #Add a column denoting the number of week - if (x@epochAsDate) { - #Convert to date - date <- as.Date(x@epoch, origin="1970-01-01") - epochStr <- switch( as.character(x@freq), + + #Find out how many epochs there are each year + res$freq <- if (x@epochAsDate) { + date <- epoch(x) + epochStr <- switch( as.character(x@freq), "12" = "%m", "52" = "%V", "365" = "%j") - - #Find out how many epochs there are each year years <- unique(as.numeric(formatDate(date,"%Y"))) dummyDates <- as.Date(paste(rep(years,each=6),"-12-",26:31,sep="")) maxEpoch <- tapply( as.numeric(formatDate(dummyDates, epochStr)), rep(years,each=6), max) - #Assign this to result - res$freq <- maxEpoch[pmatch(formatDate(date,"%Y"),names(maxEpoch),duplicates.ok=TRUE)] - res$epochInPeriod <- as.numeric(formatDate(date,epochStr)) / res$freq - } else { - #Otherwise just replicate the fixed frequency - res$freq <- x@freq - res$epochInPeriod <- x@epoch %% res$freq + maxEpoch[pmatch(formatDate(date,"%Y"),names(maxEpoch),duplicates.ok=TRUE)] + } else { # just replicate the fixed frequency + x@freq } - + + #Add a column denoting the epoch fraction within the current year + res$epochInPeriod <- epochInYear(x) / res$freq + return(res) }) diff -Nru r-cran-surveillance-1.12.2/R/stsplot_space.R r-cran-surveillance-1.13.0/R/stsplot_space.R --- r-cran-surveillance-1.12.2/R/stsplot_space.R 2016-03-16 12:37:42.000000000 +0000 +++ r-cran-surveillance-1.13.0/R/stsplot_space.R 2016-12-01 14:11:02.000000000 +0000 @@ -6,23 +6,22 @@ ### Snapshot map (spplot) of an sts-object or matrix of counts ### ### Copyright (C) 2013-2014,2016 Sebastian Meyer -### $Revision: 1642 $ -### $Date: 2016-03-16 13:37:42 +0100 (Wed, 16. Mar 2016) $ +### $Revision: 1802 $ +### $Date: 2016-12-01 15:11:02 +0100 (Thu, 01. Dec 2016) $ ################################################################################ ## x: "sts" or (simulated) matrix of counts ## tps: one or more time points. The unit-specific _sum_ of time points "tps" is ## plotted. tps=NULL means cumulation over all time points in x. ## at: number of levels for the grouped counts or specific break points to -## use, or list(n, data, trafo) passed to getCountIntervals(), +## use, or list(n, data, trafo) passed to getPrettyIntervals(), ## where data and trafo are optional. 
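##       e.g. (hypothetical values, for illustration only): at = 10 for roughly
##       ten automatic sqrt-scale levels, at = c(0, 1, 5, 10, 50) for explicit
##       right-open breaks, or at = list(n = 8) to let getPrettyIntervals()
##       choose about 8 breaks from the data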
## CAVE: intervals are closed on the left and open to the right. ## From panel.levelplot: zcol[z >= at[i] & z < at[i + 1]] <- i ## i.e. at=0:1 will have NA (=also white) for counts=1, thus we have to ## ensure max(at) > max(counts) -stsplot_space <- function (x, tps = NULL, map = x@map, - population = NULL, # nUnits-vector of population counts +stsplot_space <- function (x, tps = NULL, map = x@map, population = NULL, main = NULL, labels = FALSE, at = 10, col.regions = NULL, colorkey = list(space="bottom", labels=list(at=at)), @@ -32,20 +31,22 @@ xlim = bbox(map)[1, ], ylim = bbox(map)[2, ], ...) { counts <- if (inherits(x, "sts")) observed(x) else x + if (is.null(tps)) + tps <- seq_len(nrow(counts)) if (length(map) == 0L) stop("no map") if (is.null(colnames(counts))) stop("need 'colnames(x)' (to be matched against 'row.names(map)')") if (!all(colnames(counts) %in% row.names(map))) stop("incomplete 'map'; ensure that 'all(colnames(x) %in% row.names(map))'") - + ## compute data to plot ncases <- getCumCounts(counts, tps) total <- sum(ncases) - if (!is.null(population)) { # plot prevalence - stopifnot(is.vector(population, mode="numeric"), - length(population) == length(ncases)) - ncases <- ncases / population - total <- total / sum(population) + if (!is.null(population)) { # divide counts by region-specific population + population <- parse_population_argument(population, x) # pop matrix + populationByRegion <- population[tps[1L],] # pop at first time point + ncases <- ncases / populationByRegion # (cumulative) incidence by region + total <- total / sum(populationByRegion) } ## add ncases to map@data @@ -58,10 +59,10 @@ main <- stsTimeRange2text(x, tps) ## check/determine color break points 'at' - at <- checkat(at, ncases) + at <- checkat(at, ncases, counts = is.null(population)) ## default color palette if (is.null(col.regions)) { - separate0 <- at[1] == 0 & at[2] <= 1 + separate0 <- is.null(population) && at[1] == 0 && at[2] <= 1 col.regions <- c( if (separate0) "white", hcl.colors(ncolors=length(at)-1-separate0, @@ -108,38 +109,65 @@ ## sum of counts by unit over time points "tps" ## the resulting vector has no names -getCumCounts <- function (counts, tps=NULL, nUnits=ncol(counts)) +getCumCounts <- function (counts, tps) +{ + ntps <- length(tps) + if (ntps == 1) { + counts[tps,] + } else { + .colSums(counts[tps,,drop=FALSE], ntps, ncol(counts)) + } +} + +parse_population_argument <- function (population, x) { - if (!is.null(tps)) counts <- counts[tps,,drop=FALSE] - ntps <- nrow(counts) - if (ntps == 1) c(counts) else .colSums(counts, ntps, nUnits) + if (is.matrix(population)) { + if (!identical(dim(population), dim(x))) + stop("'dim(population)' does not match the data dimensions") + } else if (isScalar(population)) { # a unit, e.g., per 1000 inhabitants + if (!inherits(x, "sts")) + stop("'", deparse(substitute(x)), "' is no \"sts\" object; ", + "population numbers must be supplied") + population <- population(x) / population + } else { # region-specific population numbers (as in surveillance <= 1.12.2) + stopifnot(is.vector(population, mode = "numeric")) + if (length(population) != ncol(x)) + stop("'length(population)' does not match the number of data columns") + population <- rep(population, each = nrow(x)) + dim(population) <- dim(x) + } + population } -checkat <- function (at, data) { # "data" should be on the original scale +checkat <- function (at, data, counts = TRUE) { # for non-transformed "data" + data_range <- range(data, na.rm = TRUE) if (isScalar(at)) at <- list(n=at) at <- 
if (is.list(at)) { - at <- modifyList(list(n=10, data=data), at) - do.call("getCountIntervals", at) + at <- modifyList(list(n=10, data=data, counts=counts), at) + do.call("getPrettyIntervals", at) } else sort(at) if (any(data >= max(at) | data < min(at), na.rm=TRUE)) stop("'at' (right-open!) does not cover the data (range: ", - paste0(format(range(data)), collapse=" - "), ")") + paste0(format(data_range), collapse=" - "), ")") structure(at, checked=TRUE) } -getCountIntervals <- function (nInt, data, trafo=scales::sqrt_trans(), ...) { +getPrettyIntervals <- function (nInt, data, trafo=scales::sqrt_trans(), counts=TRUE, ...) { maxcount <- max(data, na.rm=TRUE) - if (maxcount < nInt) { # no aggregation of counts necessary + if (counts && maxcount < nInt) { # no aggregation of counts necessary at <- 0:ceiling(maxcount+sqrt(.Machine$double.eps)) # max(at) > maxcount } else { at <- if (requireNamespace("scales", quietly=TRUE)) { scales::trans_breaks(trafo$trans, trafo$inv, n=nInt+1, ...)(data) } else pretty(sqrt(data), n=nInt+1, ...)^2 - if (at[1] == 0 & at[2] > 1) # we want 0 counts separately ("white") + ## { # alternative: quantile-based scale (esp. for incidence plots) + ## quantile(data, probs=seq(0,1,length.out=nInt+1), na.rm=TRUE) + ## } + if (counts && at[1] == 0 && at[2] > 1) # we want 0 counts separately ("white") at <- sort(c(1, at)) - if (at[length(at)] == maxcount) # ensure max(at) > maxcount - at[length(at)] <- at[length(at)] + 1 + if (at[length(at)] == maxcount) # ensure max(at) > max(data) + at[length(at)] <- at[length(at)] + if (counts) 1 else 0.001*diff(range(at)) } at } @@ -148,9 +176,8 @@ sprintf(fmt, year(stsObj)[tps], epochInYear(stsObj)[tps]) } -stsTimeRange2text <- function (stsObj, tps=NULL, fmt="%i/%i", sep=" - ") +stsTimeRange2text <- function (stsObj, tps, fmt="%i/%i", sep=" - ") { - tpsRange <- if (is.null(tps)) c(1, nrow(stsObj)) else range(tps) - tpsRangeYW <- stsTime2text(stsObj, tps=tpsRange, fmt=fmt) + tpsRangeYW <- stsTime2text(stsObj, tps=range(tps), fmt=fmt) paste0(unique(tpsRangeYW), collapse=sep) } diff -Nru r-cran-surveillance-1.12.2/tests/testthat/test-earsc.R r-cran-surveillance-1.13.0/tests/testthat/test-earsc.R --- r-cran-surveillance-1.12.2/tests/testthat/test-earsc.R 1970-01-01 00:00:00.000000000 +0000 +++ r-cran-surveillance-1.13.0/tests/testthat/test-earsc.R 2016-12-03 07:25:48.000000000 +0000 @@ -0,0 +1,63 @@ +context("earsC method") + +test_that("earsC returns a sts object", { + #Sim data and convert to sts object + disProgObj <- sim.pointSource(p = 0.99, r = 0.5, length = 208, A = 1, + alpha = 1, beta = 0, phi = 0, + frequency = 1, state = NULL, K = 1.7) + stsObj = disProg2sts( disProgObj) + + res1 <- earsC(stsObj, control = list(range = 20:208, method = "C1")) + res2 <- earsC(stsObj, control = list(range = 20:208, method = "C2", + alpha = 0.05)) + + res3 <- earsC(stsObj, control = list(range = 20:208, method = "C3", sigma = 0.5)) + + expect_is(res1, "sts") + expect_is(res2, "sts") + expect_is(res3, "sts") + + data("salmNewport") + in2011 <- which(isoWeekYear(epoch(salmNewport))$ISOYear == 2011) + salmNewportGermany <- aggregate(salmNewport, by = "unit") + control <- list(range = in2011, method = "C1", alpha = 0.05) + surv <- earsC(salmNewportGermany, control = control) + + expect_is(surv, "sts") + expect_true(max(surv@upperbound[1:4] - + c(3.278854, 3.278854, 3.436517, 3.855617)) < 0.000001) +}) + +test_that("earsC returns error messages",{ + data("salmNewport") + salmNewportGermany <- aggregate(salmNewport, by = "unit") + control <- list(range 
= length(salmNewportGermany), method = "C1", alpha = 0.05, + baseline = 2) + expect_error(earsC(salmNewportGermany, control = control), + "Minimum baseline to use is 3.") + + control <- list(range = length(salmNewportGermany), method = "C1", alpha = 0.05, + minSigma = - 2) + expect_error(earsC(salmNewportGermany, control = control), + "The minimum sigma parameter") + + in2011 <- which(isoWeekYear(epoch(salmNewport))$ISOYear == 2011) + control <- list(range = in2011, method = "C1", alpha = 0.05, + baseline = 1500) + expect_error(earsC(salmNewportGermany, control = control), + "The vector of observed is too short!") + +}) + +test_that("The range is well defined",{ + data("salmNewport") + salmNewportGermany <- aggregate(salmNewport, by = "unit") + control <- list(range = length(salmNewportGermany), method = "C1", alpha = 0.05, + baseline = 2) + surv <- earsC(salmNewportGermany, + control = list(method = "C1", + baseline = 10)) + + expect_true(length(surv@upperbound) == length(salmNewportGermany@observed) - 10) + +}) diff -Nru r-cran-surveillance-1.12.2/tests/testthat/test-hhh4_NegBinGrouped.R r-cran-surveillance-1.13.0/tests/testthat/test-hhh4_NegBinGrouped.R --- r-cran-surveillance-1.12.2/tests/testthat/test-hhh4_NegBinGrouped.R 2015-11-25 15:15:35.000000000 +0000 +++ r-cran-surveillance-1.13.0/tests/testthat/test-hhh4_NegBinGrouped.R 2016-12-08 11:03:29.000000000 +0000 @@ -46,9 +46,12 @@ } +## fit a model with unit-specific overdispersion parameters using "NegBinM", +## equal to family = factor(colnames(fluBWsub), levels=colnames(fluBWsub)) +fluFitM <- hhh4(stsObj = fluBWsub, control = c(fluModel, list( + family = "NegBinM"))) + test_that("\"NegBinM\" fit is invariant to the ordering of the overdispersion parameters", { - fluFitM <- hhh4(stsObj = fluBWsub, control = c(fluModel, list( - family = "NegBinM"))) # identical to factor(colnames(fluBWsub), levels=colnames(fluBWsub)) fluFitM_reordered <- hhh4(stsObj = fluBWsub, control = c(fluModel, list( family = factor(colnames(fluBWsub), levels=rev(colnames(fluBWsub)))))) expect_equal(fluFitM_reordered$loglikelihood, @@ -109,3 +112,15 @@ expect_equal(calibrationTest(osa_final, which = "dss")[idx], calibrationTest(fluFitShared, which = "dss", subset = mysubset)[idx]) }) + +test_that("simulation correctly uses shared overdispersion parameters", { + fluSimShared <- simulate(fluFitShared, seed = 1) + ## simulate from the NegBinM model using the estimates from the shared fit + psiShared <- coeflist(fluFitShared)$fixed$overdisp + psiByUnit <- psiShared[fluFitShared$control$family] + names(psiByUnit) <- paste0("overdisp.", names(fluFitShared$control$family)) + coefsM <- c(coef(fluFitShared), psiByUnit)[names(coef(fluFitM))] + fluSimSharedM <- simulate(fluFitM, seed = 1, coefs = coefsM) + expect_identical(observed(fluSimShared), observed(fluSimSharedM)) + ## fails for surveillance 1.12.2 +}) diff -Nru r-cran-surveillance-1.12.2/vignettes/monitoringCounts.Rnw r-cran-surveillance-1.13.0/vignettes/monitoringCounts.Rnw --- r-cran-surveillance-1.12.2/vignettes/monitoringCounts.Rnw 2016-11-06 21:26:20.000000000 +0000 +++ r-cran-surveillance-1.13.0/vignettes/monitoringCounts.Rnw 2016-11-28 14:15:36.000000000 +0000 @@ -10,7 +10,7 @@ \newcommand{\NB}{\operatorname{NB}} %% almost as usual -\author{Ma\"elle Salmon\\Robert Koch Institute \And +\author{Ma\"elle Salmon\\Robert Koch Institute \And Dirk Schumacher\\Robert Koch Institute \And Michael H\"ohle\\ Stockholm University,\\Robert Koch Institute } \title{ @@ -65,7 +65,7 @@ E-mail: \email{maelle.salmon@yahoo.se}, 
\email{mail@dirk-schumacher.net}\\ URL: \url{https://masalmon.github.io/}\\ \phantom{URL: }\url{http://www.dirk-schumacher.net/}\\ - + Michael H\"{o}hle\\ Department of Mathematics\\ Stockholm University\\ @@ -101,43 +101,43 @@ ## create directories for plots and cache dir.create("plots", showWarnings=FALSE) dir.create("monitoringCounts-cache", showWarnings=FALSE) -@ +@ \SweaveOpts{prefix.string=plots/monitoringCounts} \label{sec:1} -The package provides a central S4 data class \code{sts} to capture multivariate or univariate time series. All further methods use objects of this class as an input. +The package provides a central S4 data class \code{sts} to capture multivariate or univariate time series. All further methods use objects of this class as an input. Therefore we first describe how to use the \code{sts} class and then, as all monitoring methods of the package conform to the same syntax, a typical call of a function for aberration detection will be presented. Furthermore, the visualization of time series and of the results of their monitoring is depicted. \subsection{How to store time series and related information} -In \pkg{surveillance}, time series of counts and related information are encoded in a specific S4-class called \code{sts} (\textit{surveillance time series}) that represents +In \pkg{surveillance}, time series of counts and related information are encoded in a specific S4-class called \code{sts} (\textit{surveillance time series}) that represents possibly multivariate time series of counts. Denote the counts as $\left( y_{it} ; i = 1, \ldots,m, t = 1, \ldots, n \right)$, where $n$ is the length of the time series and $m$ is the number of entities, e.g., geographical regions, hospitals or age groups, being -monitored. An example which we shall look at in more details is a time series representing the weekly counts of cases of infection with \textit{Salmonella Newport} in all 16 federal states of Germany +monitored. An example which we shall look at in more details is a time series representing the weekly counts of cases of infection with \textit{Salmonella Newport} in all 16 federal states of Germany from 2004 to 2013 with $n=525$ weeks and $m=16$ geographical units. Infections with \textit{Salmonella Newport}, a subtype of \textit{Salmonella}, can trigger gastroenteritis, prompting the seek of medical care. Infections with \textit{Salmonella} are notifiable in Germany since 2001 with data being forwarded to the RKI by federal states health authorities on behalf of the local health authorities. - + \subsubsection[Slots of the class sts]{Slots of the class \texttt{sts}} The key slots of the \code{sts} class are those describing the observed counts and the corresponding time periods of the aggregation. The observed counts $\left(y_{it}\right)$ are stored in the $n \times m$ matrix \code{observed}. -A number of other slots characterize time. First, \code{epoch} denotes the corresponding time period of the aggregation. If the Boolean \code{epochAsDate} is \code{TRUE}, -\code{epoch} is the numeric representation of \code{Date} objects corresponding to each observation in \code{observed}. If the Boolean \code{epochAsDate} is \code{FALSE}, +A number of other slots characterize time. First, \code{epoch} denotes the corresponding time period of the aggregation. If the Boolean \code{epochAsDate} is \code{TRUE}, +\code{epoch} is the numeric representation of \code{Date} objects corresponding to each observation in \code{observed}. 
If the Boolean \code{epochAsDate} is \code{FALSE}, \code{epoch} is the time index $1 \leq t \leq n$ of each of these observations. Then, \code{freq} is the number of observations per year: 365 for -daily data, 52 for weekly data and 12 for monthly data. +daily data, 52 for weekly data and 12 for monthly data. Finally, \code{start} is a vector representing the origin of the time series with two values that are the year and the epoch within that year for the first observation of the time series -- \code{c(2014, 1)} for a weekly time series starting on the first week of 2014 for instance. Other slots enable the storage of additional information. Known aberrations are recorded in the Boolean slot \code{state} of the same dimensions as \code{observed} - with \code{TRUE} indicating an outbreak and \code{FALSE} indicating the absence of any known aberration. The monitored population in each of the units is stored in - slot \code{populationFrac}, which gives either proportions or numbers. The geography of the zone under surveillance is accessible through slot \code{map} + with \code{TRUE} indicating an outbreak and \code{FALSE} indicating the absence of any known aberration. The monitored population in each of the units is stored in + slot \code{populationFrac}, which gives either proportions or numbers. The geography of the zone under surveillance is accessible through slot \code{map} which is an object of class \code{SpatialPolygonsDataFrame}~\citep{sp1,sp2} providing a shape of the $m$ areas which are monitored and slot \code{neighbourhood}, which is a symmetric matrix of Booleans size $m^2$ stating the neighborhood -matrix. Slot \code{map} is pertinent when units are geographical units, whereas \code{neighbourhood} could be useful in any case, e.g., for storing a contact matrix between age groups for modeling purposes. +matrix. Slot \code{map} is pertinent when units are geographical units, whereas \code{neighbourhood} could be useful in any case, e.g., for storing a contact matrix between age groups for modeling purposes. Finally, if monitoring has been performed on the data the information on its control arguments and its results are stored in \code{control}, \code{upperbound} and \code{alarm} presented in Section~\ref{sec:howto}. \subsubsection[Creation of an object of class sts]{Creation of an object of class \texttt{sts}} -The creation of a \code{sts} object is straightforward, requiring a call to the function \code{new} together with the slots to be assigned as arguments. The input of data from external files is one possibility for getting the counts +The creation of a \code{sts} object is straightforward, requiring a call to the function \code{new} together with the slots to be assigned as arguments. The input of data from external files is one possibility for getting the counts as it is described in \citet{hoehle-mazick-2010}. To exemplify the process we shall use weekly counts of \textit{Salmonella Newport} in Germany loaded using \code{data("salmNewport")}. Alternatively, one can use coercion methods to convert between the \texttt{ts} class and the \texttt{sts} class. 
Note that this only converts the content of the slot \texttt{observed}, that is, @@ -172,9 +172,9 @@ cex.leg <- cex.text line.lwd <- 2#1 stsPlotCol <- c("mediumblue","mediumblue","red2") -alarm.symbol <- list(pch=17, col="red2", cex=2,lwd=3) +alarm.symbol <- list(pch=17, col="red2", cex=2,lwd=3) #Define list with arguments to use with do.call("legend", legOpts) -legOpts <- list(x="topleft",legend=c(expression(U[t])),bty="n",lty=1,lwd=line.lwd,col=alarm.symbol$col,horiz=TRUE,cex=cex.leg) +legOpts <- list(x="topleft",legend=c(expression(U[t])),bty="n",lty=1,lwd=line.lwd,col=alarm.symbol$col,horiz=TRUE,cex=cex.leg) #How should the par of each plot look? par.list <- list(mar=c(6,5,5,5),family="Times") #Do this once @@ -186,17 +186,17 @@ ylab="No. of reports", xlab="Time (weeks)",lty=c(1,1,1), legend.opts=legOpts,alarm.symbol=alarm.symbol, xaxis.tickFreq=list("%V"=atChange,"%m"=atChange,"%G"=atChange), - xaxis.labelFreq=list("%Y"=atMedian), + xaxis.labelFreq=list("%Y"=atMedian), xaxis.labelFormat="%Y", par.list=par.list,hookFunc=hookFunc) - + @ <>= -# Load data +# Load data data("salmNewport") @ -<>= -# Plot +<>= +# Plot y.max <- max(aggregate(salmNewport,by="unit")@observed,na.rm=TRUE) plotOpts2 <- modifyList(plotOpts,list(x=salmNewport,legend.opts=NULL,ylim=c(0,y.max),type = observed ~ time),keep.null=TRUE) plotOpts2$par.list <- list(mar=c(6,5,0,5),family="Times") @@ -205,7 +205,7 @@ @ \setkeys{Gin}{height=7cm, width=15cm} \begin{figure} -\begin{center} +\begin{center} <>= <> @ @@ -227,30 +227,30 @@ @ which is shown in Figure~\ref{fig:Newport}. Here, the \code{atChange} and \code{atMedian} functions are small helper functions and the respective tick lengths are controlled by the \pkg{surveillance} specific option \code{surveillance.options("stsTickFactors")}. Actually \code{sts} objects can be plotted using different options: \code{type = observed ~ time} produces the time series for whole Germany as shown in Figure~\ref{fig:Newport}, whereas \code{type = observed ~ time | unit} -is a panelled graph with each panel representing the time series of counts of a federal state as seen in Figure~\ref{fig:unit}. +is a panelled graph with each panel representing the time series of counts of a federal state as seen in Figure~\ref{fig:unit}. 
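For instance, a minimal sketch of this coercion with made-up weekly counts (note that the \code{start} of the \code{ts} object must be in (year, index) form so that it can be mapped to the \code{start} slot):

myts  <- ts(rpois(104, lambda = 5), start = c(2014, 1), frequency = 52)
mysts <- as(myts, "sts")   # carries over the observed counts, start and freq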
\setkeys{Gin}{height=7cm, width=9cm} \begin{figure} %\begin{center} %\hspace*{\fill}% \hspace{-1em} \subfloat[]{ -<>= +<>= y.max <- max(observed(salmNewport[,2]),observed(salmNewport[,3]),na.rm=TRUE) plotOpts2 <- modifyList(plotOpts,list(x=salmNewport[,2],legend.opts=NULL,ylim=c(0,y.max)),keep.null=TRUE) plotOpts2$xaxis.tickFreq <- list("%G"=atChange) do.call("plot",plotOpts2) @ - + \includegraphics[width=9cm]{plots/monitoringCounts-unitPlot1.pdf} }\hspace{-3em}% \subfloat[]{ -<>= +<>= # Plot with special function plotOpts2 <- modifyList(plotOpts,list(x=salmNewport[,3],legend.opts=NULL,ylim=c(0,y.max)),keep.null=TRUE) plotOpts2$xaxis.tickFreq <- list("%G"=atChange) do.call("plot",plotOpts2) -@ +@ \includegraphics[width=9cm]{plots/monitoringCounts-unitPlot2.pdf} } %\hspace*{\fill}% @@ -259,8 +259,8 @@ %\end{center} \end{figure} -Once created one can use typical subset operations on a \code{sts} object: for instance \code{salmNewport[} \code{1:10, "Berlin"]} is a new \code{sts} object -with weekly counts for Berlin during the 10 first weeks of the initial dataset; \code{salmNewport[isoWeekYear(epoch(salmNewport))\$ISOYear<=2010,]} uses the \code{surveillance}'s \code{isoWeekYear()} +Once created one can use typical subset operations on a \code{sts} object: for instance \code{salmNewport[} \code{1:10, "Berlin"]} is a new \code{sts} object +with weekly counts for Berlin during the 10 first weeks of the initial dataset; \code{salmNewport[isoWeekYear(epoch(salmNewport))\$ISOYear<=2010,]} uses the \code{surveillance}'s \code{isoWeekYear()} function to get a \code{sts} object with weekly counts for all federal states up to 2010. Moreover, one can take advantage of the \proglang{R} function \code{aggregate()}. For instance, \code{aggregate(salmNewport,by="unit")} returns a \code{sts} object representing weekly counts of \textit{Salmonella Newport} in Germany as a whole, whereas \code{aggregate(salmNewport, by = "time")} corresponds to the total count of cases in each federal state over the whole period. @@ -272,7 +272,7 @@ \subsubsection{Statistical framework for aberration detection} We introduce the framework for aberration detection on an univariate time series of counts $\left\{y_t,\> t=1,2,\ldots\right\}$. Surveillance aims -at detecting an \textit{aberration}, that is to say, an important change in the process occurring at an unknown time $\tau$. +at detecting an \textit{aberration}, that is to say, an important change in the process occurring at an unknown time $\tau$. This change can be a step increase of the counts of cases or a more gradual change~\citep{Sonesson2003}. Based on the possibility of such a change, for each time $t$ we want to differentiate between the two states \textit{in-control} and \textit{out-of-control}. At any timepoint $t_0\geq 1$, the available information -- i.e., past counts -- is defined as $\bm{y}_{t_0} = \left\{ @@ -281,9 +281,9 @@ Functions for aberration detection thus use past data to estimate $r(\bm{y}_{t_0})$, and compare it to the threshold $g$, above which the current count can be considered as suspicious and thus doomed as \textit{out-of-control}. 
-Threshold values and alarm Booleans for each timepoint of the monitored range are -saved in the slots \code{upperbound} and \code{alarm}, of the same dimensions as \code{observed}, while the method parameters used for computing the threshold values -and alarm Booleans +Threshold values and alarm Booleans for each timepoint of the monitored range are +saved in the slots \code{upperbound} and \code{alarm}, of the same dimensions as \code{observed}, while the method parameters used for computing the threshold values +and alarm Booleans are stored in the slot \code{control}. \subsubsection{Aberration detection in the package} @@ -296,27 +296,27 @@ Here we shall expand on C1 for which the baseline are the 7 timepoints before the assessed timepoint $t_0$, that is to say $\left(y_{t_0-7},\ldots,y_{t_0-1}\right)$. The expected value is the mean of the baseline. The method is based on a statistic called $C_{t_0}$ defined as -$C_{t_0}= \frac{(y_{t_0}-\bar{y}_{t_0})}{s_{t_0}}$, where $$\bar{y}_{t_0}= \frac{1}{7} \cdot\sum_{i=t_0-7}^{t_0-1} y_i \textnormal{ and } s_{t_0}^2= \frac{1}{7-1} \cdot\sum_{i=t_0-7}^{t_0-1} \left(y_i - \bar{y}_{t_0}\right)^2.$$ +$C_{t_0}= \frac{(y_{t_0}-\bar{y}_{t_0})}{s_{t_0}}$, where $$\bar{y}_{t_0}= \frac{1}{7} \cdot\sum_{i=t_0-7}^{t_0-1} y_i \textnormal{ and } s_{t_0}^2= \frac{1}{7-1} \cdot\sum_{i=t_0-7}^{t_0-1} \left(y_i - \bar{y}_{t_0}\right)^2.$$ Under the null hypothesis of no outbreak, it is assumed that $C_{t_0} \stackrel{H_0}{\sim} {N}(0,1)$. The upperbound $U_{t_0}$ is found by assuming that $y_t$ is normal, estimating parameters by plug-in and then taking the $(1-\alpha)$-th quantile of this distribution, i.e. $U_{t_0}= \bar{y}_{t_0} + z_{1-\alpha}s_{t_0}$, where $z_{1-\alpha}$ is the $(1-\alpha)$-quantile of the standard normal distribution. An alarm is raised if $y_{t_0} > U_{t_0}$. - The output of the algorithm is a \code{sts} object that contains subsets of slots \code{observed}, \code{population} and \code{state} defined by the range of timepoints specified in the input -- + The output of the algorithm is a \code{sts} object that contains subsets of slots \code{observed}, \code{population} and \code{state} defined by the range of timepoints specified in the input -- \textit{e.g} the last 20 timepoints of the time series, -and with the slots \code{upperbound} and \code{alarm} filled by the output of the algorithm. -Information relative to the \code{range} of data to be monitored -and to the parameters of the algorithm, such as \code{alpha} for \code{earsC}, has to be formulated in the slot \code{control}. -This information is also stored in the slot \code{control} of the returned \code{sts} object for later inspection. +and with the slots \code{upperbound} and \code{alarm} filled by the output of the algorithm. +Information relative to the \code{range} of data to be monitored +and to the parameters of the algorithm, such as \code{alpha} for \code{earsC}, has to be formulated in the slot \code{control}. +This information is also stored in the slot \code{control} of the returned \code{sts} object for later inspection. 
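As a small illustration of these formulas, the C1 upperbound can be computed by hand for made-up counts, with the last value playing the role of the monitored timepoint $t_0$:

y <- c(4, 2, 5, 3, 6, 4, 5, 12)   # 7 baseline weeks followed by the current week
baseline <- y[1:7]; y0 <- y[8]
U <- mean(baseline) + qnorm(1 - 0.05) * sd(baseline)   # alpha = 0.05
y0 > U   # TRUE: the current count exceeds U, so an alarm would be raised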
<>= in2011 <- which(isoWeekYear(epoch(salmNewport))$ISOYear == 2011) salmNewportGermany <- aggregate(salmNewport, by = "unit") control <- list(range = in2011, method = "C1", alpha = 0.05) -surv <- earsC(salmNewportGermany, control = control) +surv <- earsC(salmNewportGermany, control = control) plot(surv) @ -<>= +<>= # Range for the monitoring in2011 <- which(isoWeekYear(epoch(salmNewport))$ISOYear==2011) # Aggregate counts over Germany @@ -325,7 +325,7 @@ control <- list(range = in2011, method="C1", alpha=0.05) # Apply earsC function surv <- earsC(salmNewportGermany, control=control) -# Plot the results +# Plot the results #plot(surv) # Plot y.max <- max(observed(surv),upperbound(surv),na.rm=TRUE) @@ -333,8 +333,8 @@ @ \setkeys{Gin}{height=7cm, width=15cm} \begin{figure} -\begin{center} - +\begin{center} + <>= <> @ @@ -347,7 +347,7 @@ \end{figure} The \code{sts} object is easily visualized -using the function \code{plot} as depicted in Figure~\ref{fig:NewportEARS}, which shows the upperbound as a solid line and the alarms -- timepoints +using the function \code{plot} as depicted in Figure~\ref{fig:NewportEARS}, which shows the upperbound as a solid line and the alarms -- timepoints where the upperbound has been exceeded -- as triangles. The four last alarms correspond to a known outbreak in 2011 due to sprouts~\citep{Newport2011}. One sees that the upperbound right after the outbreak is affected by the outbreak: it is very high, so that a smaller outbreak would not be detected. @@ -357,8 +357,8 @@ account when estimating the expected count and the associated threshold. For instance, ignoring an increasing time trend could decrease sensitivity. Inversely, overlooking an annual surge in counts during the summer could decrease specificity. Therefore, it is advisable to use detection methods whose underlying models incorporate essential characteristics of time series of disease count data -such as overdispersion, seasonality, time trend and presence of past outbreaks in the records~\citep{Unkel2012,Shmueli2010}. Moreover, the EARS methods do not compute a proper prediction interval for the -current count. Sounder statistical methods will be reviewed in the next section. +such as overdispersion, seasonality, time trend and presence of past outbreaks in the records~\citep{Unkel2012,Shmueli2010}. Moreover, the EARS methods do not compute a proper prediction interval for the +current count. Sounder statistical methods will be reviewed in the next section. \section[Using surveillance in selected contexts]{Using \pkg{surveillance} in selected contexts} @@ -367,7 +367,7 @@ More than a dozen algorithms for aberration detection are implemented in the package. Among those, this section presents a set of representative algorithms, which are already in routine application at several public health institutions or which we think have the potential to become so. -First we describe the Farrington method introduced by~\citet{farrington96} together with the improvements proposed by~\citet{Noufaily2012}. +First we describe the Farrington method introduced by~\citet{farrington96} together with the improvements proposed by~\citet{Noufaily2012}. As a Bayesian counterpart to these methods we present the BODA method published by~\citet{Manitz2013} which allows the easy integration of covariates. All these methods perform one-timepoint detection in that they detect aberrations only when the count at the currently monitored timepoint is above the threshold. Hence, no accumulation of evidence takes place. 
As an extension, we introduce an implementation of the negative binomial cumulative sum (CUSUM) of~\citet{hoehle.paul2008} that allows the detection @@ -375,10 +375,10 @@ \subsection{One size fits them all for count data} Two implementations of the Farrington method, which is currently \textit{the} method of choice at European public health institutes \citep{hulth_etal2010}, exist in the package. First, the original method as described in \citet{farrington96} is implemented as the function \code{farrington}. Its use was already described in \citet{hoehle-mazick-2010}. -Now, the newly implemented function \code{farringtonFlexible} supports the use of this \textit{original method} as well as of the \textit{improved method} built on suggestions made by~\citet{Noufaily2012} for improving the specificity without reducing the sensitivity. +Now, the newly implemented function \code{farringtonFlexible} supports the use of this \textit{original method} as well as of the \textit{improved method} built on suggestions made by~\citet{Noufaily2012} for improving the specificity without reducing the sensitivity. -In the function \code{farringtonFlexible} one can choose to use the original method or the improved method by specification of appropriate \code{control} arguments. -Which variant of the algorithm is to be used is determined by the contents of the \code{control} slot. +In the function \code{farringtonFlexible} one can choose to use the original method or the improved method by specification of appropriate \code{control} arguments. +Which variant of the algorithm is to be used is determined by the contents of the \code{control} slot. In the example below, \code{control1} corresponds to the use of the original method and \code{control2} indicates the options for the improved method. <>= # Control slot for the original method @@ -386,7 +386,7 @@ b=4,w=3,weightsThreshold=1,pastWeeksNotIncluded=3, pThresholdTrend=0.05,thresholdMethod="delta",alpha=0.05, limit54=c(0,50)) -# Control slot for the improved method +# Control slot for the improved method control2 <- list(range=in2011,noPeriods=10, b=4,w=3,weightsThreshold=2.58,pastWeeksNotIncluded=26, pThresholdTrend=1,thresholdMethod="nbPlugin",alpha=0.05, @@ -404,22 +404,22 @@ @ In both cases the steps of the algorithm are the same. In a first step, an overdispersed Poisson generalized linear model with log link is fitted to the reference data $\bm{y}_{t_0} \subseteq \left\{ -y_t\>;\> t\leq t_0\right\}$, where $\E(y_t)=\mu_t$ with $\log \mu_t = \alpha + \beta t$ and $\Var(y_t)=\phi\cdot\mu_t$ and where $\phi\geq1$ is ensured. +y_t\>;\> t\leq t_0\right\}$, where $\E(y_t)=\mu_t$ with $\log \mu_t = \alpha + \beta t$ and $\Var(y_t)=\phi\cdot\mu_t$ and where $\phi\geq1$ is ensured. The original method took seasonality into account by using a subset of the available data as reference data for fitting the GLM: \code{w} timepoints centred around the timepoint located $1,2,\ldots,b$ years before $t_0$, amounting to a total $b \cdot (2w+1)$ reference values. However, it was shown in~\citet{Noufaily2012} that the algorithm performs better when using more historical data. In order to do do so without disregarding seasonality, the authors introduced a zero order spline with 11 knots, which can be conveniently represented as a 10-level factor. We have extended this idea in our implementation so that one can choose an arbitrary number of periods -in each year. 
Thus, $\log \mu_t = \alpha + \beta t +\gamma_{c(t)}$ where $\gamma_{c(t)}$ are the coefficients of a zero order spline with $\mathtt{noPeriods}+1$ knots, which can be conveniently represented as a $\mathtt{noPeriods}$-level factor that reflects seasonality. Here, $c(t)$ is a function indicating in which season or period of the year $t$ belongs to. -The algorithm uses \code{w}, \code{b} and \texttt{noPeriods} to deduce the length of periods so they have -the same length up to rounding. An exception is the reference window centred around $t_0$. Figure~\ref{fig:fPlot} shows a minimal example, where each character corresponds to a different period. Note that setting $\mathtt{noPeriods} = 1$ corresponds to using the original method with only a subset of the data: +in each year. Thus, $\log \mu_t = \alpha + \beta t +\gamma_{c(t)}$ where $\gamma_{c(t)}$ are the coefficients of a zero order spline with $\mathtt{noPeriods}+1$ knots, which can be conveniently represented as a $\mathtt{noPeriods}$-level factor that reflects seasonality. Here, $c(t)$ is a function indicating in which season or period of the year $t$ belongs to. +The algorithm uses \code{w}, \code{b} and \texttt{noPeriods} to deduce the length of periods so they have +the same length up to rounding. An exception is the reference window centred around $t_0$. Figure~\ref{fig:fPlot} shows a minimal example, where each character corresponds to a different period. Note that setting $\mathtt{noPeriods} = 1$ corresponds to using the original method with only a subset of the data: there is only one period defined per year, the reference window around $t_0$ and other timepoints are not included in the model. \setkeys{Gin}{height=3cm, width=7cm} \begin{figure} \subfloat[$\texttt{noPeriods}=2$]{ -<>= +<>= library(ggplot2) library(grid) # for rectanges @@ -456,20 +456,20 @@ annotate("text", label = "Time", x = 170, y = 0, size = 8, colour = "black", family="serif") + # ticks labels -annotate('text',label=c("t[0]-2 %.% freq","t[0]-freq","t[0]"),x = xTicks, - y = yTicksEnd - 10, size = 8,family="serif",parse=T) +annotate('text',label=c("t[0]-2 %.% freq","t[0]-freq","t[0]"),x = xTicks, + y = yTicksEnd - 10, size = 8,family="serif",parse=T) p+ # periods labels -annotate('text',label=c("A","A","A","B","B"),x = xPeriods, - y = rep(6,5), size = 8,family="serif",parse=T) +annotate('text',label=c("A","A","A","B","B"),x = xPeriods, + y = rep(6,5), size = 8,family="serif",parse=T) @ \includegraphics[width=0.45\textwidth]{plots/monitoringCounts-fPlot1.pdf} } \qquad \subfloat[$\texttt{noPeriods}=3$]{ -<>= +<>= yTicksBigEnd2 <- rep(0,4) yTicksBigStart2 <- rep(heightTick,4) newX <- c(xTicks[1:2]+widthRectangles/2+52-widthRectangles,xTicks[1:2]+52/2) @@ -477,63 +477,63 @@ p + geom_segment(aes(x = newX, y = yTicksBigStart2, xend = newX, yend = yTicksBigEnd2), size=1)+ # periods labels -annotate('text',label=c("A","A","A","B","B","C","C"),x = xPeriods, - y = rep(6,7), size = 8,family="serif",parse=T) +annotate('text',label=c("A","A","A","B","B","C","C"),x = xPeriods, + y = rep(6,7), size = 8,family="serif",parse=T) @ \includegraphics[width=0.45\textwidth]{plots/monitoringCounts-fPlot2.pdf} } -\caption{Construction of the noPeriods-level factor to account for seasonality, depending on the value of the half-window size $w$ and of the freq of the -data. Here the number of years to go back in the past $b$ is 2. -Each level of the factor variable corresponds to a period delimited by ticks and is denoted by a character. 
The windows around $t_0$ are respectively of size $2w+1$,~$2w+1$ -and $w+1$. The segments between them are divided into the other periods so that they +\caption{Construction of the noPeriods-level factor to account for seasonality, depending on the value of the half-window size $w$ and of the freq of the +data. Here the number of years to go back in the past $b$ is 2. +Each level of the factor variable corresponds to a period delimited by ticks and is denoted by a character. The windows around $t_0$ are respectively of size $2w+1$,~$2w+1$ +and $w+1$. The segments between them are divided into the other periods so that they have the same length up to rounding.} \label{fig:fPlot} \end{figure} -Moreover, it was shown in \citet{Noufaily2012} that it is better to exclude the last 26 weeks before $t_0$ from the baseline -in order to avoid reducing sensitivity when an outbreak has started recently before $t_0$. In the \code{farringtonFlexible} function, one controls this by specifying \code{pastWeeksNotIncluded}, which is +Moreover, it was shown in \citet{Noufaily2012} that it is better to exclude the last 26 weeks before $t_0$ from the baseline +in order to avoid reducing sensitivity when an outbreak has started recently before $t_0$. In the \code{farringtonFlexible} function, one controls this by specifying \code{pastWeeksNotIncluded}, which is the number of last timepoints before $t_0$ that are not to be used. The default value is 26. Lastly, in the new implementation a population offset can be included in the GLM by setting \code{populationBool} to \code{TRUE} and supplying the possibly time-varying population size in the \code{population} slot of the \code{sts} object, but this will not be discussed further here. In a second step, the expected number of counts $\mu_{t_0}$ is predicted for the current timepoint $t_0$ using this GLM. An upperbound $U_{t_0}$ is calculated based on this predicted value and its variance. The two versions of the algorithm make different assumptions for this calculation. -The +The original method assumes that a transformation of the prediction error $g\left(y_{t_0}-\hat{\mu}_{t_0}\right)$ is normally distributed, for instance when using the identity transformation $g(x)=x$ one obtains $$y_{t_0} - \hat{\mu}_0 \sim \mathcal{N}(0,\Var(y_{t_0}-\hat{\mu}_0))\cdot$$ -The upperbound of the prediction interval is then calculated +The upperbound of the prediction interval is then calculated based on this distribution. First we have that $$ \Var(y_{t_0}-\hat{\mu}_{t_0}) = \Var(\hat{y}_{t_0}) + \Var(\hat{\mu}_{t_0})=\phi\mu_0+\Var(\hat{\mu}_{t_0}) $$ with $\Var(\hat{y}_{t_0})$ being the variance of an observation and $\Var(\hat{\mu}_{t_0})$ being the variance of the estimate. The threshold, defined as the upperbound of a one-sided $(1-\alpha)\cdot 100\%$ prediction interval, is then $$U_{t_0} = \hat{\mu}_0 + z_{1-\alpha}\widehat{\Var}(y_{t_0}-\hat{\mu}_{t_0})\cdot$$ This method can be used by setting the control option \code{thresholdMethod} equal to "\code{delta}". - However, a weakness -of this procedure is the normality assumption itself, so that an alternative was presented in \citet{Noufaily2012} and implemented as \code{thresholdMethod="Noufaily"}. -The central assumption of this approach is that + However, a weakness +of this procedure is the normality assumption itself, so that an alternative was presented in \citet{Noufaily2012} and implemented as \code{thresholdMethod="Noufaily"}. 
+The central assumption of this approach is that $y_{t_0} \sim \NB\left(\mu_{t_0},\nu\right)$, -with $\mu_{t_0}$ the mean of the distribution and $\nu=\frac{\mu_{t_0}}{\phi-1}$ its overdispersion parameter. In this parameterization, we still have $\E(y_t)=\mu_t$ and $\Var(y_t)=\phi\cdot\mu_t$ with $\phi>1$ -- otherwise a Poisson distribution is assumed for the -observed count. +with $\mu_{t_0}$ the mean of the distribution and $\nu=\frac{\mu_{t_0}}{\phi-1}$ its overdispersion parameter. In this parameterization, we still have $\E(y_t)=\mu_t$ and $\Var(y_t)=\phi\cdot\mu_t$ with $\phi>1$ -- otherwise a Poisson distribution is assumed for the +observed count. The threshold is defined as a quantile of the negative binomial distribution with plug-in estimates $\hat{\mu}_{t_0}$ and $\hat{\phi}$. Note that this disregards the estimation uncertainty in $\hat{\mu}_{t_0}$ and $\hat{\phi}$. As a consequence, the method "\code{muan}" (\textit{mu} for $\mu$ and \textit{an} for asymptotic normal) tries to solve the problem by using the asymptotic normal distribution of $(\hat{\alpha},\hat{\beta})$ to derive the upper $(1-\alpha)\cdot 100\%$ quantile of the asymptotic normal distribution of $\hat{\mu}_{t_0}=\hat{\alpha}+\hat{\beta}t_0$. Note that this does not reflect all estimation uncertainty because it disregards the estimation uncertainty of $\hat{\phi}$. -Note also that for time series where the variance of the estimator is large, the upperbound also ends up being very large. +Note also that for time series where the variance of the estimator is large, the upperbound also ends up being very large. Thus, the method "\code{nbPlugin}" seems to provide information that is easier to interpret by epidemiologists but with "\code{muan}" being more statistically correct. In a last step, the observed count $y_{t_0}$ is compared to the upperbound $U_{t_0}$ and an alarm is raised if $y_{t_0} > U_{t_0}$. -In both cases the fitting of the GLM involves three important steps. First, the algorithm performs an optional power-transformation for skewness correction and variance stabilisation, +In both cases the fitting of the GLM involves three important steps. First, the algorithm performs an optional power-transformation for skewness correction and variance stabilisation, depending on the value of the parameter \code{powertrans} in the \code{control} slot. Then, the significance of the time trend is checked. The time trend is included only when significant at a chosen level \code{pThresholdTrend}, when there are more than three years reference data and if no overextrapolation occurs because of the time trend. Lastly, past outbreaks are reweighted based on their Anscombe residuals. In \code{farringtonFlexible} the limit for reweighting past counts, \code{weightsThreshold}, can be specified by the user. If the Anscombe residual of a count is higher than \code{weightsThreshold} it is reweighted accordingly in a second fitting of the GLM. \citet{farrington96} used a value of $1$ -whereas \citet{Noufaily2012} advise a value of $2.56$ so that the reweighting procedure is less drastic, because it also shrinks the variance of the observations. +whereas \citet{Noufaily2012} advise a value of $2.56$ so that the reweighting procedure is less drastic, because it also shrinks the variance of the observations. The original method is widely used in public health surveillance~\citep{hulth_etal2010}. The reason for its success is primarily - that it does not need to be fine-tuned for each specific pathogen. 
It is hence easy to implement it for scanning data for many different pathogens. Furthermore, it does tackle classical issues of surveillance data: overdispersion, presence of past outbreaks that are reweighted, + that it does not need to be fine-tuned for each specific pathogen. It is hence easy to implement it for scanning data for many different pathogens. Furthermore, it does tackle classical issues of surveillance data: overdispersion, presence of past outbreaks that are reweighted, seasonality that is taken into account differently in the two methods. An example of use of the function is shown in Figure~\ref{fig:newportFar} with the code below. <>= -salm.farrington <- farringtonFlexible(salmNewportGermany, control1) +salm.farrington <- farringtonFlexible(salmNewportGermany, control1) salm.noufaily <- farringtonFlexible(salmNewportGermany, control2) @ @@ -544,23 +544,23 @@ \hspace{-1em} %\begin{center} \subfloat[]{ -<>= +<>= # Plot y.max <- max(observed(salm.farrington),upperbound(salm.farrington),observed(salm.noufaily),upperbound(salm.noufaily),na.rm=TRUE) do.call("plot",modifyList(plotOpts,list(x=salm.farrington,ylim=c(0,y.max)))) @ - + \includegraphics[width=9cm]{plots/monitoringCounts-farPlot1.pdf} } \hspace{-3em} \subfloat[]{ -<>= +<>= # Plot do.call("plot",modifyList(plotOpts,list(x=salm.noufaily,ylim=c(0,y.max)))) -@ +@ \includegraphics[width=9cm]{plots/monitoringCounts-farPlot2.pdf} } -\caption{S. Newport in Germany in 2011 monitored by (a) the original method and (b) the improved method. +\caption{S. Newport in Germany in 2011 monitored by (a) the original method and (b) the improved method. For the figure we turned off the option that the threshold is only computed if there were more than 5 cases during the 4 last timepoints including $t_0$. One gets less alarms with the most recent method and still does not miss the outbreak in the summer. Simulations on more time series support the use of the improved method instead of the original method.} @@ -571,48 +571,48 @@ \subsubsection{Similar methods in the package} The package also contains further methods based on a subset of the historical data: \code{bayes}, \code{rki} and \code{cdc}. -See Table~\ref{table:ref} for the corresponding references. Here, \code{bayes} uses a simple conjugate prior-posterior approach and computes the parameters of a +See Table~\ref{table:ref} for the corresponding references. Here, \code{bayes} uses a simple conjugate prior-posterior approach and computes the parameters of a negative binomial distribution based on past values. The procedure \code{rki} makes either the assumption of a normal or a Poisson distribution based on the mean of past counts. Finally, \code{cdc} aggregates weekly data into 4-week-counts and computes a normal distribution based upper confidence interval. None of these methods offer the inclusion of a linear trend, down-weighting of past outbreaks or power transformation of the data. Although these methods are good to have at hand, we personally recommend the use of the improved method implemented in the function \code{farringtonFlexible} because it is rather fast and makes use of more historical data than the other methods. \subsection{A Bayesian refinement} -The \code{farringtonFlexible} function described previously +The \code{farringtonFlexible} function described previously was a first indication that the \textit{monitoring} of surveillance time series requires a good \textit{modeling} of the time series before assessing aberrations. 
Generalized linear models (GLMs) and generalized additive models (GAMs) are well-established and powerful modeling frameworks for handling the -count data nature and trends of time series in a regression context. +count data nature and trends of time series in a regression context. The \code{boda} procedure~\citep{Manitz2013} continues this line of thinking by extending the simple GLMs used in the \code{farrington} and \code{farringtonFlexible} procedures to a fully fledged Bayesian GAM allowing -for penalized splines, e.g., to describe trends and seasonality, while +for penalized splines, e.g., to describe trends and seasonality, while simultaneously adjusting for previous outbreaks or concurrent processes influencing the case counts. A particular advantage of the Bayesian approach is that it constitutes a seamless -framework for performing both estimation and subsequent prediction: the +framework for performing both estimation and subsequent prediction: the uncertainty in parameter estimation is directly carried forward to the predictive posterior distribution. Neither asymptotic normal approximations nor plug-in inference -is needed. For fast approximate Bayesian inference we use the \pkg{INLA} \proglang{R} -package~\citep{INLA} to fit the Bayesian GAM. +is needed. For fast approximate Bayesian inference we use the \pkg{INLA} \proglang{R} +package~\citep{INLA} to fit the Bayesian GAM. -Still, monitoring with +Still, monitoring with \code{boda} is substantially slower than using the Farrington procedures. Furthermore, detailed regression modeling is only meaningful if the time series is known to be subject to external influences on which information is available. -Hence, the typical use at a public health institution would be the +Hence, the typical use at a public health institution would be the detailed analysis of a few selected time series, e.g., critical ones or those -with known trend character. +with known trend character. As an example, \citet{Manitz2013} studied the influence of absolute -humidity on the occurrence of weekly reported campylobacter cases in Germany. +humidity on the occurrence of weekly reported campylobacter cases in Germany. <>= # Load data and create \code{sts}-object data("campyDE") -cam.sts <- new("sts",epoch=as.numeric(campyDE$date), - observed=campyDE$case, state=campyDE$state, +cam.sts <- new("sts",epoch=as.numeric(campyDE$date), + observed=campyDE$case, state=campyDE$state, epochAsDate=TRUE) par(las=1) # Plot @@ -622,21 +622,21 @@ do.call("plot",plotOpts3) par(las=0) #mtext(side=2,text="No. of reports", - # las=0,line=3, cex=cex.text,family="Times") + # las=0,line=3, cex=cex.text,family="Times") par(family="Times") -text(-20, 2600, "No. of\n reports", pos = 3, xpd = T,cex=cex.text) +text(-20, 2600, "No. 
of\n reports", pos = 3, xpd = T,cex=cex.text) text(510, 2900, "Absolute humidity", pos = 3, xpd = T,cex=cex.text) text(510, 2550, expression(paste("[",g/m^3,"]", sep='')), pos = 3, xpd = T,cex=cex.text) -lines(campyDE$hum*50, col="white", lwd=2) +lines(campyDE$hum*50, col="white", lwd=2) axis(side=4, at=seq(0,2500,by=500),labels=seq(0,50,by=10),las=1,cex.lab=cex.text, cex=cex.text,cex.axis=cex.text,pos=length(epoch(cam.sts))+20) #mtext(side=4,text=expression(paste("Absolute humidity [ ",g/m^3,"]", sep='')), - # las=0,line=1, cex=cex.text,family="Times") + # las=0,line=1, cex=cex.text,family="Times") @ \setkeys{Gin}{height=7cm, width=15cm} \begin{figure} -\begin{center} +\begin{center} <>= <> @@ -652,27 +652,27 @@ data("campyDE") cam.sts <- new("sts", epoch = as.numeric(campyDE$date), observed = campyDE$case, state = campyDE$state, - epochAsDate = TRUE) + epochAsDate = TRUE) plot(cam.sts, legend = NULL, xlab = "time [weeks]", ylab = "No. reported", col = "gray", cex = 2, cex.axis = 2, cex.lab = 2) lines(campyDE$hum * 50, col = "darkblue", lwd = 2) @ The corresponding plot of the weekly time series is shown in -Figure~\ref{fig:campyDE}. We observe a strong association between humidity -and case numbers - an +Figure~\ref{fig:campyDE}. We observe a strong association between humidity +and case numbers - an association which is stronger than with, e.g., temperature or relative humidity. As noted in \citet{Manitz2013} the excess in cases in 2007 is thus partly explained by the high atmospheric humidity. - Furthermore, an increase in case numbers during the 2011 STEC O104:H4 outbreak is observed, which is explained by increased awareness and testing of many gastroenteritits pathogens during that period. The hypothesis is thus that there is no actual increased disease -activity~\citep{bernard_etal2014}. -Unfortunately, the German reporting system only records positive test results without keeping track of the -number of actual tests performed -- otherwise this would have been a natural adjustment variable. Altogether, the series contains several artefacts which + Furthermore, an increase in case numbers during the 2011 STEC O104:H4 outbreak is observed, which is explained by increased awareness and testing of many gastroenteritits pathogens during that period. The hypothesis is thus that there is no actual increased disease +activity~\citep{bernard_etal2014}. +Unfortunately, the German reporting system only records positive test results without keeping track of the +number of actual tests performed -- otherwise this would have been a natural adjustment variable. Altogether, the series contains several artefacts which appear prudent to address when monitoring the campylobacteriosis series. -The GAM in \code{boda} is based on the negative binomial distribution with time-varying expectation and time constant overdispersion parameter, i.e., +The GAM in \code{boda} is based on the negative binomial distribution with time-varying expectation and time constant overdispersion parameter, i.e., \begin{align*} -y_t &\sim \operatorname{NB}(\mu_t,\nu) -\end{align*} +y_t &\sim \operatorname{NB}(\mu_t,\nu) +\end{align*} with $\mu_{t}$ the mean of the distribution and $\nu$ the dispersion parameter~\citep{lawless1987}. Hence, we have $\E(y_t)=\mu_t$ and $\Var(y_t)=\mu_t\cdot(1+\mu_t/\nu)$. The linear predictor is given by \begin{align*} \log(\mu_t) &= \alpha_{0t} + \beta t + \gamma_t + \bm{x}_t^\top \bm{\delta} + \xi z_t, \quad t=1,\ldots,t_0. 
@@ -680,22 +680,22 @@ Here, the time-varying intercept $\alpha_{0t}$ is described by a penalized spline (e.g., first or second order random walk) and $\gamma_t$ denotes a periodic penalized -spline (as implemented in \code{INLA}) with period equal to the periodicity +spline (as implemented in \code{INLA}) with period equal to the periodicity of the data. Furthermore, $\beta$ characterizes the effect of a possible linear trend (on the log-scale) and $\xi$ is the effect of previous outbreaks. Typically, $z_t$ is -a zero-one process denoting whether there was an outbreak in week $t$, but more involved adaptive and non-binary forms are imaginable. Finally, +a zero-one process denoting whether there was an outbreak in week $t$, but more involved adaptive and non-binary forms are imaginable. Finally, $\bm{x}_t$ denotes a vector of possibly time-varying covariates, which influence the expected number of cases. Data from timepoints $1,\ldots,t_0-1$ are now used to determine the posterior distribution of all model parameters and subsequently the posterior predictive distribution of $y_{t_0}$ is computed. If the actual observed value of $y_{t_0}$ is above the $(1-\alpha)\cdot 100\%$ quantile of the predictive posterior distribution an alarm is flagged for $t_0$. -Below we illustrate the use +Below we illustrate the use of \code{boda} to monitor the campylobacteriosis time series from 2007. -In the first case we include in the model for $\log\left(\mu_t\right)$ penalized splines for trend and +In the first case we include in the model for $\log\left(\mu_t\right)$ penalized splines for trend and seasonality and a simple linear trend. <>= rangeBoda <- which(epoch(cam.sts) >= as.Date("2007-01-01")) control.boda <- list(range = rangeBoda, X = NULL, trend = TRUE, - season = TRUE, prior = "iid", alpha = 0.025, + season = TRUE, prior = "iid", alpha = 0.025, mc.munu = 10000, mc.y = 1000, samplingMethod = "marginals") boda <- boda(cam.sts, control = control.boda) @@ -706,7 +706,7 @@ if (computeALL) { library("INLA") control.boda <- list(range=rangeBoda, X=NULL, trend=TRUE, - season=TRUE, prior='rw1', alpha=0.025, + season=TRUE, prior='rw1', alpha=0.025, mc.munu=10000, mc.y=1000, samplingMethod = "marginals") # boda without covariates: trend + spline + periodic spline @@ -719,18 +719,18 @@ In the second case we instead use only penalized and linear trend components, and, furthermore, include as covariates lags 1--4 of the absolute humidity as well -as zero-one indicators for $t_0$ belonging to the last two weeks +as zero-one indicators for $t_0$ belonging to the last two weeks (\code{christmas}) or first two weeks (\code{newyears}) of the year, -respectively. The latter two variables are needed because there is a +respectively. The latter two variables are needed because there is a systematically changed reporting behavior at the turn of the year (cf.\ Figure~\ref{fig:campyDE}). Finally, \code{O104period} is an indicator variable for whether the reporting week belongs to the W21--W30 2011 period of increased awareness during the O104:H4 STEC outbreak. No additional correction for past outbreaks is made. 
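The \code{campyDE} data set already provides the \code{christmas} and \code{newyears} columns used in the chunk below; for a data set lacking them, the following is a minimal sketch of how such zero-one indicators could be derived from the ISO week (the helper object \code{wk} and the simplification of ignoring 53-week years are ours, not part of the vignette):
<<eval=FALSE>>=
## Hypothetical helper code, not from the vignette: derive turn-of-the-year
## indicators from the ISO week of each timepoint (53-week years are ignored).
wk <- isoWeekYear(epoch(cam.sts))$ISOWeek
christmas <- as.numeric(wk %in% c(51, 52))  # last two weeks of the year
newyears  <- as.numeric(wk %in% c(1, 2))    # first two weeks of the year
@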
<>= -covarNames <- c("l1.hum", "l2.hum", "l3.hum", "l4.hum", +covarNames <- c("l1.hum", "l2.hum", "l3.hum", "l4.hum", "newyears", "christmas", "O104period") -control.boda2 <- modifyList(control.boda, +control.boda2 <- modifyList(control.boda, list(X = campyDE[, covarNames], season = FALSE)) boda.covars <- boda(cam.sts, control = control.boda2) @ @@ -739,7 +739,7 @@ # boda with covariates: trend + spline + lagged hum + indicator variables covarNames <- c(paste("l",1:4,".hum",sep=""),"newyears","christmas", "O104period") -control.boda2 <- modifyList(control.boda, +control.boda2 <- modifyList(control.boda, list(X=campyDE[,covarNames],season=FALSE)) boda.covars <- boda(cam.sts, control=control.boda2) save(boda.covars, file = "monitoringCounts-cache/boda.covars.RData") @@ -748,25 +748,25 @@ } @ -We plot \code{boda.covars} in Figure~\ref{fig:b} and compare the output of the two boda calls with the output of -\code{farrington}, \code{farringtonFlexible} and \code{bayes} in -Figure~\ref{fig:alarmplot}. +We plot \code{boda.covars} in Figure~\ref{fig:b} and compare the output of the two boda calls with the output of +\code{farrington}, \code{farringtonFlexible} and \code{bayes} in +Figure~\ref{fig:alarmplot}. <>= cam.surv <- combineSTS(list(boda.covars=boda.covars,boda=boda,bayes=bayes, - farrington=far,farringtonFlexible=farflex)) + farrington=far,farringtonFlexible=farflex)) plot(cam.surv,type = alarm ~ time) @ Note here that the \code{bayes} procedure is not really useful as the adjustment for seasonality only works -poorly. Moreover, we think that this method produces many false alarms for this time series because it disregards the increasing time trend in number of +poorly. Moreover, we think that this method produces many false alarms for this time series because it disregards the increasing time trend in number of reported cases. Furthermore, it becomes clear that the improved Farrington procedure acts similar to the original procedure, but the improved reweighting and trend inclusion produces fewer alarms. The \code{boda} method is to be seen as a step towards more Bayesian thinking in aberration detection. However, besides its time demands for a detailed modeling, the speed of the procedure is also prohibitive as regards routine application. As a response~\citet{Maelle} introduce a method which has two advantages: it allows to adjust outbreak detection for reporting delays and includes an approximate inference method much faster than the INLA inference method. However, its linear predictor is more in the style of~\citet{Noufaily2012} not allowing for additionnal covariates or penalized options for the intercept. 
-<>= +<>= # Plot with special function y.max <- max(observed(boda.covars),upperbound(boda.covars),na.rm=TRUE) plotOpts2 <- modifyList(plotOpts,list(x=boda.covars,ylim=c(0,y.max)),keep.null=TRUE) @@ -775,8 +775,8 @@ @ \setkeys{Gin}{height=7cm, width=15cm} \begin{figure} -\begin{center} - +\begin{center} + <>= <> @ @@ -820,7 +820,7 @@ cam.surv <- combineSTS(list(boda.covars=boda.covars,boda=boda,bayes=bayes, farrington=far,farringtonFlexible=farflex)) -par(mar=c(4,8,2.1,2),family="Times") +par(mar=c(4,8,2.1,2),family="Times") plot(cam.surv,type = alarm ~ time,lvl=rep(1,ncol(cam.surv)), alarm.symbol=list(pch=17, col="red2", cex=1,lwd=3), cex.axis=1,xlab="Time (weeks)",cex.lab=1,xaxis.tickFreq=list("%m"=atChange,"%G"=atChange),xaxis.labelFreq=list("%G"=at2ndChange), @@ -829,8 +829,8 @@ \setkeys{Gin}{height=7cm, width=16cm} \begin{figure} -\begin{center} - +\begin{center} + <>= <> @@ -841,33 +841,33 @@ \end{figure} \subsection{Beyond one-timepoint detection} -GLMs as used in the Farrington method are suitable for the purpose of aberration detection since they allow a regression approach for adjusting counts for known phenomena such as trend or seasonality in surveillance data. -Nevertheless, the Farrington method only performs one-timepoint detection. In some contexts it can be more relevant -to detect sustained shifts early, e.g., an outbreak could be characterized at first by counts slightly higher than usual -in subsequent weeks without each weekly count being flagged by one-timepoint detection methods. Control charts inspired by statistical process control (SPC) -e.g., cumulative sums would allow the detection of sustained shifts. Yet they were not tailored to the specific characteristics of surveillance data such as overdispersion or seasonality. +GLMs as used in the Farrington method are suitable for the purpose of aberration detection since they allow a regression approach for adjusting counts for known phenomena such as trend or seasonality in surveillance data. +Nevertheless, the Farrington method only performs one-timepoint detection. In some contexts it can be more relevant +to detect sustained shifts early, e.g., an outbreak could be characterized at first by counts slightly higher than usual +in subsequent weeks without each weekly count being flagged by one-timepoint detection methods. Control charts inspired by statistical process control (SPC) +e.g., cumulative sums would allow the detection of sustained shifts. Yet they were not tailored to the specific characteristics of surveillance data such as overdispersion or seasonality. The method presented in \citet{hoehle.paul2008} conducts a synthesis of both worlds, i.e., traditional surveillance methods and SPC. The method is implemented in the package as the function \code{glrnb}, whose use is explained here. \subsubsection{Definition of the control chart} For the control chart, two distributions are defined, one for each of the two states \textit{in-control} and \textit{out-of-control}, whose likelihoods are compared at each time step. The \textit{in-control} distribution -$f_{\bm{\theta}_0}(y_t|\bm{z}_t)$ with the covariates $\bm{z}_t$ is estimated by a GLM of the Poisson or negative binomial family with a log link, depending on the overdispersion of the data. +$f_{\bm{\theta}_0}(y_t|\bm{z}_t)$ with the covariates $\bm{z}_t$ is estimated by a GLM of the Poisson or negative binomial family with a log link, depending on the overdispersion of the data. 
In this context, the standard model for the \textit{in-control} mean is -$$\log \mu_{0,t}=\beta_0+\beta_1t+\sum_{s=1}^S\left[\beta_{2s}\cos \left(\frac{2\pi s t}{\mathtt{Period}}\right)+\beta_{2s+1}\sin \left(\frac{2\pi s t}{\mathtt{Period}}\right)\right] $$ -where $S$ is the number of harmonic waves to use and \texttt{Period} is the period of the data as indicated in the \code{control} slot, for instance 52 for weekly data. -However, more flexible linear predictors, e.g., containing splines, concurrent covariates -or an offset could be used on the right hand-side of the equation. +$$\log \mu_{0,t}=\beta_0+\beta_1t+\sum_{s=1}^S\left[\beta_{2s}\cos \left(\frac{2\pi s t}{\mathtt{Period}}\right)+\beta_{2s+1}\sin \left(\frac{2\pi s t}{\mathtt{Period}}\right)\right] $$ +where $S$ is the number of harmonic waves to use and \texttt{Period} is the period of the data as indicated in the \code{control} slot, for instance 52 for weekly data. +However, more flexible linear predictors, e.g., containing splines, concurrent covariates +or an offset could be used on the right hand-side of the equation. The GLM could therefore be made very similar -to the one used by~\citet{Noufaily2012}, with reweighting of past outbreaks and various criteria for including the time trend. - -The parameters of the \textit{in-control} and \textit{out-of-control} models +to the one used by~\citet{Noufaily2012}, with reweighting of past outbreaks and various criteria for including the time trend. + +The parameters of the \textit{in-control} and \textit{out-of-control} models are respectively given by $\bm{\theta}_0$ and $\bm{\theta}_1$. -The \textit{out-of-control} mean is defined as a function of the \textit{in-control} mean, either with a multiplicative shift (additive on the log-scale) whose size $\kappa$ can be given as an input +The \textit{out-of-control} mean is defined as a function of the \textit{in-control} mean, either with a multiplicative shift (additive on the log-scale) whose size $\kappa$ can be given as an input or reestimated at each timepoint $t>1$, $\mu_{1,t}=\mu_{0,t}\cdot \exp(\kappa)$, or with an unknown autoregressive component as in \citet{held-etal-2005}, $\mu_{1,t}=\mu_{0,t}+\lambda y_{t-1}$ with unknown $\lambda>0$. -In \code{glrnb}, timepoints are divided into two intervals: phase 1 and phase 2. The \textit{in-control} mean and overdispersion are estimated with a GLM fitted on phase 1 data, whereas surveillance operates on phase 2 data. +In \code{glrnb}, timepoints are divided into two intervals: phase 1 and phase 2. The \textit{in-control} mean and overdispersion are estimated with a GLM fitted on phase 1 data, whereas surveillance operates on phase 2 data. When $\lambda$ is fixed, one uses a likelihood-ratio (LR) and defines the stopping time for alarm as $$N=\min \left\{ t_0 \geq 1 : \max_{1\leq t \leq t_0} \left[ \sum_{s=t}^{t_0} \log\left\{ \frac{f_{\bm{\theta}_1}(y_s|\bm{z}_s)}{f_{\bm{\theta}_0}(y_s|\bm{z}_s)} \right\} \right] \geq \mathtt{c.ARL} \right\},$$ @@ -882,24 +882,24 @@ For using \code{glrnb} one has two choices to make. First, one has to choose an \textit{in-control} model that will be fitted on phase 1 data. One can either provide the predictions for the vector of \textit{in-control} means \code{mu0} and the overdispersion parameter \code{alpha} by relying on an external fit, or use the built-in GLM estimator, that will use all data before the beginning of the surveillance range to fit a GLM with the number of harmonics \code{S} and a time trend if \code{trend} is \code{TRUE}. 
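As an illustration of the first option, the following sketch (not part of the vignette; the helper objects \code{dat}, \code{fit.nb}, \code{mu0.external}, \code{alpha.external} and the assumption that \code{glrnb} parameterizes the variance as $\mu + \mathtt{alpha}\cdot\mu^2$ are ours) shows how \code{mu0} and \code{alpha} could be obtained from an external negative binomial fit with a linear trend and one harmonic:
<<eval=FALSE>>=
## Sketch of an external in-control fit for glrnb (hypothetical helper objects).
library("MASS")                               # provides glm.nb
t.all <- seq_len(nrow(salmNewportGermany))
dat <- data.frame(y = as.vector(observed(salmNewportGermany)), t = t.all,
                  sin1 = sin(2 * pi * t.all / 52), cos1 = cos(2 * pi * t.all / 52))
fit.nb <- glm.nb(y ~ t + sin1 + cos1, data = dat[seq_len(min(in2011) - 1), ])
mu0.external <- predict(fit.nb, newdata = dat[in2011, ], type = "response")
## assuming Var(y) = mu + alpha * mu^2 in glrnb, alpha corresponds to 1/theta of glm.nb
alpha.external <- 1 / fit.nb$theta
## mu0.external and alpha.external could then be passed as control$mu0 and
## control$alpha instead of relying on the built-in GLM estimator.
@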
-The choice of the exact \textit{in-control} model depends on the data under surveillance. Performing model selection is a compulsory step in practical applications. Then, one needs to tune the surveillance function itself, for one of the two possible change forms, \code{intercept}~or~\code{epi}.~One~can choose either to set \code{theta} to a given value and thus perform LR instead of GLR. The value of \code{theta} has to be adapted to -the specific context in which the algorithm is applied: how big are shifts one wants to detect optimally? Is it better not to specify any and use GLR instead? +The choice of the exact \textit{in-control} model depends on the data under surveillance. Performing model selection is a compulsory step in practical applications. Then, one needs to tune the surveillance function itself, for one of the two possible change forms, \code{intercept}~or~\code{epi}.~One~can choose either to set \code{theta} to a given value and thus perform LR instead of GLR. The value of \code{theta} has to be adapted to +the specific context in which the algorithm is applied: how big are shifts one wants to detect optimally? Is it better not to specify any and use GLR instead? The threshold \texttt{c.ARL} also has to be specified by the user. As explained in \citet{hoehle-mazick-2010} one can compute the threshold for a desired run-length in control through direct Monte Carlo simulation or a Markov chain approximation. Lastly, as mentioned in -\citet{hoehle.paul2008}, a window-limited approach of surveillance, instead of looking at all the timepoints until the first observation, can make computation faster. +\citet{hoehle.paul2008}, a window-limited approach of surveillance, instead of looking at all the timepoints until the first observation, can make computation faster. -Here we apply \code{glrnb} to the time series of report counts of \textit{Salmonella Newport} in Germany by assuming a known multiplicative shift of factor $2$ and by using the built-in estimator to fit an \textit{in-control} model with one harmonic for -seasonality and a trend. This model will be refitted after each alarm, but first we use data from the years before 2011 as reference or \code{phase1}, -and the data from 2011 as data to be monitored or \code{phase2}. The threshold \texttt{c.ARL} was chosen to be 4 as we found with the same approach as \citet{hoehle-mazick-2010} that it made the probability of a false alarm within one year +Here we apply \code{glrnb} to the time series of report counts of \textit{Salmonella Newport} in Germany by assuming a known multiplicative shift of factor $2$ and by using the built-in estimator to fit an \textit{in-control} model with one harmonic for +seasonality and a trend. This model will be refitted after each alarm, but first we use data from the years before 2011 as reference or \code{phase1}, +and the data from 2011 as data to be monitored or \code{phase2}. The threshold \texttt{c.ARL} was chosen to be 4 as we found with the same approach as \citet{hoehle-mazick-2010} that it made the probability of a false alarm within one year smaller than 0.1. Figure~\ref{fig:glrnb}~shows the results of this monitoring. 
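The calibration of \texttt{c.ARL} itself is not shown in the vignette code; as a rough sketch, the probability of at least one false alarm within 2011 for a candidate threshold could be approximated by direct Monte Carlo simulation as below (this reuses the hypothetical \code{fit.nb} and \code{mu0.external} from the sketch above, and \code{simFalseAlarm} is our helper, not a package function):
<<eval=FALSE>>=
## Sketch: estimate P(at least one false alarm in 2011) for a candidate c.ARL by
## simulating in-control counts for 2011 from the external negative binomial fit.
simFalseAlarm <- function(cARL, nSim = 100) {
  ctrl <- list(range = in2011, c.ARL = cARL, theta = log(2), ret = "cases",
               mu0 = list(S = 1, trend = TRUE, refit = FALSE))
  mean(replicate(nSim, {
    stsSim <- salmNewportGermany
    observed(stsSim)[in2011, ] <- rnbinom(length(in2011), mu = mu0.external,
                                          size = fit.nb$theta)
    any(alarms(glrnb(stsSim, control = ctrl))[, 1] == 1)
  }))
}
simFalseAlarm(4)  # estimated false alarm probability for c.ARL = 4
@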
-<>= +<>= phase1 <- which(isoWeekYear(epoch(salmNewportGermany))$ISOYear < 2011) phase2 <- in2011 control = list(range = phase2, c.ARL = 4, theta = log(2), ret = "cases", - mu0 = list(S = 1, trend = TRUE, refit = FALSE)) + mu0 = list(S = 1, trend = TRUE, refit = FALSE)) salmGlrnb <- glrnb(salmNewportGermany, control = control) @ <>= @@ -912,18 +912,18 @@ trend=TRUE, refit=FALSE),c.ARL = 4, theta=log(2),ret="cases") -# Perform monitoring with glrnb +# Perform monitoring with glrnb salmGlrnb <- glrnb(salmNewportGermany,control=control) @ -<>= +<>= # Plot y.max <- max(observed(salmGlrnb),upperbound(salmGlrnb),na.rm=TRUE) -do.call("plot",modifyList(plotOpts,list(x=salmGlrnb,ylim=c(0,y.max)))) +do.call("plot",modifyList(plotOpts,list(x=salmGlrnb,ylim=c(0,y.max)))) @ \setkeys{Gin}{height=7cm, width=15cm} \begin{figure} -\begin{center} - +\begin{center} + <>= <> @@ -934,16 +934,16 @@ \label{fig:glrnb} \end{figure} -The implementation of \code{glrnb} on individual time series was already thoroughly explained in \citet{hoehle-mazick-2010}. Our objective in the present document is rather to provide practical tips -for the implementation of this function on huge amounts of data in public health surveillance applications. Issues of computational speed become very significant in such a context. Our proposal -to reduce the computational burden incurred by this algorithm is -to compute the \textit{in-control} model for each time serie (pathogen, subtype, subtype in a given location, etc.) only once a year and to use this estimation for the computation of a threshold for each time series. - An idea to avoid starting with an initial value of zero in the CUSUM is to use either $\left(\frac{1}{2}\right)\cdot\mathtt{c.ARL}$ as a starting value (fast initial response - CUSUM as presented in~\citet{lucas1982fast}) or to let surveillance run with the new \textit{in-control} model during +The implementation of \code{glrnb} on individual time series was already thoroughly explained in \citet{hoehle-mazick-2010}. Our objective in the present document is rather to provide practical tips +for the implementation of this function on huge amounts of data in public health surveillance applications. Issues of computational speed become very significant in such a context. Our proposal +to reduce the computational burden incurred by this algorithm is +to compute the \textit{in-control} model for each time serie (pathogen, subtype, subtype in a given location, etc.) only once a year and to use this estimation for the computation of a threshold for each time series. + An idea to avoid starting with an initial value of zero in the CUSUM is to use either $\left(\frac{1}{2}\right)\cdot\mathtt{c.ARL}$ as a starting value (fast initial response + CUSUM as presented in~\citet{lucas1982fast}) or to let surveillance run with the new \textit{in-control} model during a buffer period and use the resulting CUSUM as an initial value. One could also choose the maximum of these two possible starting values as a starting value. - During the buffer period alarms would be generated with the old model. Lastly, using GLR is much more computationally intensive than using LR, - whereas LR performs reasonably well on shifts different from the one indicated by \code{theta} as seen in the simulation studies of~\citet{hoehle.paul2008}. Our advice would therefore be to use LR with a reasonable predefined \code{theta}. 
- The amount of historical data used each year to update the model, the length of the buffer period and the value of \code{theta} have to be fixed for each specific application, e.g., using simulations and/or + During the buffer period alarms would be generated with the old model. Lastly, using GLR is much more computationally intensive than using LR, + whereas LR performs reasonably well on shifts different from the one indicated by \code{theta} as seen in the simulation studies of~\citet{hoehle.paul2008}. Our advice would therefore be to use LR with a reasonable predefined \code{theta}. + The amount of historical data used each year to update the model, the length of the buffer period and the value of \code{theta} have to be fixed for each specific application, e.g., using simulations and/or discussion with experts. \subsubsection{Similar methods in the package} @@ -953,33 +953,33 @@ The package also includes a semi-parametric method \code{outbreakP} that aims at detecting changes from a constant level to a monotonically increasing incidence, for instance the beginning of the influenza season. See Table~\ref{table:ref} for the corresponding references. \subsection{A method for monitoring categorical data} -All monitoring methods presented up to now have been methods for analysing count data. Nevertheless, in public health surveillance one also encounters categorical time series -which are time series where the response variable takes one of $k\geq2$ different categories (nominal or ordinal). When $k=2$ the time series is binary, for instance representing -a specific outcome in cases such as hospitalization, death or a positive result of some diagnostic test. One can also think of applications with -$k>2$ if one studies, e.g., the age groups of the cases in the context of monitoring a vaccination program: vaccination targeted at children could induce a shift towards older cases which one wants to detect +All monitoring methods presented up to now have been methods for analysing count data. Nevertheless, in public health surveillance one also encounters categorical time series +which are time series where the response variable takes one of $k\geq2$ different categories (nominal or ordinal). When $k=2$ the time series is binary, for instance representing +a specific outcome in cases such as hospitalization, death or a positive result of some diagnostic test. One can also think of applications with +$k>2$ if one studies, e.g., the age groups of the cases in the context of monitoring a vaccination program: vaccination targeted at children could induce a shift towards older cases which one wants to detect as quickly as possible -- this will be explained thoroughly with an example. -The development of prospective surveillance methods for such categorical time series was until recently limited to CUSUM-based approaches for -binary data such as those explained in~\citet{Chen1978},~\citet{Reynolds2000} and~\citet{rogerson_yamada2004}. 
Other than being only suitable for binary data these methods have the drawback of not handling +overdispersion. A method improving on these two limitations while casting the problem into a more comprehensive GLM regression framework for categorical data was presented in~\citet{hoehle2010}. It is implemented as the function \code{categoricalCUSUM}. - + The way \code{categoricalCUSUM} operates is very similar to what \code{glrnb} does with a fixed \textit{out-of-control} parameter. -First, the parameters in a multivariate GLM for the \textit{in-control} distribution are estimated from the historical data. Then the \textit{out-of-control} distribution is defined by a given -change in the parameters of this GLM, e.g., an intercept change, as explained later. Lastly, prospective monitoring is performed on -current data using a likelihood ratio detector which compares the likelihood of the response under the \textit{in-control} and \textit{out-of-control} distributions. +First, the parameters in a multivariate GLM for the \textit{in-control} distribution are estimated from the historical data. Then the \textit{out-of-control} distribution is defined by a given +change in the parameters of this GLM, e.g., an intercept change, as explained later. Lastly, prospective monitoring is performed on +current data using a likelihood ratio detector which compares the likelihood of the response under the \textit{in-control} and \textit{out-of-control} distributions. \subsubsection{Categorical CUSUM for binomial models} -The challenge when performing these steps with categorical data from surveillance systems -is finding an appropriate model. Binary GLMs as presented in Chapter~6 of \citet{Fahrmeir.etal2013} could be a solution but they do not tackle well the inherent overdispersion in the binomial time series. -Of course one could choose a quasi family, but these are not proper statistical distributions, making many issues such as prediction complicated. A better alternative -is offered by the use of \textit{generalized additive models for location, scale and shape} \citep[GAMLSS,][]{Rigby2005}, which support distributions such as the beta-binomial distribution, suitable for overdispersed binary data. With GAMLSS one can model the dependency of the mean -- \textit{location} -- -upon explanatory variables but the regression modeling is also extended to other parameters of the distribution, e.g., scale. +The challenge when performing these steps with categorical data from surveillance systems +is finding an appropriate model. Binary GLMs as presented in Chapter~6 of \citet{Fahrmeir.etal2013} could be a solution but they do not tackle well the inherent overdispersion in the binomial time series. +Of course one could choose a quasi family, but these are not proper statistical distributions, making many issues such as prediction complicated. A better alternative +is offered by the use of \textit{generalized additive models for location, scale and shape} \citep[GAMLSS,][]{Rigby2005}, which support distributions such as the beta-binomial distribution, suitable for overdispersed binary data. With GAMLSS one can model the dependency of the mean -- \textit{location} -- +upon explanatory variables but the regression modeling is also extended to other parameters of the distribution, e.g., scale. 
Moreover any modelled parameter can be put under surveillance, be it the mean (as in the example later developed) or the time trend in the linear predictor of the mean. This very flexible modeling framework is implemented in \proglang{R} through the \pkg{gamlss} package~\citep{StasJSS}. As an example we consider the time series of the weekly number of hospitalized cases among all \textit{Salmonella} cases in Germany in Jan 2004--Jan 2014, depicted in -Figure~\ref{fig:cat1}. We use 2004--2012 data to estimate the \textit{in-control} parameters and then perform surveillance on the data from 2013 and early 2014. We start by preprocessing the data. +Figure~\ref{fig:cat1}. We use 2004--2012 data to estimate the \textit{in-control} parameters and then perform surveillance on the data from 2013 and early 2014. We start by preprocessing the data. <>= data("salmHospitalized") isoWeekYearData <- isoWeekYear(epoch(salmHospitalized)) @@ -988,13 +988,13 @@ data2013 <- which(isoWeekYearData$ISOYear == 2013) dataEarly2014 <- which(isoWeekYearData$ISOYear == 2014 & isoWeekYearData$ISOWeek <= 4) - + phase1 <- dataBefore2013 phase2 <- c(data2013, dataEarly2014) weekNumbers <- isoWeekYearData$ISOWeek salmHospitalized.df <- cbind(as.data.frame(salmHospitalized), weekNumbers) -colnames(salmHospitalized.df) <- c("y", "t", "state", "alarm", "n", +colnames(salmHospitalized.df) <- c("y", "t", "state", "alarm", "upperbound","n", "freq", "epochInPeriod", "weekNumber") @ <>= @@ -1008,52 +1008,52 @@ # Prepare data for fitting the model weekNumber <- isoWeekYear(epoch(salmHospitalized))$ISOWeek salmHospitalized.df <- cbind(as.data.frame(salmHospitalized),weekNumber) -colnames(salmHospitalized.df) <- c("y","t","state","alarm","n","freq", +colnames(salmHospitalized.df) <- c("y","t","state","alarm","upperbound","n","freq", "epochInPeriod","weekNumber") @ We assume that the number of hospitalized cases follows a beta-binomial distribution, i.e., -$ y_t \sim \BetaBin(n_t,\pi_t,\sigma_t)$ with $n_t$ the total number of reported cases at time $t$, $\pi_t$ the proportion of these cases that were hospitalized and $\sigma$ the dispersion parameter. In this +$ y_t \sim \BetaBin(n_t,\pi_t,\sigma_t)$ with $n_t$ the total number of reported cases at time $t$, $\pi_t$ the proportion of these cases that were hospitalized and $\sigma$ the dispersion parameter. In this parametrization, $$E(y_t)=n_t \pi_t,\quad \text{and}$$ $$\Var(y_t)=n_t \pi_t(1-\pi_t)\left( 1 + \frac{\sigma(n_t-1)}{\sigma+1} \right)\cdot$$ We choose to model the expectation $n_t \pi_t$ using a beta-binomial model with a logit-link which is a special case of a GAMLSS, i.e., $$\logit(\pi_t)=\bm{z}_t^\top\bm{\beta}$$ where $\bm{z}_t$ is a vector of possibly time-varying covariates and $\bm{\beta}$ a vector of covariate effects in our example. -The proportion of hospitalized cases -varies throughout the year as seen in Figure~\ref{fig:cat1}. -One observes that in the summer the proportion of hospitalized cases is smaller than in other seasons. However, over the holidays in December the proportion of hospitalized cases increases. - Note that the number of non-hospitalized cases drops while the number of hospitalized cases remains constant (data not shown): this might be explained by the fact that cases that are not serious enough to go to the hospital are not seen by general practitioners because sick workers do not need - a sick note during the holidays. 
Therefore, the \textit{in-control} model should contain these elements, as well as the fact that there is an +The proportion of hospitalized cases +varies throughout the year as seen in Figure~\ref{fig:cat1}. +One observes that in the summer the proportion of hospitalized cases is smaller than in other seasons. However, over the holidays in December the proportion of hospitalized cases increases. + Note that the number of non-hospitalized cases drops while the number of hospitalized cases remains constant (data not shown): this might be explained by the fact that cases that are not serious enough to go to the hospital are not seen by general practitioners because sick workers do not need + a sick note during the holidays. Therefore, the \textit{in-control} model should contain these elements, as well as the fact that there is an increasing trend of the proportion because GPs prescribe less and less stool diagnoses so that more diagnoses are done on hospitalized cases. -We choose a model with an intercept, a time trend, two harmonic terms and a factor variable for the first two weeks of -each year. The variable \code{epochInPeriod} takes into account the fact that not all years have 52 weeks. +We choose a model with an intercept, a time trend, two harmonic terms and a factor variable for the first two weeks of +each year. The variable \code{epochInPeriod} takes into account the fact that not all years have 52 weeks. <>= vars <- c( "y", "n", "t", "epochInPeriod", "weekNumber") m.bbin <- gamlss(cbind(y, n-y) ~ 1 + t - + sin(2 * pi * epochInPeriod) + cos(2 * pi * epochInPeriod) + + sin(2 * pi * epochInPeriod) + cos(2 * pi * epochInPeriod) + sin(4 * pi * epochInPeriod) + cos(4 * pi * epochInPeriod) - + I(weekNumber == 1) + I(weekNumber == 2), + + I(weekNumber == 1) + I(weekNumber == 2), sigma.formula =~ 1, family = BB(sigma.link = "log"), data = salmHospitalized.df[phase1, vars]) @ -The change we aim to detect is defined by a multiplicative change of odds, from $\frac{\pi_t^0}{(1-\pi_t^0)}$ to $R\cdot\frac{\pi_t^0}{(1-\pi_t^0)}$ with $R>0$, similar to what was done in~\citet{Steiner1999} for the logistic regression model. +The change we aim to detect is defined by a multiplicative change of odds, from $\frac{\pi_t^0}{(1-\pi_t^0)}$ to $R\cdot\frac{\pi_t^0}{(1-\pi_t^0)}$ with $R>0$, similar to what was done in~\citet{Steiner1999} for the logistic regression model. This is equivalent to an additive change of the log-odds, $$\logit(\pi_t^1)=\logit(\pi_t^0)+\log R$$ with $\pi_t^0$ being the \textit{in-control} proportion and $\pi_t^1$ the \textit{out-of-control} distribution. The likelihood ratio based CUSUM statistic is now defined as $$C_{t_0}=\max_{1\leq t \leq {t_0}}\left( \sum_{s=t}^{t_0} \log \left( \frac{f(y_s;\bm{z}_s,\bm{\theta}_1)}{f(y_s;\bm{z}_s,\bm{\theta}_0)} \right) \right)$$ -with $\bm{\theta}_0$ and $\bm{\theta}_1$ being the vector in- and \textit{out-of-control} parameters, respectively. Given a threshold \code{h}, an alarm is sounded at the first time when $C_{t_0}>\mathtt{h}$. +with $\bm{\theta}_0$ and $\bm{\theta}_1$ being the vector in- and \textit{out-of-control} parameters, respectively. Given a threshold \code{h}, an alarm is sounded at the first time when $C_{t_0}>\mathtt{h}$. -We set the parameters of the \code{categoricalCUSUM} to optimally detect a doubling of the odds in 2013 and 2014, i.e., $R=2$. Furthermore, we for now set the threshold of the CUSUM at $h=2$. 
+We set the parameters of the \code{categoricalCUSUM} to optimally detect a doubling of the odds in 2013 and 2014, i.e., $R=2$. Furthermore, we for now set the threshold of the CUSUM at $h=2$. We use the GAMLSS to predict the mean of the \textit{in-control} and \textit{out-of-control} distributions and store them into matrices with two columns among which the second one represents the reference category. <>= -R <- 2 +R <- 2 h <- 2 pi0 <- predict(m.bbin, newdata = salmHospitalized.df[phase2, vars], type = "response") @@ -1064,7 +1064,7 @@ <>= # CUSUM parameters R <- 2 #detect a doubling of the odds for a salmHospitalized being positive -h <- 2 #threshold of the cusum +h <- 2 #threshold of the cusum # Compute \textit{in-control} and out of control mean pi0 <- predict(m.bbin,newdata=salmHospitalized.df[phase2,vars], type="response") @@ -1074,15 +1074,15 @@ pi0m <- rbind(pi0, 1-pi0) pi1m <- rbind(pi1, 1-pi1) @ -Note that the \code{categoricalCUSUM} function is constructed to operate on the observed slot of \code{sts}-objects -which have as columns the number of cases in each category at each timepoint, \textit{i.e.}, each row of the observed slot contains the elements -$(y_{t1},...,y_{tk})$. +Note that the \code{categoricalCUSUM} function is constructed to operate on the observed slot of \code{sts}-objects +which have as columns the number of cases in each category at each timepoint, \textit{i.e.}, each row of the observed slot contains the elements +$(y_{t1},...,y_{tk})$. <>= -populationHosp <- cbind(population(salmHospitalized), +populationHosp <- cbind(population(salmHospitalized), population(salmHospitalized)) -observedHosp <- cbind(observed(salmHospitalized), - population(salmHospitalized) - +observedHosp <- cbind(observed(salmHospitalized), + population(salmHospitalized) - observed(salmHospitalized)) nrowHosp <- nrow(salmHospitalized) salmHospitalized.multi <- new("sts", freq = 52, start = c(2004, 1), @@ -1090,7 +1090,7 @@ epochAsDate = TRUE, observed = observedHosp, populationFrac = populationHosp, - state = matrix(0, nrow = nrowHosp, ncol = 2), + state = matrix(0, nrow = nrowHosp, ncol = 2), multinomialTS = TRUE) @ <>= @@ -1105,71 +1105,71 @@ populationFrac = cbind(population, population), state=matrix(0, nrow=nrow(salmHospitalized), - ncol = 2), + ncol = 2), multinomialTS=TRUE) @ -Furthermore, one needs to define a wrapper for the distribution function in order to have a argument named \code{"mu"} in the function. +Furthermore, one needs to define a wrapper for the distribution function in order to have a argument named \code{"mu"} in the function. -<>= +<>= dBB.cusum <- function(y, mu, sigma, size, log = FALSE) { - return(dBB(if (is.matrix(y)) y[1,] else y, + return(dBB(if (is.matrix(y)) y[1,] else y, if (is.matrix(y)) mu[1,] else mu, - sigma = sigma, bd = size, log = log)) + sigma = sigma, bd = size, log = log)) } @ -<>= -# Function to use as dfun in the categoricalCUSUM +<>= +# Function to use as dfun in the categoricalCUSUM dBB.cusum <- function(y, mu, sigma, size, log = FALSE) { return(dBB( if (is.matrix(y)) y[1,] else y, if (is.matrix(y)) mu[1,] else mu, - sigma = sigma, bd = size, log = log)) + sigma = sigma, bd = size, log = log)) } @ After these preliminary steps, the monitoring can be performed. 
<>= -controlCat <- list(range = phase2, h = 2, pi0 = pi0m, pi1 = pi1m, +controlCat <- list(range = phase2, h = 2, pi0 = pi0m, pi1 = pi1m, ret = "cases", dfun = dBB.cusum) -salmHospitalizedCat <- categoricalCUSUM(salmHospitalized.multi, +salmHospitalizedCat <- categoricalCUSUM(salmHospitalized.multi, control = controlCat, - sigma = exp(m.bbin$sigma.coef)) + sigma = exp(m.bbin$sigma.coef)) @ <>= # Monitoring controlCat <- list(range = phase2,h = 2,pi0 = pi0m, pi1 = pi1m, ret = "cases", dfun = dBB.cusum) -salmHospitalizedCat <- categoricalCUSUM(salmHospitalized.multi, +salmHospitalizedCat <- categoricalCUSUM(salmHospitalized.multi, control = controlCat, - sigma = exp(m.bbin$sigma.coef)) + sigma = exp(m.bbin$sigma.coef)) @ -The results can be seen in Figure~\ref{fig:catDouble}(a). With the given settings, there are alarms at week -16 in 2004 -and at week 3 in 2004. -The one in 2014 corresponds to the usual peak of the beginning of the year, which was larger than expected this year, maybe because the +The results can be seen in Figure~\ref{fig:catDouble}(a). With the given settings, there are alarms at week +16 in 2004 +and at week 3 in 2004. +The one in 2014 corresponds to the usual peak of the beginning of the year, which was larger than expected this year, maybe because the weekdays of the holidays were particularly worker-friendly so that sick notes were even less needed. -<>= +<>= y.max <- max(observed(salmHospitalized)/population(salmHospitalized),upperbound(salmHospitalized)/population(salmHospitalized),na.rm=TRUE) plotOpts2 <- modifyList(plotOpts,list(x=salmHospitalized,legend.opts=NULL,ylab="",ylim=c(0,y.max)),keep.null=TRUE) plotOpts2$xaxis.tickFreq <- list("%G"=atChange,"%m"=atChange) plotOpts2$par.list <- list(mar=c(6,5,5,5),family="Times",las=1) do.call("plot",plotOpts2) lines(salmHospitalized@populationFrac/4000,col="grey80",lwd=2) -lines(campyDE$hum*50, col="white", lwd=2) +lines(campyDE$hum*50, col="white", lwd=2) axis(side=4, at=seq(0,2000,by=500)/4000,labels=as.character(seq(0,2000,by=500)),las=1, cex=2,cex.axis=1.5,pos=length(observed(salmHospitalized))+20) par(family="Times") -text(-20, 0.6, "Proportion", pos = 3, xpd = T,cex=cex.text) +text(-20, 0.6, "Proportion", pos = 3, xpd = T,cex=cex.text) text(520, 0.6, "Total number of \n reported cases", pos = 3, xpd = T,cex=cex.text) #mtext(side=4,text=expression(paste("Total number of reported cases (thousands)", sep='')), - #las=0,line=1, cex=cex.text) + #las=0,line=1, cex=cex.text) @ \begin{figure} -\begin{center} - +\begin{center} + <>= <> @@ -1179,33 +1179,33 @@ \caption{Weekly proportion of Salmonella cases that were hospitalized in Germany 2004-2014. In addition the corresponding number of reported cases is shown as a light curve.} \label{fig:cat1} \end{figure} -<>= +<>= @ -The value for the threshold \code{h} can be determined following the procedures presented in \citet{hoehle-mazick-2010} for count data, and as in the code exhibited below. Two methods can be used for -determining the probability of a false alarm within a pre-specified number of steps for a given value of the threshold \code{h}: a Monte Carlo method relying on, e.g., 1000 simulations and a Markov Chain approximation of the CUSUM. The former is much more computationally intensive than the latter: +The value for the threshold \code{h} can be determined following the procedures presented in \citet{hoehle-mazick-2010} for count data, and as in the code exhibited below. 
Two methods can be used for +determining the probability of a false alarm within a pre-specified number of steps for a given value of the threshold \code{h}: a Monte Carlo method relying on, e.g., 1000 simulations and a Markov Chain approximation of the CUSUM. The former is much more computationally intensive than the latter: with the code below, the Monte Carlo method needed approximately 300 times more time than the Markov Chain method. -Since both results are close we recommend the Markov Chain approximation for practical use. The Monte Carlo method works by sampling observed values from the estimated -distribution and performing monitoring with \code{categoricalCUSUM} on this \code{sts} object. As observed values are estimated from the \textit{in-control} distribution every alarm thus obtained is a -false alarm so that the simulations allow to estimate the probability of a false alarm when monitoring \textit{in-control} data over the timepoints of \code{phase2}. The Markov Chain approximation introduced by \citet{brook_evans1972} is implemented as \code{LRCUSUM.runlength} +Since both results are close we recommend the Markov Chain approximation for practical use. The Monte Carlo method works by sampling observed values from the estimated +distribution and performing monitoring with \code{categoricalCUSUM} on this \code{sts} object. As observed values are estimated from the \textit{in-control} distribution every alarm thus obtained is a +false alarm so that the simulations allow to estimate the probability of a false alarm when monitoring \textit{in-control} data over the timepoints of \code{phase2}. The Markov Chain approximation introduced by \citet{brook_evans1972} is implemented as \code{LRCUSUM.runlength} which is already used for \code{glrnb}. Results from both methods can be seen in Figure~\ref{fig:catDouble}(b). We chose a value of 2 for \code{h} so that the probability of a false alarm within the 56 timepoints of \code{phase2} is less than $0.1$. -One first has to set the values of the threshold to be investigated and to prepare the function used for simulation, that draws observed values from the -\textit{in-control} distribution and performs monitoring on the corresponding time series, then indicating if there was at least one alarm. Then 1000 simulations were -performed with a fixed seed value for the sake of reproducibility. Afterwards, we tested the Markov Chain approximation using the function \code{LRCUSUM.runlength} over the same grid +One first has to set the values of the threshold to be investigated and to prepare the function used for simulation, that draws observed values from the +\textit{in-control} distribution and performs monitoring on the corresponding time series, then indicating if there was at least one alarm. Then 1000 simulations were +performed with a fixed seed value for the sake of reproducibility. Afterwards, we tested the Markov Chain approximation using the function \code{LRCUSUM.runlength} over the same grid of values for the threshold. 
<>= -h.grid <- seq(1, 10, by = 0.5) +h.grid <- seq(1, 10, by = 0.5) simone <- function(sts, h) { y <- rBB(length(phase2), mu = pi0m[1, , drop = FALSE], bd = population(sts)[phase2, ], sigma = exp(m.bbin$sigma.coef)) - observed(sts)[phase2, ] <- cbind(y, sts@populationFrac[phase2, 1] - y) + observed(sts)[phase2, ] <- cbind(y, sts@populationFrac[phase2, 1] - y) one.surv <- categoricalCUSUM(sts, modifyList(controlCat, list(h = h)), sigma = exp(m.bbin$sigma.coef)) return(any(alarms(one.surv)[, 1])) @@ -1214,13 +1214,13 @@ nSims <- 1000 -pMC <- sapply(h.grid, function(h) { - mean(replicate(nSims, simone(salmHospitalized.multi, h))) +pMC <- sapply(h.grid, function(h) { + mean(replicate(nSims, simone(salmHospitalized.multi, h))) }) pMarkovChain <- sapply( h.grid, function(h) { - TA <- LRCUSUM.runlength(mu = pi0m[1,, drop = FALSE], - mu0 = pi0m[1,, drop = FALSE], + TA <- LRCUSUM.runlength(mu = pi0m[1,, drop = FALSE], + mu0 = pi0m[1,, drop = FALSE], mu1 = pi1m[1,, drop = FALSE], n = population(salmHospitalized.multi)[phase2, ], h = h, dfun = dBB.cusum, @@ -1228,10 +1228,10 @@ return(tail(TA$cdf, n = 1)) }) @ -<>= +<>= # Values of the threshold to be investigated -h.grid <- seq(1,10,by=0.5) - +h.grid <- seq(1,10,by=0.5) + # Prepare function for simulations simone <- function(sts, h) { # Draw observed values from the \textit{in-control} distribution @@ -1239,7 +1239,7 @@ bd=population(sts)[phase2,], sigma=exp(m.bbin$sigma.coef)) observed(sts)[phase2,] <- cbind(y,sts@populationFrac[phase2,1] - y) -# Perform monitoring +# Perform monitoring one.surv <- categoricalCUSUM(sts, control=modifyList(controlCat, list(h=h)), sigma=exp(m.bbin$sigma.coef)) # Return 1 if there was at least one alarm @@ -1251,17 +1251,17 @@ # Number of simulations nSims=1000 # Simulations over the possible h values -pMC <- sapply(h.grid, function(h) { +pMC <- sapply(h.grid, function(h) { h <- h - mean(replicate(nSims, simone(salmHospitalized.multi,h))) + mean(replicate(nSims, simone(salmHospitalized.multi,h))) }) # Distribution function to be used by LRCUSUM.runlength dBB.rl <- function(y, mu, sigma, size, log = FALSE) { - dBB(y, mu = mu, sigma = sigma, bd = size, log = log) + dBB(y, mu = mu, sigma = sigma, bd = size, log = log) } # Markov Chain approximation over h.grid pMarkovChain <- sapply( h.grid, function(h) { - TA <- LRCUSUM.runlength(mu=pi0m[1,,drop=FALSE], mu0=pi0m[1,,drop=FALSE], + TA <- LRCUSUM.runlength(mu=pi0m[1,,drop=FALSE], mu0=pi0m[1,,drop=FALSE], mu1=pi1m[1,,drop=FALSE], n=population(salmHospitalized.multi)[phase2,], h=h, dfun=dBB.rl, sigma=exp(m.bbin$sigma.coef)) @@ -1283,7 +1283,7 @@ \hspace{-1em} \subfloat[]{ -<>= +<>= y.max <- max(observed(salmHospitalizedCat[,1])/population(salmHospitalizedCat[,1]),upperbound(salmHospitalizedCat[,1])/population(salmHospitalizedCat[,1]),na.rm=TRUE) plotOpts3 <- modifyList(plotOpts,list(x=salmHospitalizedCat[,1],ylab="Proportion",ylim=c(0,y.max))) plotOpts3$legend.opts <- list(x="top",bty="n",legend=c(expression(U[t])),lty=1,lwd=line.lwd,col=alarm.symbol$col,horiz=TRUE,cex=cex.leg) @@ -1293,12 +1293,12 @@ @ - + \includegraphics[width=9cm]{plots/monitoringCounts-catF.pdf} } \hspace{-3em} \subfloat[]{ -<>= +<>= par(mar=c(6,5,5,5),family="Times") matplot(h.grid, cbind(pMC,pMarkovChain),type="l",ylab=expression(P(T[A] <= 56 * "|" * tau * "=" * infinity)),xlab="Threshold h",col=1,cex=cex.text, cex.axis =cex.text,cex.lab=cex.text) @@ -1308,18 +1308,18 @@ par(family="Times") legend(4,0.08,c("Monte Carlo","Markov chain"), lty=1:2,col=1,cex=cex.text,bty="n") -@ +@ 
\includegraphics[width=9cm]{plots/monitoringCounts-catARL.pdf} } -\caption{(a) Results of the monitoring with categoricalCUSUM of the proportion of Salmonella cases that were hospitalized in Germany in Jan 2013 - Jan 2014. (b) +\caption{(a) Results of the monitoring with categoricalCUSUM of the proportion of Salmonella cases that were hospitalized in Germany in Jan 2013 - Jan 2014. (b) Probability of a false alarm within the 56 timepoints of the monitoring as a function of the threshold $h$.} \label{fig:catDouble} \end{figure} -The procedure for using the function for multicategorical variables follows the same steps (as illustrated later). Moreover, one could expand the approach -to utilize the multiple regression possibilities offered by GAMLSS. Here we chose to try to detect a change in the mean of the distribution of counts but as GAMLSS provides more general regression tools -than GLM we could also aim at detecting a change in the time trend included in the model for the mean. +The procedure for using the function for multicategorical variables follows the same steps (as illustrated later). Moreover, one could expand the approach +to utilize the multiple regression possibilities offered by GAMLSS. Here we chose to try to detect a change in the mean of the distribution of counts but as GAMLSS provides more general regression tools +than GLM we could also aim at detecting a change in the time trend included in the model for the mean. \subsubsection{Categorical CUSUM for multinomial models} @@ -1335,7 +1335,7 @@ <>= data("rotaBB") -plot(rotaBB, xlab = "Time (months)", +plot(rotaBB, xlab = "Time (months)", ylab = "Proportion of reported cases") @ @@ -1371,7 +1371,7 @@ fun(epoch(rotaBB),observed(rotaBB)[,i],type="l",xlab="Time (months)",ylab="Reported cases",ylim=c(0,max(observed(rotaBB))),col=pal[i],lwd=2) } else { fun(epoch(rotaBB),observed(rotaBB)[,i,drop=FALSE]/rowSums(observed(rotaBB)),type="l",xlab="Time (months)",ylab="Proportion of reported cases",ylim=c(0,max(observed(rotaBB)/rowSums(observed(rotaBB)))),col=pal[i],lwd=2) - } + } } # Add legend axis(1,at=as.numeric(epoch(rotaBB)),label=NA,tck=-0.01) @@ -1388,35 +1388,35 @@ @ Hence, our interest is in prospectively detecting a possible age-shift. Since the vaccine was recommended for routine vaccination in Brandenburg in 2009 we choose to start -the monitoring at that time point. We do so by fitting a multinomial logit-model containing a trend as well as one harmonic wave and use the age group 0--4 years as reference category, +the monitoring at that time point. We do so by fitting a multinomial logit-model containing a trend as well as one harmonic wave and use the age group 0--4 years as reference category, to the data from the years 2002-2008. Different \proglang{R} packages implement such type of modeling, but we shall use the \pkg{MGLM} package~\citep{MGLM}, because it also offers the fitting of extended multinomial regression models allowing for extra dispersion. 
<>= rotaBB.df <- as.data.frame(rotaBB) - + X <- with(rotaBB.df, cbind(intercept = 1, epoch, - sin1 = sin(2 * pi * epochInPeriod), + sin1 = sin(2 * pi * epochInPeriod), cos1 = cos(2 * pi * epochInPeriod))) -phase1 <- epoch(rotaBB) < as.Date("2009-01-01") +phase1 <- epoch(rotaBB) < as.Date("2009-01-01") phase2 <- !phase1 order <- c(2:5, 1); reorder <- c(5, 1:4) library("MGLM") -m0 <- MGLMreg(as.matrix(rotaBB.df[phase1, order]) ~ -1 + X[phase1, ], - dist = "MN") +m0 <- MGLMreg(as.matrix(rotaBB.df[phase1, order]) ~ -1 + X[phase1, ], + dist = "MN") @ <>= # Convert sts object to data.frame useful for regression modelling rotaBB.df <- as.data.frame(rotaBB) -# Create matrix +# Create matrix X <- with(rotaBB.df,cbind(intercept=1,epoch, sin1=sin(2*pi*epochInPeriod),cos1=cos(2*pi*epochInPeriod))) # Fit model to 2002-2009 data -phase1 <- epoch(rotaBB) < as.Date("2009-01-01") +phase1 <- epoch(rotaBB) < as.Date("2009-01-01") phase2 <- !phase1 # MGLMreg automatically takes the last class as ref so we reorder @@ -1424,13 +1424,13 @@ # Fit multinomial logit model (i.e. dist="MN") to phase1 data library("MGLM") -m0 <- MGLMreg(as.matrix(rotaBB.df[phase1,order])~ -1 + X[phase1,], dist="MN") +m0 <- MGLMreg(as.matrix(rotaBB.df[phase1,order])~ -1 + X[phase1,], dist="MN") @ <<>>= # Set threshold and option object h <- 2 @ -As described in \citet{hoehle2010} we can try to detect a specific shift in the intercept coefficients of the model. For example, a multiplicative shift of factor 7 in the example below, in the odds of each of the four age categories against the reference category is modelled by changing the intercept value of each category. +As described in \citet{hoehle2010} we can try to detect a specific shift in the intercept coefficients of the model. For example, a multiplicative shift of factor 7 in the example below, in the odds of each of the four age categories against the reference category is modelled by changing the intercept value of each category. Based on this, the \textit{in-control} and \textit{out-of-control} proportions are easily computed using the \code{predict} function for \code{MGLMreg} objects. <>= @@ -1443,7 +1443,7 @@ @ <>= m1 <- m0 -# Out-of control model: shift in all intercept coeffs +# Out-of control model: shift in all intercept coeffs m1$coefficients[1,] <- m0$coefficients[1,] + log(2) # Proportion over time for phase2 based on fitted model (re-order back) pi0 <- t(predict(m0, newdata=X[phase2,])[,reorder]) @@ -1452,13 +1452,13 @@ For applying the \code{categoricalCUSUM} function one needs to define a compatible wrapper function for the multinomial as in the binomial example. -With $\bm{\pi}^0$ and $\bm{\pi}^1$ in place one only needs to define a wrapper function, which defines the PMF of the sampling distribution -- in this case the multinomial -- in a \code{categoricalCUSUM} compatible way. +With $\bm{\pi}^0$ and $\bm{\pi}^1$ in place one only needs to define a wrapper function, which defines the PMF of the sampling distribution -- in this case the multinomial -- in a \code{categoricalCUSUM} compatible way. 
<>= dfun <- function(y, size, mu, log = FALSE) { return(dmultinom(x = y, size = size, prob = mu, log = log)) } -control <- list(range = seq(nrow(rotaBB))[phase2], h = h, pi0 = pi0, +control <- list(range = seq(nrow(rotaBB))[phase2], h = h, pi0 = pi0, pi1 = pi1, ret = "value", dfun = dfun) surv <- categoricalCUSUM(rotaBB,control=control) @ @@ -1466,7 +1466,7 @@ #Number of MC samples nSamples <- 1e4 -#Do MC +#Do MC simone.stop <- function(sts, control) { phase2Times <- seq(nrow(sts))[phase2] #Generate new phase2 data from the fitted in control model @@ -1495,68 +1495,68 @@ @ -With $\bm{\pi}^0$ and $\bm{\pi}^1$ in place one only needs to define a wrapper function, which defines the PMF of the sampling distribution -- in this case the multinomial -- in a \code{categoricalCUSUM} compatible way. +With $\bm{\pi}^0$ and $\bm{\pi}^1$ in place one only needs to define a wrapper function, which defines the PMF of the sampling distribution -- in this case the multinomial -- in a \code{categoricalCUSUM} compatible way. <>= <> @ The resulting CUSUM statistic $C_t$ as a function of time is shown in Figure~\ref{fig:ct}(a). The first time an aberration is detected is July 2009. Using 10000 Monte Carlo simulations we estimate that with the chosen threshold $h=2$ the probability for a false alarm within the 60 time points of \code{phase2} is 0.02. -As the above example shows, the LR based categorical CUSUM is rather flexible in handling any type of multivariate GLM modeling to specify the \textit{in-control} and \textit{out-of-control} proportions. However, it requires a direction of the change to be specified -- for which detection is optimal. One sensitive part of such monitoring is the fit of the multinomial distribution to a multivariate time series of proportions, which usually exhibit extra dispersion when compared to the multinomial. For example comparing the AIC between the multinomial logit-model and a Dirichlet-multinomial model with $\alpha_{ti} = \exp(\bm{x}_t^\top\bm{\beta})$~\citep{MGLM} shows that overdispersion is present. -The Dirichlet distribution is the multicategorical equivalent of the beta-binomial distribution. We exemplify its use in the code below. +As the above example shows, the LR based categorical CUSUM is rather flexible in handling any type of multivariate GLM modeling to specify the \textit{in-control} and \textit{out-of-control} proportions. However, it requires a direction of the change to be specified -- for which detection is optimal. One sensitive part of such monitoring is the fit of the multinomial distribution to a multivariate time series of proportions, which usually exhibit extra dispersion when compared to the multinomial. For example comparing the AIC between the multinomial logit-model and a Dirichlet-multinomial model with $\alpha_{ti} = \exp(\bm{x}_t^\top\bm{\beta})$~\citep{MGLM} shows that overdispersion is present. +The Dirichlet distribution is the multicategorical equivalent of the beta-binomial distribution. We exemplify its use in the code below. <>= -m0.dm <- MGLMreg(as.matrix(rotaBB.df[phase1, 1:5]) ~ -1 + X[phase1, ], +m0.dm <- MGLMreg(as.matrix(rotaBB.df[phase1, 1:5]) ~ -1 + X[phase1, ], dist = "DM") c(m0$AIC, m0.dm$AIC) @ Hence, the above estimated false alarm probability might be too low for the actual monitoring problem, because the variation in the time series is larger than implied by the multinomial. Hence, it appears prudent to repeat the analysis using the more flexible Dirichlet-multinomial model. 
This is straightforward with \code{categoricalCUSUM} once the \textit{out-of-control} proportions are specified in terms of the model. Such a specification is, however, hampered by the fact that the two models use different parametrizations. -For performing monitoring in this new setting we first need to calculate the $\alpha$'s of the multinomial-Dirichlet for the \textit{in-control} and \textit{out-of-control} +For performing monitoring in this new setting we first need to calculate the $\alpha$'s of the multinomial-Dirichlet for the \textit{in-control} and \textit{out-of-control} distributions. <>= delta <- 2 -m1.dm <- m0.dm -m1.dm$coefficients[1, ] <- m0.dm$coefficients[1, ] + +m1.dm <- m0.dm +m1.dm$coefficients[1, ] <- m0.dm$coefficients[1, ] + c(-delta, rep(delta/4, 4)) alpha0 <- exp(X[phase2,] %*% m0.dm$coefficients) alpha1 <- exp(X[phase2,] %*% m1.dm$coefficients) - + dfun <- function(y, size, mu, log = FALSE) { dLog <- ddirm(t(y), t(mu)) - if (log) { return(dLog) } else { return(exp(dLog)) } + if (log) { return(dLog) } else { return(exp(dLog)) } } h <- 2 -control <- list(range = seq(nrow(rotaBB))[phase2], h = h, - pi0 = t(alpha0), pi1 = t(alpha1), +control <- list(range = seq(nrow(rotaBB))[phase2], h = h, + pi0 = t(alpha0), pi1 = t(alpha1), ret = "value", dfun = dfun) surv.dm <- categoricalCUSUM(rotaBB, control = control) @ <>= # Change intercept in the first class (for DM all 5 classes are modeled) delta <- 2 -m1.dm <- m0.dm -m1.dm$coefficients[1,] <- m0.dm$coefficients[1,] + +m1.dm <- m0.dm +m1.dm$coefficients[1,] <- m0.dm$coefficients[1,] + c(-delta,rep(delta/4,4)) # Calculate the alphas of the multinomial-Dirichlet in the two cases alpha0 <- exp(X[phase2,] %*% m0.dm$coefficients) alpha1 <- exp(X[phase2,] %*% m1.dm$coefficients) - + # Use alpha vector as mu magnitude # (not possible to compute it from mu and size) dfun <- function(y, size, mu, log=FALSE) { dLog <- ddirm(t(y), t(mu)) - if (log) { return(dLog) } else {return(exp(dLog))} + if (log) { return(dLog) } else {return(exp(dLog))} } # Threshold h <- 2 -control <- list(range=seq(nrow(rotaBB))[phase2],h=h,pi0=t(alpha0), +control <- list(range=seq(nrow(rotaBB))[phase2],h=h,pi0=t(alpha0), pi1=t(alpha1), ret="value",dfun=dfun) surv.dm <- categoricalCUSUM(rotaBB,control=control) @ @@ -1572,7 +1572,7 @@ \hspace{-1em} \subfloat[]{ -<>= +<>= surv@observed[,1] <- 0 surv@multinomialTS <- FALSE surv.dm@observed[,1] <- 0 @@ -1580,29 +1580,29 @@ y.max <- max(observed(surv.dm[,1]),upperbound(surv.dm[,1]),observed(surv[,1]),upperbound(surv[,1]),na.rm=TRUE) plotOpts3 <- modifyList(plotOpts,list(x=surv[,1],ylim=c(0,y.max),ylab=expression(C[t]),xlab="")) plotOpts3$legend.opts <- list(x="topleft",bty="n",legend="R",lty=1,lwd=line.lwd,col=alarm.symbol$col,horiz=TRUE,cex=cex.leg) -do.call("plot",plotOpts3) +do.call("plot",plotOpts3) lines( c(0,1e99), rep(h,2),lwd=2,col="darkgray",lty=1) par(family="Times") mtext(side=1,text="Time (weeks)", - las=0,line=3, cex=cex.text) + las=0,line=3, cex=cex.text) @ - + \includegraphics[width=9cm]{plots/monitoringCounts-ctPlot1.pdf} } \hspace{-3em} \subfloat[]{ -<>= +<>= plotOpts3 <- modifyList(plotOpts,list(x=surv.dm[,1],ylim=c(0,y.max),ylab=expression(C[t]),xlab="")) plotOpts3$legend.opts <- list(x="topleft",bty="n",legend="R",lty=1,lwd=line.lwd,col=alarm.symbol$col,horiz=TRUE,cex=cex.text) y.max <- max(observed(surv.dm[,1]),upperbound(surv.dm[,1]),observed(surv[,1]),upperbound(surv[,1]),na.rm=TRUE) -do.call("plot",plotOpts3) +do.call("plot",plotOpts3) lines( c(0,1e99), rep(h,2),lwd=2,col="darkgray",lty=1) 
par(family="Times") mtext(side=1,text="Time (weeks)", - las=0,line=3, cex=cex.text) -@ + las=0,line=3, cex=cex.text) +@ \includegraphics[width=9cm]{plots/monitoringCounts-ctPlot2.pdf} } \caption{Categorical CUSUM statistic $C_t$. Once $C_t>\Sexpr{h}$ an alarm is sounded and the statistic is reset. In (a) surveillance uses the multinomial distribution and in (b) surveillance uses the Dirichlet-multinomial distribution.} @@ -1610,33 +1610,33 @@ \end{figure} -The resulting CUSUM statistic $C_t$ using the Dirichlet multinomial distribution is shown in Figure~\ref{fig:ct}(b). -We notice a rather similar behavior even though the shift-type specified by this model is slightly different than in the model of Figure~\ref{fig:ct}(a). +The resulting CUSUM statistic $C_t$ using the Dirichlet multinomial distribution is shown in Figure~\ref{fig:ct}(b). +We notice a rather similar behavior even though the shift-type specified by this model is slightly different than in the model of Figure~\ref{fig:ct}(a). \subsubsection{Categorical data in routine surveillance} -The multidimensionality of data available in public health surveillance creates many opportunities for the application of categorical time series: one could, e.g., look at the sex ratio of cases of a given disease, at the age group distribution, -at the regions sending data, etc. If one is interested in monitoring with respect to a categorical variable, a choice has to be made between monitoring each time series individually, -for instance a time series of \textit{Salmonella} cases for each age category, or to monitor the distribution of cases with respect to that factor jointly \textit{via} \code{categoricalCUSUM}. A downside of the latter -solution is that one has to specify the change parameter \code{R} in advance, which can be quite a hurdle if one has no pre-conceived idea of what could happen for, say, the age shift after the introduction of a vaccine. Alternatively, one could employ an ensemble of monitors or monitor an aggregate. However, more straightforward applications could be found in the (binomial) surveillance of positive diagnostics if one were to obtain data about tests performed by laboratories and not only about confirmed cases. An alternative would be to apply +The multidimensionality of data available in public health surveillance creates many opportunities for the application of categorical time series: one could, e.g., look at the sex ratio of cases of a given disease, at the age group distribution, +at the regions sending data, etc. If one is interested in monitoring with respect to a categorical variable, a choice has to be made between monitoring each time series individually, +for instance a time series of \textit{Salmonella} cases for each age category, or to monitor the distribution of cases with respect to that factor jointly \textit{via} \code{categoricalCUSUM}. A downside of the latter +solution is that one has to specify the change parameter \code{R} in advance, which can be quite a hurdle if one has no pre-conceived idea of what could happen for, say, the age shift after the introduction of a vaccine. Alternatively, one could employ an ensemble of monitors or monitor an aggregate. However, more straightforward applications could be found in the (binomial) surveillance of positive diagnostics if one were to obtain data about tests performed by laboratories and not only about confirmed cases. An alternative would be to apply \code{farringtonFlexible} while using the number of tests as \code{populationOffset}. 
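The vignette text above mentions, without showing code, that the (binomial) surveillance of positive diagnostics could alternatively be handled by \code{farringtonFlexible} with the number of tests supplied as a population offset. The following chunk is a minimal sketch of that idea, not part of the original vignette: the vectors \code{positives} and \code{tests} are hypothetical weekly counts of positive and total laboratory tests, and the chosen \code{start} date and control settings are placeholders.

<<echo=TRUE, eval=FALSE>>=
## Hypothetical sketch: monitor weekly positives while adjusting for the
## number of tests performed, via the populationOffset control option.
## 'positives' and 'tests' are assumed numeric vectors of equal length.
library("surveillance")
salmTests <- sts(observed = positives, population = tests,
                 start = c(2011, 1), frequency = 52)
# Monitor the most recent year of data, offsetting by the testing volume
control <- list(range = (nrow(salmTests) - 51):nrow(salmTests),
                b = 3, w = 3, populationOffset = TRUE)
survTests <- farringtonFlexible(salmTests, control = control)
plot(survTests)
@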
\subsubsection{Similar methods in the package} The package also offers another CUSUM method suitable for binary data, \code{pairedbinCUSUM} that implements the method introduced by~\citet{Steiner1999}, which does not, however, take overdispersion into account as well as \code{glrnb}. The algorithm \code{rogerson} also supports the analysis of binomial data. See Table~\ref{table:ref} for the corresponding references. \subsection{Other algorithms implemented in the package} -We conclude this description of surveillance methods by giving an overview of all algorithms implemented in the package with the corresponding references in Table~\ref{table:ref}. +We conclude this description of surveillance methods by giving an overview of all algorithms implemented in the package with the corresponding references in Table~\ref{table:ref}. One can refer to the relative reference articles and to the reference manual of the package for more information about each method. - + Criteria for choosing a method in practice are numerous. First one needs to ponder on the amount of historical data at hand -- for instance the EARS methods only need data for the last -timepoints whereas the Farrington methods use data up to $b$ years in the past. Then one should consider the amount of past data used by the algorithm -- historical reference methods use only a subset of the past data, namely +timepoints whereas the Farrington methods use data up to $b$ years in the past. Then one should consider the amount of past data used by the algorithm -- historical reference methods use only a subset of the past data, namely the timepoints located around the same timepoint in the past years, whereas other methods use all past data included in the reference data. This can be a criterion of choice since one can prefer using all available data. It is also important to decide whether one wants to detect one-timepoint aberration or more prolonged shifts. -And lastly, an important criterion is how much work needs to be done for finetuning the algorithm for each specific time series. +And lastly, an important criterion is how much work needs to be done for finetuning the algorithm for each specific time series. -The package on the one hand provides the means for analysing nearly all type of surveillance data +The package on the one hand provides the means for analysing nearly all type of surveillance data and on the other hand makes the comparison of algorithms possible. This is useful in practical applications when those algorithms are implemented into routine use, which will be the topic of Section~\ref{sec:routine}. @@ -1674,21 +1674,21 @@ \label{sec:3} Combining \pkg{surveillance} with other \proglang{R} packages and programs is easy, allowing the integration of -the aberration detection into a comprehensive surveillance system to be used in routine practice. In our opinion, such a surveillance system has to at least support the following process: loading data from local databases, analysing them within \pkg{surveillance} -and sending the results of this analysis to the end-user who is typically an epidemiologist in charge of the specific pathogen. This section exemplifies the integration of the package +the aberration detection into a comprehensive surveillance system to be used in routine practice. 
In our opinion, such a surveillance system has to at least support the following process: loading data from local databases, analysing them within \pkg{surveillance} +and sending the results of this analysis to the end-user who is typically an epidemiologist in charge of the specific pathogen. This section exemplifies the integration of the package into a whole analysis stack, first through the introduction of a simple workflow from data query to a \code{Sweave}~\citep{sweave} or \pkg{knitr}~\citep{knitr} report of signals, and secondly through the presentation of the more elaborate system in use at the German Robert Koch Institute. \subsection{A simple surveillance system} Suppose you have a database with surveillance time series but little resources to build a surveillance system encompassing all the above stages. Using \proglang{R} and \code{Sweave} or \code{knitr} for \LaTeX~you can still set up - a simple surveillance analysis without having to do everything by hand. You only need to input the data into \proglang{R} and create \code{sts} objects for each time series of interest - as explained thoroughly in~\citet{hoehle-mazick-2010}. Then, after choosing a surveillance algorithm, say \code{farringtonFlexible}, and - feeding it with the appropriate \code{control} argument, you can get a \code{sts} object with upperbounds and alarms for each of your time series of interest over the \code{range} - supplied in \code{control}. For defining the range automatically one could use the \proglang{R} function \code{Sys.Date()} to get today's date. - These steps can be introduced as a code chunk in a \code{Sweave} or \code{knitr} code that will translate it into a report that you can send to the epidemiologists in charge of the respective pathogen whose cases are monitored. - - Below is an example of a short code segment showing the analysis of the \textit{S. Newport} weekly counts of cases in the German federal states Baden-W\"{u}rttemberg and North Rhine-Westphalia -with the improved method implemented in \code{farringtonFlexible}. The package provides a \code{toLatex} method for \code{sts} objects that produces a table with the observed number of counts and upperbound for each column in + a simple surveillance analysis without having to do everything by hand. You only need to input the data into \proglang{R} and create \code{sts} objects for each time series of interest + as explained thoroughly in~\citet{hoehle-mazick-2010}. Then, after choosing a surveillance algorithm, say \code{farringtonFlexible}, and + feeding it with the appropriate \code{control} argument, you can get a \code{sts} object with upperbounds and alarms for each of your time series of interest over the \code{range} + supplied in \code{control}. For defining the range automatically one could use the \proglang{R} function \code{Sys.Date()} to get today's date. + These steps can be introduced as a code chunk in a \code{Sweave} or \code{knitr} code that will translate it into a report that you can send to the epidemiologists in charge of the respective pathogen whose cases are monitored. + + Below is an example of a short code segment showing the analysis of the \textit{S. Newport} weekly counts of cases in the German federal states Baden-W\"{u}rttemberg and North Rhine-Westphalia +with the improved method implemented in \code{farringtonFlexible}. 
The package provides a \code{toLatex} method for \code{sts} objects that produces a table with the observed number of counts and upperbound for each column in \code{observed}, where alarms can be highlighted by for instance bold text. The resulting table is shown in Table~\ref{tableResults}. <>= data("salmNewport") @@ -1696,7 +1696,7 @@ rangeAnalysis <- (today - 4):today in2013 <- which(isoWeekYear(epoch(salmNewport))$ISOYear == 2013) -algoParameters <- list(range = rangeAnalysis, noPeriods = 10, +algoParameters <- list(range = rangeAnalysis, noPeriods = 10, populationBool = FALSE, b = 4, w = 3, weightsThreshold = 2.58, pastWeeksNotIncluded = 26, pThresholdTrend = 1, @@ -1709,21 +1709,21 @@ start <- isoWeekYear(epoch(salmNewport)[range(range)[1]]) end <- isoWeekYear(epoch(salmNewport)[range(range)[2]]) -caption <- paste("Results of the analysis of reported S. Newport +caption <- paste("Results of the analysis of reported S. Newport counts in two German federal states for the weeks W-", start$ISOWeek, "-", start$ISOYear, " - W-", end$ISOWeek, "-", end$ISOYear, " performed on ", Sys.Date(), - ". Bold upperbounds (UB) indicate weeks with alarms.", + ". Bold upperbounds (UB) indicate weeks with alarms.", sep="") toLatex(results, caption = caption) @ <>= # In this example the sts-object already exists. -# Supply the code with the date of a Monday and look for the +# Supply the code with the date of a Monday and look for the # corresponding index in the sts-object today <- which(epoch(salmNewport)==as.Date("2013-12-23")) -# The analysis will be performed for the given week -# and the 4 previous ones +# The analysis will be performed for the given week +# and the 4 previous ones range <- (today-4):today in2013 <- which(isoWeekYear(epoch(salmNewport))$ISOYear==2013) # Control argument for using the improved method @@ -1738,11 +1738,11 @@ # Export the results as a tex table start <- isoWeekYear(epoch(salmNewport)[range(range)[1]]) end <- isoWeekYear(epoch(salmNewport)[range(range)[2]]) -caption <- paste("Results of the analysis of reported S. Newport +caption <- paste("Results of the analysis of reported S. Newport counts in two German federal states for the weeks W-", start$ISOWeek," ",start$ISOYear," - W-",end$ISOWeek, " ",end$ISOYear," performed on ",Sys.Date(), - ". Bold upperbounds (thresholds) indicate weeks with alarms.", + ". Bold upperbounds (thresholds) indicate weeks with alarms.", sep="") toLatex(results, table.placement="h", size = "normalsize", sanitize.text.function = identity, @@ -1753,21 +1753,21 @@ caption=caption,label="tableResults") @ -The advantage of this approach is that it can be made automatic. The downside of such a system is that the report is not interactive, for instance one cannot click on the cases and get the linelist. Nevertheless, this is a workable solution in -many cases -- especially when human and financial resources are narrow. +The advantage of this approach is that it can be made automatic. The downside of such a system is that the report is not interactive, for instance one cannot click on the cases and get the linelist. Nevertheless, this is a workable solution in +many cases -- especially when human and financial resources are narrow. In the next section, we present a more advanced surveillance system built on the package. 
\subsection{Automatic detection of outbreaks at the Robert Koch Institute} \label{sec:RKI} The package \pkg{surveillance} was used as a core building block for designing and implementing the automated outbreak detection system at the RKI in Germany~\citep{Dirk}. The text below describes the system as it was in early 2014. -Due to the Infection Protection Act (IfSG) the RKI daily receives over 1,000 notifiable disease reports. The system analyses about half a million time series per day to identify possible aberrations in the reported number of cases. +Due to the Infection Protection Act (IfSG) the RKI daily receives over 1,000 notifiable disease reports. The system analyses about half a million time series per day to identify possible aberrations in the reported number of cases. Structurally, it consists of two components: an analytical process written in \proglang{R} that daily monitors the data and a reporting component that compiles and communicates the results to the epidemiologists. The analysis task in the described version of the system relied on \pkg{surveillance} and three other \proglang{R} packages, namely \pkg{data.table}, \pkg{RODBC} and \pkg{testthat} as described in the following. The data-backend is an OLAP-system~\citep{SSAS} and relational databases, which are queried using \pkg{RODBC}~\citep{rodbc2013}. The case reports are then rapidly aggregated into univariate time series using \pkg{data.table}~\citep{datatable2013}. To each time series we apply the \code{farringtonFlexible} algorithm on univariate \code{sts} objects and store the analysis results in another SQL-database. We make intensive use of \pkg{testthat}~\citep{testthat2013} for automatic testing of the component. -Although \proglang{R} is not the typical language to write bigger software components for production, choosing \proglang{R} in combination with \pkg{surveillance} enabled us to quickly develop the analysis workflow. We can hence report positive experience using \proglang{R} also for larger software components in production. +Although \proglang{R} is not the typical language to write bigger software components for production, choosing \proglang{R} in combination with \pkg{surveillance} enabled us to quickly develop the analysis workflow. We can hence report positive experience using \proglang{R} also for larger software components in production. The reporting component was realized using Microsoft Reporting Services~\citep{SSRS}, because this technology is widely used within the RKI. It allows quick development of reports and works well with existing Microsoft Office tools, which the end-user, the epidemiologist, is used to. For example, one major requirement by the epidemiologists was to have the results compiled as Excel documents. @@ -1801,7 +1801,7 @@ \section*{Acknowledgments} -The authors would like to express their gratitude to all contributors to the package, in particular +The authors would like to express their gratitude to all contributors to the package, in particular Juliane Manitz, University of G\"{o}ttingen, Germany, for her work on the \texttt{boda} code and Angela Noufaily, The Open University, Milton Keynes, UK, for providing us the code used in her article that we extended for \texttt{farringtonFlexible}. The work of M. Salmon was financed by a PhD grant of the RKI. 
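To make the structure of such an analysis component more concrete, the chunk below gives a schematic sketch of one iteration of the loop described above (query, aggregation, monitoring). It is not code from the RKI system: the connection string, table and column names, and the example pathogen are hypothetical placeholders, and only calls from \pkg{RODBC}, \pkg{data.table} and \pkg{surveillance} that are documented in those packages are used.

<<echo=TRUE, eval=FALSE>>=
## Schematic, hypothetical sketch of one analysis iteration
library("RODBC"); library("data.table"); library("surveillance")
channel <- odbcDriverConnect("DSN=caseDB")          # placeholder DSN
cases <- data.table(sqlQuery(channel,
            "SELECT pathogen, week, county FROM reports"))  # placeholder query
odbcClose(channel)
# Aggregate case reports into weekly counts per pathogen
counts <- cases[, .N, by = .(pathogen, week)]
# Build one univariate sts object per pathogen and monitor its last week
oneSeries <- sts(observed = counts[pathogen == "SALM", N],
                 start = c(2010, 1), frequency = 52)
surv <- farringtonFlexible(oneSeries,
                           control = list(range = nrow(oneSeries)))
alarms(surv)   # store/report alarms downstream
@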
\bibliography{monitoringCounts,references} Binary files /tmp/tmpFLLNnA/6m_V_jmLxa/r-cran-surveillance-1.12.2/vignettes/surveillance-hmm.pdf and /tmp/tmpFLLNnA/NJ1xxKg7_3/r-cran-surveillance-1.13.0/vignettes/surveillance-hmm.pdf differ