diff -Nru r-cran-foreach-1.3.2/DESCRIPTION r-cran-foreach-1.4.0/DESCRIPTION --- r-cran-foreach-1.3.2/DESCRIPTION 2011-05-20 12:05:29.000000000 +0000 +++ r-cran-foreach-1.4.0/DESCRIPTION 2012-04-16 06:09:53.000000000 +0000 @@ -1,7 +1,7 @@ Package: foreach Type: Package Title: Foreach looping construct for R -Version: 1.3.2 +Version: 1.4.0 Author: Revolution Analytics Maintainer: Revolution Analytics Description: Support for the foreach looping construct. Foreach is an @@ -12,12 +12,13 @@ standard lapply function, but doesn't require the evaluation of a function. Using foreach without side effects also facilitates executing the loop in parallel. -Depends: R (>= 2.5.0), iterators(>= 1.0.0), codetools, utils +Depends: R (>= 2.5.0) +Imports: codetools, utils, iterators Suggests: randomForest -Enhances: compiler, doMC, RUnit, doSMP +Enhances: compiler, doMC, RUnit, doParallel License: Apache License (== 2.0) Repository: CRAN Repository/R-Forge/Project: foreach -Repository/R-Forge/Revision: 14 -Date/Publication: 2011-05-20 12:05:29 -Packaged: 2011-05-18 20:56:21 UTC; rforge +Repository/R-Forge/Revision: 22 +Packaged: 2012-04-13 19:51:13 UTC; rforge +Date/Publication: 2012-04-16 06:09:53 diff -Nru r-cran-foreach-1.3.2/MD5 r-cran-foreach-1.4.0/MD5 --- r-cran-foreach-1.3.2/MD5 1970-01-01 00:00:00.000000000 +0000 +++ r-cran-foreach-1.4.0/MD5 2012-04-16 06:09:53.000000000 +0000 @@ -0,0 +1,61 @@ +138aa66013a68444b9300e5bd1644b43 *DESCRIPTION +c32fc6487cdb79913750f7097de2075f *NAMESPACE +30bc1844d81b52c7b15ba18648308e14 *NEWS +47e33082104d2bc4bbf6903a8f5ee69a *R/acc.R +5027fbd0eb0875545deefc4e77df3085 *R/do.R +a7eea424b2f51e3fdc52da6a58a0c278 *R/foreach.R +5663b485e61fe4127c4a55d80ccb3e71 *R/getsyms.R +ba7e6bee7fc596e6ee08feaa2ee44a62 *R/times.R +85b4a5712555f13aefa4d723ba78fcd9 *R/zzz.R +caa28a573b448cbd60f22052c2bca7a3 *demo/00Index +515798525e9e08e68a66ab24df3d17a0 *demo/sincSEQ.R +f8ac22a80a28c04f29ae2a88686d07b7 *inst/doc/foreach.Rnw +e23b13b994fcc332fc3c449f1d41ad57 
*inst/doc/foreach.pdf +b58f77e7ee3b70d93050f5bb19502b5a *inst/doc/nested.Rnw +4e42381845e79aa70c3590fa30803939 *inst/doc/nested.pdf +3380c6bfe2789c1316d36c368d968bc1 *inst/examples/apply.R +500fc0fa2cb4b07b809e974bc99ba4b9 *inst/examples/bigmax.R +4549e5165479d323e9348b2d726de9da *inst/examples/bigmean.R +31be8c935f6fef084c3b9f69148d9d95 *inst/examples/bigmean2.R +20be562ced9134739ff69cbe09a20b7c *inst/examples/bootpar.R +53e2a90eb9cf9eb9f6a02f1658b87b7c *inst/examples/bootpar2.R +00f17395946b1090a308985cbca56515 *inst/examples/bootseq.R +e0be352dcb9674bd2e97f522ae9a4bf6 *inst/examples/colMeans.R +40c1d5c69df84a9d96c21091999335bb *inst/examples/comprehensions.R +684705c4a63eeadc8cc37f25a2606269 *inst/examples/cross.R +4fb83e90b01ab5da4e25805cce489393 *inst/examples/feapply.R +f99bbd4ecd00c5558e4c130c6b453161 *inst/examples/for.R +dd6d1bd8f4bcc4555b900b9c00955c33 *inst/examples/germandata.txt +b0684796a1576e974134f62b8fc5e6fc *inst/examples/isplit.R +f0e946d73e7dea4c65dcd1d2754b7ad5 *inst/examples/matmul.R +abf01cf248cf054f26c0a29ced984e8a *inst/examples/matmul2.R +cadff38eb4c9fc2fa385d600e42046b3 *inst/examples/output.R +acd652ebc9903fb4a039b09f3ac102a9 *inst/examples/pi.R +62900b80c46aa3fa40b69edef2e51291 *inst/examples/qsort.R +81fecace2a92963972db2adb2fc9fed7 *inst/examples/rf.R +db5fcdbddb502aab39e82d17a10b0c1a *inst/examples/sinc.R +23d155b4116b87e6304a954d630371d3 *inst/examples/sinc2.R +4aaa9d9782f8d0b331e3dcc021bd9c14 *inst/examples/sqlite.R +80d22f8c75b2d99335d8f09c9c22dc34 *inst/examples/tuneRF.R +3d981a90b7471c26347ec0593ca55167 *inst/unitTests/combineTest.R +bdc6faf27f9438670191fb0e9571bd01 *inst/unitTests/errorTest.R +ff12ff3dd5c50845c81e15178de13b36 *inst/unitTests/foreachTest.R +1f4ee6f110624a1678ba8976ba7aec1d *inst/unitTests/iteratorTest.R +c14fa871ff9d9fc719fdce467f0d717d *inst/unitTests/loadFactorTest.R +9321ccd8b46047e704893cab4c5c6795 *inst/unitTests/mergeTest.R +e8cdce27b2b33bd51fcb021f9034d276 *inst/unitTests/nestedTest.R 
+ddcfe0035e22f2d6c2f9e454b439cffa *inst/unitTests/packagesTest.R +d96e58771409bee69f000df068219695 *inst/unitTests/runTestSuite.sh +fb0831e84e6f8d7b062bc7d2c3cbd892 *inst/unitTests/stressTest.R +802149beb60028bba88d9206d6056118 *inst/unitTests/whenTest.R +3f8eb3a21cfa4ca1720d247a89dd671b *man/foreach-ext.Rd +6d93867afe82453e171d5dbea2814347 *man/foreach-package.Rd +bd6f8b6d151c6bfb91f493c8d9d3beb2 *man/foreach.Rd +ccb36ea1d3b7a8df1b3507663fc2a8b8 *man/getDoParWorkers.Rd +649d347d1fc6c8bfcce655b70e7ff43d *man/getDoSeqWorkers.Rd +807d977bb241d21ac67742178d24c4e0 *man/registerDoSEQ.Rd +a2b2665ab547a5ceb315dd36a5dcdffa *man/setDoPar.Rd +6c3efff477172db6a6b8ac789c125f74 *man/setDoSeq.Rd +810a0add5503c91c3cf9c01561397af8 *tests/doRUnit.R +f8ac22a80a28c04f29ae2a88686d07b7 *vignettes/foreach.Rnw +b58f77e7ee3b70d93050f5bb19502b5a *vignettes/nested.Rnw diff -Nru r-cran-foreach-1.3.2/NAMESPACE r-cran-foreach-1.4.0/NAMESPACE --- r-cran-foreach-1.3.2/NAMESPACE 2011-05-18 20:55:10.000000000 +0000 +++ r-cran-foreach-1.4.0/NAMESPACE 2012-04-11 18:07:29.000000000 +0000 @@ -1,7 +1,8 @@ export(foreach, when, times, "%do%", "%dopar%", "%:%", registerDoSEQ, - getDoParRegistered, getDoParWorkers, getDoParName, getDoParVersion, - setDoPar, getResult, getErrorValue, getErrorIndex, accumulate, - makeAccum, getexports) + getDoSeqRegistered, getDoSeqWorkers, getDoSeqName, getDoSeqVersion, + setDoSeq, getDoParRegistered, getDoParWorkers, getDoParName, + getDoParVersion, setDoPar, getResult, getErrorValue, getErrorIndex, + accumulate, makeAccum, getexports) S3method("iter", "foreach") S3method("iter", "filteredforeach") S3method("iter", "xforeach") @@ -21,4 +22,5 @@ S3method("accumulate", "ifilteredforeach") S3method("accumulate", "ixforeach") import(iterators) -import(codetools) +importFrom(codetools, "findGlobals") +import(utils) diff -Nru r-cran-foreach-1.3.2/NEWS r-cran-foreach-1.4.0/NEWS --- r-cran-foreach-1.3.2/NEWS 1970-01-01 00:00:00.000000000 +0000 +++ r-cran-foreach-1.4.0/NEWS 
2012-04-13 16:49:35.000000000 +0000 @@ -0,0 +1,28 @@ +NEWS/ChangeLog for foreach +-------------------------- + +1.4.0 2012-04-11 + o Removed spurious warning from getDoSEQ. Bug report from Ben + Barnes. + o Moved welcome message from .onLoad to .onAttach. Bug report + from Benilton Carvalho. + o Modified setDoPar and setDoSeq to undo changes to .foreachGlobals + on error. Bug report from Benilton Carvalho. + o Moved vignettes from inst/doc to vignettes. + o Modified DESCRIPTION file by moving codetools, iterators, and utils + from Depends to Imports. Bug report from Suraj Gupta. + +1.3.5 2012-03-14 + o Cleanup from previous patch. Bug report from Brian Ripley. + +1.3.4 2012-03-12 + o Added support for multiple sequential backends. (Idea and patch + from Tyler Pirtle, Matt Furia, and Joseph Hellerstein.) + o Modified doRUnit.R to use no more than two cores during R CMD check. + +1.3.2 2011-05-08 + o Regularized unit tests so they can run through R CMD check + o Added support for compiler package of 2.13.0 and later. + +1.3.1 2010-11-22 + o First R-forge release. 
diff -Nru r-cran-foreach-1.3.2/R/do.R r-cran-foreach-1.4.0/R/do.R --- r-cran-foreach-1.3.2/R/do.R 2011-05-18 20:55:11.000000000 +0000 +++ r-cran-foreach-1.4.0/R/do.R 2012-04-13 16:49:35.000000000 +0000 @@ -18,17 +18,48 @@ # this is called to register a parallel backend setDoPar <- function(fun, data=NULL, info=function(data, item) NULL) { - assign('fun', fun, pos=.foreachGlobals, inherits=FALSE) - assign('data', data, pos=.foreachGlobals, inherits=FALSE) - assign('info', info, pos=.foreachGlobals, inherits=FALSE) + tryCatch( + { + assign('fun', fun, pos=.foreachGlobals, inherits=FALSE) + assign('data', data, pos=.foreachGlobals, inherits=FALSE) + assign('info', info, pos=.foreachGlobals, inherits=FALSE) + }, error = function(e) { + if (exists('fun', where=.foreachGlobals, inherits=FALSE)) + remove('fun', envir=.foreachGlobals) + if (exists('data', where=.foreachGlobals, inherits=FALSE)) + remove('data', envir=.foreachGlobals) + if (exists('info', where=.foreachGlobals, inherits=FALSE)) + remove('info', envir=.foreachGlobals) + e + }) } -# this explicitly registers a sequential parallel backend + +# this is called to register a sequential backend +setDoSeq <- function(fun, data=NULL, info=function(data, item) NULL) { + tryCatch( + { + assign('seqFun', fun, pos=.foreachGlobals, inherits=FALSE) + assign('seqData', data, pos=.foreachGlobals, inherits=FALSE) + assign('seqInfo', info, pos=.foreachGlobals, inherits=FALSE) + }, error = function(e) { + if (exists('seqFun', where=.foreachGlobals, inherits=FALSE)) + remove('seqFun', envir = .foreachGlobals) + if (exists('seqData', where=.foreachGlobals, inherits=FALSE)) + remove('seqData', envir = .foreachGlobals) + if (exists('seqInfo', where=.foreachGlobals, inherits=FALSE)) + remove('seqInfo', envir = .foreachGlobals) + e + }) +} + +# this explicitly registers a sequential backend for do and dopar. 
registerDoSEQ <- function() { setDoPar(doSEQ, NULL, info) + setDoSeq(doSEQ, NULL, info) } -# passed to setDoPar via registerDoSEQ, and called by getDoParWorkers, etc +# passed to setDoPar and setDoSeq via registerDoSEQ, and called by getDoParWorkers, getDoSeqWorkers, etc info <- function(data, item) { switch(item, workers=1L, @@ -37,6 +68,12 @@ NULL) } +# this returns a logical value indicating if a sequential backend +# has been registered or not +getDoSeqRegistered <- function() { + exists('seqFun', where=.foreachGlobals, inherits=FALSE) +} + # this returns a logical value indicating if a parallel backend # has been registered or not getDoParRegistered <- function() { @@ -44,6 +81,19 @@ } # this returns the number of workers used by the currently registered +# sequential backend +getDoSeqWorkers <- function() { + wc <- if (exists('seqInfo', where=.foreachGlobals, inherits=FALSE)) + .foreachGlobals$seqInfo(.foreachGlobals$seqData, 'workers') + else + NULL + + # interpret a NULL as a single worker, but the backend + # can return NA without interference + if (is.null(wc)) 1L else wc +} + +# this returns the number of workers used by the currently registered # parallel backend getDoParWorkers <- function() { wc <- if (exists('info', where=.foreachGlobals, inherits=FALSE)) @@ -56,6 +106,14 @@ if (is.null(wc)) 1L else wc } +# this returns the name of the currently registered sequential backend +getDoSeqName <- function() { + if (exists('seqInfo', where=.foreachGlobals, inherits=FALSE)) + .foreachGlobals$seqInfo(.foreachGlobals$seqData, 'name') + else + NULL +} + # this returns the name of the currently registered parallel backend getDoParName <- function() { if (exists('info', where=.foreachGlobals, inherits=FALSE)) @@ -64,6 +122,14 @@ NULL } +# this returns the version of the currently registered sequential backend +getDoSeqVersion <- function() { + if (exists('seqInfo', where=.foreachGlobals, inherits=FALSE)) + .foreachGlobals$seqInfo(.foreachGlobals$seqData, 'version') + else + NULL +} + # this 
returns the version of the currently registered parallel backend getDoParVersion <- function() { if (exists('info', where=.foreachGlobals, inherits=FALSE)) @@ -73,21 +139,31 @@ } -# used internally to get the currently registered parallel backend +# used internally to get the currently registered sequential backend +getDoSeq <- function() { + if (exists('seqFun', where=.foreachGlobals, inherits=FALSE)) { + list(fun=.foreachGlobals$seqFun, data=.foreachGlobals$seqData) + } else { + list(fun=doSEQ, data=NULL) + } +} + +# used internally to get the currently registered parallel backend getDoPar <- function() { if (exists('fun', where=.foreachGlobals, inherits=FALSE)) { list(fun=.foreachGlobals$fun, data=.foreachGlobals$data) } else { - if (!exists('warningIssued', where=.foreachGlobals, inherits=FALSE)) { + if (!exists('parWarningIssued', where=.foreachGlobals, inherits=FALSE)) { warning('executing %dopar% sequentially: no parallel backend registered', call.=FALSE) - assign('warningIssued', TRUE, pos=.foreachGlobals, inherits=FALSE) + assign('parWarningIssued', TRUE, pos=.foreachGlobals, inherits=FALSE) } list(fun=doSEQ, data=NULL) } } '%do%' <- function(obj, ex) { - doSEQ(obj, substitute(ex), parent.frame()) + e <- getDoSeq() + e$fun(obj, substitute(ex), parent.frame(), e$data) } '%dopar%' <- function(obj, ex) { diff -Nru r-cran-foreach-1.3.2/R/foreach.R r-cran-foreach-1.4.0/R/foreach.R --- r-cran-foreach-1.3.2/R/foreach.R 2011-05-18 20:55:11.000000000 +0000 +++ r-cran-foreach-1.4.0/R/foreach.R 2012-02-24 22:35:18.000000000 +0000 @@ -239,7 +239,7 @@ # put the result in our buffer cache name <- paste('result', tag, sep='.') - assign(name, result, obj$state, inherit=FALSE) + assign(name, result, obj$state, inherits=FALSE) ibuf <- if (obj$combineInfo$in.order) { tag - obj$state$buf.off } else { diff -Nru r-cran-foreach-1.3.2/R/zzz.R r-cran-foreach-1.4.0/R/zzz.R --- r-cran-foreach-1.3.2/R/zzz.R 2011-05-18 20:55:11.000000000 +0000 +++ r-cran-foreach-1.4.0/R/zzz.R 2012-04-11 18:07:29.000000000 +0000 @@ -14,7 +14,7 @@ # limitations under the 
License. # -.onLoad <- function(lib, pkg) { +.onAttach <- function(lib, pkg) { if (interactive()) { packageStartupMessage('foreach: simple, scalable parallel programming from Revolution Analytics\n', 'Use Revolution R for scalability, fault tolerance and more.\n', diff -Nru r-cran-foreach-1.3.2/debian/changelog r-cran-foreach-1.4.0/debian/changelog --- r-cran-foreach-1.3.2/debian/changelog 2013-05-05 02:37:17.000000000 +0000 +++ r-cran-foreach-1.4.0/debian/changelog 2013-05-05 02:37:17.000000000 +0000 @@ -1,3 +1,41 @@ +r-cran-foreach (1.4.0-2precise0) precise; urgency=low + + * Compilation for Ubuntu 12.04.2 LTS + + -- Michael Rutter Sun, 05 May 2013 02:34:12 +0000 + +r-cran-foreach (1.4.0-2) unstable; urgency=low + + * debian/control: Set Build-Depends: to current R version + + * (Re-)building with R 3.0.0 (beta) + + -- Dirk Eddelbuettel Sun, 31 Mar 2013 07:30:07 -0500 + +r-cran-foreach (1.4.0-1) unstable; urgency=low + + * New upstream release + + * debian/control: Set Build-Depends: to current R version + + -- Dirk Eddelbuettel Tue, 17 Apr 2012 08:28:13 -0500 + +r-cran-foreach (1.3.5-1) unstable; urgency=low + + * New upstream release + + -- Dirk Eddelbuettel Wed, 14 Mar 2012 19:19:44 -0500 + +r-cran-foreach (1.3.4-1) unstable; urgency=low + + * New upstream release + + * debian/control: Set Build-Depends: to current R version + * debian/control: Change Depends to ${R:Depends} + * debian/control: Set Standards-Version: to current version + + -- Dirk Eddelbuettel Tue, 13 Mar 2012 10:12:21 -0500 + r-cran-foreach (1.3.2-1) unstable; urgency=low * New upstream release diff -Nru r-cran-foreach-1.3.2/debian/control r-cran-foreach-1.4.0/debian/control --- r-cran-foreach-1.3.2/debian/control 2013-05-05 02:37:17.000000000 +0000 +++ r-cran-foreach-1.4.0/debian/control 2013-05-05 02:37:17.000000000 +0000 @@ -2,13 +2,13 @@ Section: gnu-r Priority: optional Maintainer: Dirk Eddelbuettel -Build-Depends: debhelper (>= 7.0.0), r-base-dev (>= 2.13.0), cdbs, r-cran-codetools, 
r-cran-iterators -Standards-Version: 3.9.1 +Build-Depends: debhelper (>= 7.0.0), r-base-dev (>= 3.0.0~20130327), cdbs, r-cran-codetools, r-cran-iterators +Standards-Version: 3.9.4 Homepage: http://cran.r-project.org/web/packages/foreach/index.html Package: r-cran-foreach Architecture: all -Depends: r-base-core (>= 2.13.0), r-cran-codetools, r-cran-iterators +Depends: ${R:Depends}, r-cran-codetools, r-cran-iterators Description: GNU R foreach looping support This package provides support for the foreach looping construct. Foreach is an idiom that allows for iterating over elements in a diff -Nru r-cran-foreach-1.3.2/inst/doc/foreach.Rnw r-cran-foreach-1.4.0/inst/doc/foreach.Rnw --- r-cran-foreach-1.3.2/inst/doc/foreach.Rnw 2011-05-18 20:55:10.000000000 +0000 +++ r-cran-foreach-1.4.0/inst/doc/foreach.Rnw 2012-04-13 19:51:13.000000000 +0000 @@ -297,6 +297,7 @@ for each time it is called. For example: <>= +library(iterators) x <- foreach(a=irnorm(4, count=4), .combine='cbind') %do% a x @ Binary files /tmp/PKhSjYos7f/r-cran-foreach-1.3.2/inst/doc/foreach.pdf and /tmp/VKlCrD7XHK/r-cran-foreach-1.4.0/inst/doc/foreach.pdf differ diff -Nru r-cran-foreach-1.3.2/inst/doc/nested.Rnw r-cran-foreach-1.4.0/inst/doc/nested.Rnw --- r-cran-foreach-1.3.2/inst/doc/nested.Rnw 2011-05-18 20:55:10.000000000 +0000 +++ r-cran-foreach-1.4.0/inst/doc/nested.Rnw 2012-04-13 19:51:13.000000000 +0000 @@ -215,7 +215,7 @@ Task chunking allows you to send multiple tasks to the workers at once. This can be much more efficient, especially for short tasks. Currently, -only the \texttt{doNWS} and \texttt{doSMP} backends support task +only the \texttt{doNWS} backend supports task chunking. 
Here's how it's done with \texttt{doNWS}: <>= @@ -330,6 +330,7 @@ \texttt{foreach} function.} <>= +library(iterators) opts <- list(chunkSize=2) d <- foreach(b=bvec, j=icount(), .combine='rbind', .options.nws=opts) %:% Binary files /tmp/PKhSjYos7f/r-cran-foreach-1.3.2/inst/doc/nested.pdf and /tmp/VKlCrD7XHK/r-cran-foreach-1.4.0/inst/doc/nested.pdf differ diff -Nru r-cran-foreach-1.3.2/inst/unitTests/runTestSuite.sh r-cran-foreach-1.4.0/inst/unitTests/runTestSuite.sh --- r-cran-foreach-1.3.2/inst/unitTests/runTestSuite.sh 2011-05-18 20:55:10.000000000 +0000 +++ r-cran-foreach-1.4.0/inst/unitTests/runTestSuite.sh 2012-04-11 18:07:29.000000000 +0000 @@ -27,28 +27,6 @@ cat('** Using multicore backend\n') library(doMC) registerDoMC() -} else if (method == 'SMP') { - cat('** Using SMP backend\n') - library(doSMP) - w <- startWorkers(verbose=verbose) - .Last <- function() { - cat('shutting down SMP workers...\n') - stopWorkers(w) - cat('shutdown complete\n') - } - registerDoSMP(w) - - # initialize the workers that we've just registered to use - # a sequential backend so we don't get warning messages from - # nestedTest.R when running the test suite using doSMP - initEnvir <- function(e) { - library(foreach) - registerDoSEQ() - } - smpopts <- list(initEnvir=initEnvir) - r <- foreach(icount(getDoParWorkers()), .options.smp=smpopts) %dopar% { - Sys.sleep(3) # XXX hack: need a barrier of some kind - } } else if (method == 'SEQ') { cat('** Using sequential backend\n') registerDoSEQ() diff -Nru r-cran-foreach-1.3.2/man/foreach.Rd r-cran-foreach-1.4.0/man/foreach.Rd --- r-cran-foreach-1.3.2/man/foreach.Rd 2011-05-18 20:55:11.000000000 +0000 +++ r-cran-foreach-1.4.0/man/foreach.Rd 2012-04-11 18:07:29.000000000 +0000 @@ -108,12 +108,16 @@ This faciliates parallelization, but looks more natural to people that prefer \code{for} loops to \code{lapply}. +The \code{\%:\%} operator is the \emph{nesting} operator, used for creating +nested foreach loops. 
Type \code{vignette("nested")} at the R prompt for +more details. + Parallel computation depends upon a \emph{parallel backend} that must be registered before performing the computation. The parallel backends available -will be system-specific, but include \code{doNWS}, which uses the NetWorkSpaces -parallelization system, \code{doMC}, which uses the \code{multicore} package, -and \code{doSNOW}. Each parallel backend has a specific registration function, -such as \code{registerDoNWS} or \code{registerDoSNOW}. +will be system-specific, but include \code{doParallel}, which uses R's built-in +\pkg{parallel} package, \pkg{doMC}, which uses the \pkg{multicore} package, +and \pkg{doSNOW}. Each parallel backend has a specific registration function, +such as \code{registerDoParallel} or \code{registerDoSNOW}. The \code{times} function is a simple convenience function that calls \code{foreach}. It is useful for evaluating an \code{R} expression multiple @@ -144,6 +148,7 @@ (m[i,] / mean(m[i,])) # simple (and inefficient) parallel matrix multiply +library(iterators) a <- matrix(1:16, 4, 4) b <- t(a) foreach(b=iter(b, by='col'), .combine=cbind) \%dopar\% diff -Nru r-cran-foreach-1.3.2/man/getDoSeqWorkers.Rd r-cran-foreach-1.4.0/man/getDoSeqWorkers.Rd --- r-cran-foreach-1.3.2/man/getDoSeqWorkers.Rd 1970-01-01 00:00:00.000000000 +0000 +++ r-cran-foreach-1.4.0/man/getDoSeqWorkers.Rd 2012-02-13 23:31:28.000000000 +0000 @@ -0,0 +1,40 @@ +\name{getDoSeqWorkers} +\alias{getDoSeqWorkers} +\alias{getDoSeqRegistered} +\alias{getDoSeqName} +\alias{getDoSeqVersion} +\title{Functions Providing Information on the doSeq Backend} +\description{ +The \code{getDoSeqWorkers} function returns the number of +execution workers there are in the currently registered doSeq backend. +A \code{1} is returned by default. + +The \code{getDoSeqRegistered} function returns TRUE if a doSeq backend +has been registered, otherwise FALSE. 
+ +The \code{getDoSeqName} function returns the name of the currently +registered doSeq backend. A \code{NULL} is returned if no backend is +registered. + +The \code{getDoSeqVersion} function returns the version of the currently +registered doSeq backend. A \code{NULL} is returned if no backend is +registered. +} +\usage{ +getDoSeqWorkers() +getDoSeqRegistered() +getDoSeqName() +getDoSeqVersion() +} + +\examples{ +cat(sprintf('\%s backend is registered\n', + if(getDoSeqRegistered()) 'A' else 'No')) +cat(sprintf('Running with \%d worker(s)\n', getDoSeqWorkers())) +(name <- getDoSeqName()) +(ver <- getDoSeqVersion()) +if (getDoSeqRegistered()) + cat(sprintf('Currently using \%s [\%s]\n', name, ver)) +} + +\keyword{utilities} diff -Nru r-cran-foreach-1.3.2/man/setDoSeq.Rd r-cran-foreach-1.4.0/man/setDoSeq.Rd --- r-cran-foreach-1.3.2/man/setDoSeq.Rd 1970-01-01 00:00:00.000000000 +0000 +++ r-cran-foreach-1.4.0/man/setDoSeq.Rd 2012-02-13 23:31:28.000000000 +0000 @@ -0,0 +1,22 @@ +\name{setDoSeq} +\alias{setDoSeq} +\title{setDoSeq} +\description{ +The \code{setDoSeq} function is used to register a sequential backend with the +foreach package. This isn't normally executed by the user. Instead, packages +that provide a sequential backend provide a function named \code{registerDoSeq} +that calls \code{setDoSeq} using the appropriate arguments. 
+} +\usage{ +setDoSeq(fun, data=NULL, info=function(data, item) NULL) +} +\arguments{ + \item{fun}{A function that implements the functionality of \code{\%do\%}.} + \item{data}{Data to be passed to the registered function.} + \item{info}{Function that retrieves information about the backend.} +} +\seealso{ + \code{\link{\%dopar\%}} +} + +\keyword{utilities} diff -Nru r-cran-foreach-1.3.2/tests/doRUnit.R r-cran-foreach-1.4.0/tests/doRUnit.R --- r-cran-foreach-1.3.2/tests/doRUnit.R 2011-05-18 20:55:11.000000000 +0000 +++ r-cran-foreach-1.4.0/tests/doRUnit.R 2012-04-11 18:07:29.000000000 +0000 @@ -21,30 +21,19 @@ ################################################################ ## BEGIN PACKAGE SPECIFIC CONFIGURATION # ################################################################ - if ("doSMP" %in% row.names(installed.packages())){ - library(doSMP) - w <- startWorkers() + if ("doParallel" %in% row.names(installed.packages())){ + library(doParallel) + w <- makeCluster(2) .Last <- function(){ - cat('shutting down SMP workers...\n') - stopWorkers(w) + cat('shutting down cluster...\n') + stopCluster(w) cat('shutdown complete\n') } - registerDoSMP(w) - # initialize the workers that we've just registered to use - # a sequential backend so we don't get warning messages from - # nestedTest.R when running the test suite using doSMP - initEnvir <- function(e) { - library(foreach) - registerDoSEQ() - } - smpopts <- list(initEnvir=initEnvir) - r <- foreach(icount(getDoParWorkers()), .options.smp=smpopts) %dopar% { - Sys.sleep(3) # XXX hack: need a barrier of some kind - } + registerDoParallel(cl=w) } else if ("doMC" %in% row.names(installed.packages())) { library(doMC) - registerDoMC() + registerDoMC(2) } else { # default to sequential registerDoSEQ() diff -Nru r-cran-foreach-1.3.2/vignettes/foreach.Rnw r-cran-foreach-1.4.0/vignettes/foreach.Rnw --- r-cran-foreach-1.3.2/vignettes/foreach.Rnw 1970-01-01 00:00:00.000000000 +0000 +++ r-cran-foreach-1.4.0/vignettes/foreach.Rnw 
2012-04-11 18:07:29.000000000 +0000 @@ -0,0 +1,567 @@ +% \VignetteIndexEntry{foreach Manual} +% \VignetteDepends{foreach} +% \VignettePackage{foreach} +\documentclass[12pt]{article} +\usepackage{amsmath} +\usepackage[pdftex]{graphicx} +\usepackage{color} +\usepackage{xspace} +\usepackage{fancyvrb} +\usepackage{fancyhdr} + \usepackage[ + colorlinks=true, + linkcolor=blue, + citecolor=blue, + urlcolor=blue] + {hyperref} + \usepackage{lscape} + +\usepackage{Sweave} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +% define new colors for use +\definecolor{darkgreen}{rgb}{0,0.6,0} +\definecolor{darkred}{rgb}{0.6,0.0,0} +\definecolor{lightbrown}{rgb}{1,0.9,0.8} +\definecolor{brown}{rgb}{0.6,0.3,0.3} +\definecolor{darkblue}{rgb}{0,0,0.8} +\definecolor{darkmagenta}{rgb}{0.5,0,0.5} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +\newcommand{\bld}[1]{\mbox{\boldmath $#1$}} +\newcommand{\shell}[1]{\mbox{$#1$}} +\renewcommand{\vec}[1]{\mbox{\bf {#1}}} + +\newcommand{\ReallySmallSpacing}{\renewcommand{\baselinestretch}{.6}\Large\normalsize} +\newcommand{\SmallSpacing}{\renewcommand{\baselinestretch}{1.1}\Large\normalsize} + +\newcommand{\halfs}{\frac{1}{2}} + +\setlength{\oddsidemargin}{-.25 truein} +\setlength{\evensidemargin}{0truein} +\setlength{\topmargin}{-0.2truein} +\setlength{\textwidth}{7 truein} +\setlength{\textheight}{8.5 truein} +\setlength{\parindent}{0.20truein} +\setlength{\parskip}{0.10truein} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\pagestyle{fancy} +\lhead{} +\chead{Using The {\tt foreach} Package} +\rhead{} +\lfoot{} +\cfoot{} +\rfoot{\thepage} +\renewcommand{\headrulewidth}{1pt} +\renewcommand{\footrulewidth}{1pt} +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +\title{Using The {\tt foreach} Package} +\author{Steve Weston \\ doc@revolutionanalytics.com} + + +\begin{document} + +\maketitle + +\thispagestyle{empty} + +\section{Introduction} + +One of R's 
most useful features is its interactive interpreter. This +makes it very easy to learn and experiment with R. It allows you to +use R like a calculator to perform arithmetic operations, display data +sets, generate plots, and create models. + +Before too long, new R users will find a need to perform some +operation repeatedly. Perhaps they want to run a simulation repeatedly +in order to find the distribution of the results. Perhaps they need to +execute a function with a variety of different arguments passed to it. +Or maybe they need to create a model for many different data sets. + +Repeated executions can be done manually, but it becomes quite +tedious to execute repeated operations, even with the use of command +line editing. Fortunately, R is much more than an interactive +calculator. It has its own built-in language that is intended to +automate tedious tasks, such as repeatedly executing R calculations. + +R comes with various looping constructs that solve this problem. The +\texttt{for} loop is one of the more common looping constructs, but +the \texttt{repeat} and \texttt{while} statements are also quite useful. +In addition, there is the family of ``apply'' functions, which includes +\texttt{apply}, \texttt{lapply}, \texttt{sapply}, \texttt{eapply}, +\texttt{mapply}, \texttt{rapply}, and others. + +The \texttt{foreach} package provides a new looping construct for +executing R code repeatedly. With the bewildering variety of existing +looping constructs, you may doubt that there is a need for yet another +construct. The main reason for using the \texttt{foreach} package is +that it supports {\em parallel execution}, that is, it can execute those +repeated operations on multiple processors/cores on your computer, or on +multiple nodes of a cluster. If each operation takes over a minute, and +you want to execute it hundreds of times, the overall runtime can take +hours. 
But using \texttt{foreach}, that operation can be executed in +parallel on hundreds of processors on a cluster, reducing the execution +time back down to minutes. + +But parallel execution is not the only reason for using the +\texttt{foreach} package. There are other reasons that you might choose +to use it to execute quick executing operations, as we will see later in +the document. + +\section{Getting Started} + +Let's take a look at a simple example use of the \texttt{foreach} package. +Assuming that you have the \texttt{foreach} package installed, you first +need to load it: + +<>= +library(foreach) +@ + +Note that all of the packages that \texttt{foreach} depends on will be +loaded as well. + +Now I can use \texttt{foreach} to execute the \texttt{sqrt} function +repeatedly, passing it the values 1 through 3, and returning the results +in a list, called \texttt{x}\footnote{Of course, \texttt{sqrt} is a +vectorized function, so you would never really do this. But later, +we'll see how to take advantage of vectorized functions with +\texttt{foreach}.}: + +<>= +x <- foreach(i=1:3) %do% sqrt(i) +x +@ + +This is a bit odd looking, because it looks vaguely like a \texttt{for} +loop, but is implemented using a binary operator, called +\texttt{\%do\%}. Also, unlike a \texttt{for} loop, it returns a +value. This is quite important. The purpose of this statement is to +compute the list of results. Generally, \texttt{foreach} with +\texttt{\%do\%} is used to execute an R expression repeatedly, and return +the results in some data structure or object, which is a list by +default. + +You will note in the previous example that we used a variable \texttt{i} as +the argument to the \texttt{sqrt} function. We specified the values of the +\texttt{i} variable using a named argument to the \texttt{foreach} function. We +could have called that variable anything we wanted, for example, \texttt{a}, +or \texttt{b}. 
We could also specify other variables to be used in the R +expression, as in the following example: + +<>= +x <- foreach(a=1:3, b=rep(10, 3)) %do% (a + b) +x +@ + +Note that parentheses are needed here. We can also use braces: + +<>= +x <- foreach(a=1:3, b=rep(10, 3)) %do% { + a + b +} +x +@ + +We call \texttt{a} and \texttt{b} the {\em iteration variables}, since those are the variables that are changing during the multiple executions. Note that +we are iterating over them in parallel, that is, they are both changing +at the same time. In this case, the same number of values are being +specified for both iteration variables, but that need not be the case. +If we only supplied two values for \texttt{b}, the result would be a list of +length two, even if we specified a thousand values for \texttt{a}: + +<>= +x <- foreach(a=1:1000, b=rep(10, 2)) %do% { + a + b +} +x +@ + +Note that you can put multiple statements between the braces, and you +can use assignment statements to save intermediate values of +computations. However, if you use an assignment as a way of +communicating between the different executions of your loop, then your +code won't work correctly in parallel, which we will discuss later. + +\section{The \texttt{.combine} Option} + +So far, all of our examples have returned a list of results. This is a +good default, since a list can contain any R object. But sometimes +we'd like the results to be returned in a numeric vector, for example. +This can be done by using the \texttt{.combine} option to \texttt{foreach}: + +<>= +x <- foreach(i=1:3, .combine='c') %do% exp(i) +x +@ + +The result is returned as a numeric vector, because the standard R \texttt{c} +function is being used to concatenate all the results. Since the +\texttt{exp} function returns numeric values, concatenating them with +the \texttt{c} function will result in a numeric vector of length three. 
+ +What if the R expression returns a vector, and we want to combine those +vectors into a matrix? One way to do that is with the \texttt{cbind} function: + +<>= +x <- foreach(i=1:4, .combine='cbind') %do% rnorm(4) +x +@ + +This generates four vectors of four random numbers, and combines them by +column to produce a 4 by 4 matrix. + +We can also use the \texttt{"+"} or \texttt{"*"} functions to combine our results: + +<>= +x <- foreach(i=1:4, .combine='+') %do% rnorm(4) +x +@ + +You can also specify a user-written function to combine the results. +Here's an example that throws away the results: + +<>= +cfun <- function(a, b) NULL +x <- foreach(i=1:4, .combine='cfun') %do% rnorm(4) +x +@ + +Note that this \texttt{cfun} function takes two arguments. The +\texttt{foreach} function knows that the functions \texttt{c}, +\texttt{cbind}, and \texttt{rbind} take many arguments, and +will call them with up to 100 arguments (by default) in order to improve +performance. But if any +other function is specified (such as \texttt{"+"}), it assumes that it only +takes two arguments. If the function does allow many arguments, you can +specify that using the \texttt{.multicombine} argument: + +<>= +cfun <- function(...) NULL +x <- foreach(i=1:4, .combine='cfun', .multicombine=TRUE) %do% rnorm(4) +x +@ + +If you want the combine function to be called with no more than 10 +arguments, you can specify that using the \texttt{.maxcombine} option: + +<>= +cfun <- function(...) NULL +x <- foreach(i=1:4, .combine='cfun', .multicombine=TRUE, .maxcombine=10) %do% rnorm(4) +x +@ + +The \texttt{.inorder} option is used to specify whether the order in which the +arguments are combined is important. The default value is +\texttt{TRUE}, but if the combine function is \texttt{"+"}, you could specify +\texttt{.inorder} to be \texttt{FALSE}. Actually, this option is important +only when executing the R expression in parallel, since results are always +computed in order when running sequentially. 
This is not necessarily true when +executing in parallel, however. In fact, if the expressions take very +different lengths of time to execute, the results could be returned in +any order. Here's a contrived example, that executes the tasks in +parallel to demonstrate the difference. The example uses the +\texttt{Sys.sleep} function +to cause the earlier tasks to take longer to execute: + +<>= +foreach(i=4:1, .combine='c') %dopar% { + Sys.sleep(3 * i) + i +} +foreach(i=4:1, .combine='c', .inorder=FALSE) %dopar% { + Sys.sleep(3 * i) + i +} +@ + +The result of the first of these two examples is guaranteed to be the +vector c(4, 3, 2, 1). The second example will return the same values, +but they will probably be in a different order. + +\section{Iterators} + +The values for the iteration variables don't have to be specified with +only vectors or lists. They can be specified with an {\em iterator}, many +of which come with the \texttt{iterators} package. An iterator is an +abstract source of data. A vector isn't itself an iterator, but the +\texttt{foreach} function automatically creates an iterator from a +vector, list, matrix, or data frame, for example. You can also create +an iterator from a file or a database query, which are natural sources +of data. The \texttt{iterators} package supplies a function called +\texttt{irnorm} which can return a specified number of random numbers +for each time it is called. For example: + +<>= +library(iterators) +x <- foreach(a=irnorm(4, count=4), .combine='cbind') %do% a +x +@ + +This becomes useful when dealing with large amounts of data. Iterators +allow the data to be generated on-the-fly, as it is needed by your +operations, rather than requiring all of the data to be generated at the +beginning.
+ +For example, let's say that we want to sum together a thousand random +vectors: + +<>= +set.seed(123) +x <- foreach(a=irnorm(4, count=1000), .combine='+') %do% a +x +@ + +This uses very little memory, since it is equivalent to the following +\texttt{while} loop: + +<>= +set.seed(123) +x <- numeric(4) +i <- 0 +while (i < 1000) { + x <- x + rnorm(4) + i <- i + 1 +} +x +@ + +This could have been done using the \texttt{icount} function, which +generates the values from one to 1000: +<>= +set.seed(123) +x <- foreach(icount(1000), .combine='+') %do% rnorm(4) +x +@ + +but sometimes it's preferable to generate the actual data with the +iterator (as we'll see later when we execute in parallel). + +In addition to introducing the \texttt{icount} function from the +\texttt{iterators} package, the last example also used an unnamed +argument to the \texttt{foreach} function. This can be useful when +we're not intending to generate variable values, but only controlling +the number of times that the R expression is executed. + +There's a lot more that I could say about iterators, but for now, +let's move on to parallel execution. + +\section{Parallel Execution} + +Although \texttt{foreach} can be a useful construct in its own right, +the real point of the \texttt{foreach} package is to do parallel computing. +To make any of the previous examples run in parallel, all you have to do +is to replace \texttt{\%do\%} with \texttt{\%dopar\%}. But for the +kinds of quick running operations that we've been doing, there wouldn't +be much point to executing them in parallel. Running many tiny tasks +in parallel will usually take more time to execute than running them +sequentially, and if it already runs fast, there's no motivation to make +it run faster anyway. But if the operation that we're executing in +parallel takes a minute or longer, there starts to be some motivation. 
+ +\subsection{Parallel Random Forest} + +Let's take random forest as an example of an operation that can take +a while to execute. Let's say our inputs are the matrix \texttt{x}, and the +factor \texttt{y}: + +<>= +x <- matrix(runif(500), 100) +y <- gl(2, 50) +@ + +We've already loaded the \texttt{foreach} package, but we'll also need +to load the \texttt{randomForest} package: + +<>= +library(randomForest) +@ + +If we want to create a random forest model with 1000 trees, and +our computer has four cores in it, we can split up the problem into four +pieces by executing the \texttt{randomForest} function four times, with +the \texttt{ntree} argument set to 250. Of course, we have to combine +the resulting \texttt{randomForest} objects, but the +\texttt{randomForest} package comes with a function called +\texttt{combine} that does just that. + +Let's do that, but first, we'll do the work sequentially: + +<>= +rf <- foreach(ntree=rep(250, 4), .combine=combine) %do% + randomForest(x, y, ntree=ntree) +rf +@ + +To run this in parallel, we need to change \texttt{\%do\%} to \texttt{\%dopar\%}, but we also need to +use another \texttt{foreach} option called \texttt{.packages} to tell +the \texttt{foreach} package that the R expression needs to have the +\texttt{randomForest} package loaded in order to execute successfully. +Here's the parallel version: + +<>= +rf <- foreach(ntree=rep(250, 4), .combine=combine, .packages='randomForest') %dopar% + randomForest(x, y, ntree=ntree) +rf +@ + +If you've done any parallel computing, particularly on a cluster, you +may wonder why I didn't have to do anything special to handle \texttt{x} and +\texttt{y}. The reason is that the \texttt{\%dopar\%} function noticed that +those variables were referenced, and that they were defined in the current +environment. In that case \texttt{\%dopar\%} will automatically export +them to the parallel execution workers once, and use them for all of the +expression evaluations for that \texttt{foreach} execution.
That is +true for functions that are defined in the current environment as well, +but in this case, the function is defined in a package, so we had to +specify the package to load with the \texttt{.packages} option instead. + +\subsection{Parallel Apply} + +Now let's take a look at how to make a parallel version of the standard +R \texttt{apply} function. The \texttt{apply} function is written in R, +and although it's only about 100 lines of code, it's a bit difficult to +understand on a first reading. However, it all really comes down to two +\texttt{for} loops, the slightly more complicated of which looks like: + +<>= +applyKernel <- function(newX, FUN, d2, d.call, dn.call=NULL, ...) { + ans <- vector("list", d2) + for(i in 1:d2) { + tmp <- FUN(array(newX[,i], d.call, dn.call), ...) + if(!is.null(tmp)) ans[[i]] <- tmp + } + ans +} +applyKernel(matrix(1:16, 4), mean, 4, 4) +@ + +I've turned this into a function, because otherwise, R will complain +that I'm using ``...'' in an invalid context. + +This could be executed using \texttt{foreach} as follows: + +<>= +applyKernel <- function(newX, FUN, d2, d.call, dn.call=NULL, ...) { + foreach(i=1:d2) %dopar% + FUN(array(newX[,i], d.call, dn.call), ...) +} +applyKernel(matrix(1:16, 4), mean, 4, 4) +@ + +But this approach will cause the entire \texttt{newX} array to be sent +to each of the parallel execution workers. Since each task needs only +one column of the array, we'd like to avoid this extra data +communication. + +One way to solve this problem is to use an iterator that iterates over +the matrix by column: + +<>= +applyKernel <- function(newX, FUN, d2, d.call, dn.call=NULL, ...) { + foreach(x=iter(newX, by='col')) %dopar% + FUN(array(x, d.call, dn.call), ...) +} +applyKernel(matrix(1:16, 4), mean, 4, 4) +@ + +Now we're only sending any given column of the matrix to one parallel +execution worker. But it would be even more efficient if we sent the +matrix in bigger chunks.
To do that, we use a function called +\texttt{iblkcol} that returns an iterator that will return multiple columns +of the original matrix. That means that the R expression will need to +execute the user's function once for every column in its submatrix. + +<>= +iblkcol <- function(a, chunks) { + n <- ncol(a) + i <- 1 + + nextElem <- function() { + if (chunks <= 0 || n <= 0) stop('StopIteration') + m <- ceiling(n / chunks) + r <- seq(i, length=m) + i <<- i + m + n <<- n - m + chunks <<- chunks - 1 + a[,r, drop=FALSE] + } + + structure(list(nextElem=nextElem), class=c('iblkcol', 'iter')) +} +nextElem.iblkcol <- function(obj) obj$nextElem() +@ + +<>= +applyKernel <- function(newX, FUN, d2, d.call, dn.call=NULL, ...) { + foreach(x=iblkcol(newX, 3), .combine='c', .packages='foreach') %dopar% { + foreach(i=1:ncol(x)) %do% FUN(array(x[,i], d.call, dn.call), ...) + } +} +applyKernel(matrix(1:16, 4), mean, 4, 4) +@ + +Note the use of the \texttt{\%do\%} inside the \texttt{\%dopar\%} to +call the function on the columns of the submatrix \texttt{x}. Now that +we're using \texttt{\%do\%} again, it makes sense for the iterator to be +an index into the matrix \texttt{x}, since \texttt{\%do\%} doesn't need to +copy \texttt{x} the way that \texttt{\%dopar\%} does. + +\section{List Comprehensions} + +If you're familiar with the Python programming language, it may have +occurred to you that the \texttt{foreach} package provides something +that is not too different from Python's {\em list comprehensions}. +In fact, the \texttt{foreach} package also includes a function called +\texttt{when} which can prevent some of the evaluations from happening, +very much like the ``if'' clause in Python's list comprehensions.
+For example, you could filter out negative values of an iterator using +\texttt{when} as follows: + +<>= +x <- foreach(a=irnorm(1, count=10), .combine='c') %:% when(a >= 0) %do% sqrt(a) +x +@ + +I won't say much on this topic, but I can't help showing how +\texttt{foreach} with \texttt{when} can be used to write a simple quick +sort function, in the classic Haskell fashion: + +<>= +qsort <- function(x) { + n <- length(x) + if (n == 0) { + x + } else { + p <- sample(n, 1) + smaller <- foreach(y=x[-p], .combine=c) %:% when(y <= x[p]) %do% y + larger <- foreach(y=x[-p], .combine=c) %:% when(y > x[p]) %do% y + c(qsort(smaller), x[p], qsort(larger)) + } +} + +qsort(runif(12)) +@ + +Not that I recommend this over the standard R \texttt{sort} function. +But it's a pretty interesting example use of \texttt{foreach}. + +\section{Conclusion} + +Much of parallel computing comes down to doing three things: splitting the +problem into pieces, executing the pieces in parallel, and combining the +results back together. Using the \texttt{foreach} package, the +iterators help you to split the problem into pieces, the +\texttt{\%dopar\%} function executes the pieces in parallel, and the +specified \texttt{.combine} function puts the results back together. +We've demonstrated how simple things can be done in parallel quite +easily using the \texttt{foreach} package, and given some ideas about +how more complex problems can be solved. But it's a fairly new package, +and we will continue to work on ways of making it a more powerful system +for doing parallel computing.
+ +\end{document} diff -Nru r-cran-foreach-1.3.2/vignettes/nested.Rnw r-cran-foreach-1.4.0/vignettes/nested.Rnw --- r-cran-foreach-1.3.2/vignettes/nested.Rnw 1970-01-01 00:00:00.000000000 +0000 +++ r-cran-foreach-1.4.0/vignettes/nested.Rnw 2012-04-11 18:07:29.000000000 +0000 @@ -0,0 +1,361 @@ +% \VignetteIndexEntry{Nesting Foreach Loops} +% \VignetteDepends{foreach} +% \VignettePackage{foreach} +\documentclass[12pt]{article} +\usepackage{amsmath} +\usepackage[pdftex]{graphicx} +\usepackage{color} +\usepackage{xspace} +\usepackage{fancyvrb} +\usepackage{fancyhdr} + \usepackage[ + colorlinks=true, + linkcolor=blue, + citecolor=blue, + urlcolor=blue] + {hyperref} + \usepackage{lscape} + +\usepackage{Sweave} +\usepackage{float} + +\floatstyle{plain} +\newfloat{example}{thp}{lop} +\floatname{example}{Example} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +% define new colors for use +\definecolor{darkgreen}{rgb}{0,0.6,0} +\definecolor{darkred}{rgb}{0.6,0.0,0} +\definecolor{lightbrown}{rgb}{1,0.9,0.8} +\definecolor{brown}{rgb}{0.6,0.3,0.3} +\definecolor{darkblue}{rgb}{0,0,0.8} +\definecolor{darkmagenta}{rgb}{0.5,0,0.5} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +\newcommand{\bld}[1]{\mbox{\boldmath $#1$}} +\newcommand{\shell}[1]{\mbox{$#1$}} +\renewcommand{\vec}[1]{\mbox{\bf {#1}}} + +\newcommand{\ReallySmallSpacing}{\renewcommand{\baselinestretch}{.6}\Large\normalsize} +\newcommand{\SmallSpacing}{\renewcommand{\baselinestretch}{1.1}\Large\normalsize} + +\newcommand{\halfs}{\frac{1}{2}} + +\setlength{\oddsidemargin}{-.25 truein} +\setlength{\evensidemargin}{0truein} +\setlength{\topmargin}{-0.2truein} +\setlength{\textwidth}{7 truein} +\setlength{\textheight}{8.5 truein} +\setlength{\parindent}{0.20truein} +\setlength{\parskip}{0.10truein} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\pagestyle{fancy} +\lhead{} +\chead{Nesting {\tt Foreach} Loops} +\rhead{} +\lfoot{} +\cfoot{} 
+\rfoot{\thepage} +\renewcommand{\headrulewidth}{1pt} +\renewcommand{\footrulewidth}{1pt} +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +\title{Nesting {\tt Foreach} Loops} +\author{Steve Weston \\ doc@revolutionanalytics.com} + + +\begin{document} + +\maketitle + +\thispagestyle{empty} + +\section{Introduction} + +<>= +library(foreach) +registerDoSEQ() +@ + +The \texttt{foreach} package provides a looping construct for executing +R code repeatedly. It is similar to the standard \texttt{for} loop, +which makes it easy to convert a \texttt{for} loop to a \texttt{foreach} +loop. Unlike many parallel programming packages for R, \texttt{foreach} +doesn't require the body of the \texttt{for} loop to be turned into a +function. \texttt{foreach} differs from a \texttt{for} loop in that its +return is a list of values, whereas a \texttt{for} loop has no value and +uses side effects to convey its result. Because of this, +\texttt{foreach} loops have a few advantages over \texttt{for} loops +when the purpose of the loop is to create a data structure such as a +vector, list, or matrix: First, there is less code duplication, and +hence, less chance for an error because the initialization of the vector +or matrix is unnecessary. Second, a \texttt{foreach} loop may be easily +parallelized by changing only a single keyword. + +\section{The nesting operator: \%:\%} + +An important feature of \texttt{foreach} is the \texttt{\%:\%} operator. +I call this the {\em nesting} operator because it is used to create +nested \texttt{foreach} loops. Like the \texttt{\%do\%} and +\texttt{\%dopar\%} operators, it is a binary operator, but it operates +on two \texttt{foreach} objects. It also returns a \texttt{foreach} +object, which is essentially a special merger of its operands. 
+ +Let's say that we want to perform a Monte Carlo simulation using a +function called \texttt{sim}.\footnote{Remember that \texttt{sim} needs +to be rather compute intensive to be worth executing in parallel.} The +\texttt{sim} function takes two arguments, and we want to call it with +all combinations of the values that are stored in the vectors +\texttt{avec} and \texttt{bvec}. The following doubly-nested +\texttt{for} loop does that. For testing purposes, the \texttt{sim} +function is defined to return $10 a + b$:\footnote{Of course, an +operation this trivial is not worth executing in parallel.} + +<>= +sim <- function(a, b) 10 * a + b +avec <- 1:2 +bvec <- 1:4 +@ + +<>= +x <- matrix(0, length(avec), length(bvec)) +for (j in 1:length(bvec)) { + for (i in 1:length(avec)) { + x[i,j] <- sim(avec[i], bvec[j]) + } +} +x +@ + +In this case, it makes sense to store the results in a matrix, so we +create one of the proper size called \texttt{x}, and assign the return +value of \texttt{sim} to the appropriate element of \texttt{x} each time +through the inner loop. + +When using \texttt{foreach}, we don't create a matrix and assign values into +it. Instead, the inner loop returns the columns of the result matrix as +vectors, which are combined in the outer loop into a matrix. +Here's how to do that using the \texttt{\%:\%} operator:\footnote{Due to +operator precedence, you cannot put braces around the inner +\texttt{foreach} loop. Unfortunately, that causes Sweave to format this +example rather badly, in my opinion.} + +<>= +x <- + foreach(b=bvec, .combine='cbind') %:% + foreach(a=avec, .combine='c') %do% { + sim(a, b) + } +x +@ + +This is structured very much like the nested \texttt{for} loop. +The outer \texttt{foreach} is iterating over the values in ``bvec'', +passing them to the inner \texttt{foreach}, which iterates over the +values in ``avec'' for each value of ``bvec''. Thus, the ``sim'' +function is called in the same way in both cases. 
The code is slightly +cleaner in this version, and has the advantage of being easily parallelized. + +\section{Using \texttt{\%:\%} with \texttt{\%dopar\%}} + +When parallelizing nested \texttt{for} loops, there is always a question +of which loop to parallelize. The standard advice is to parallelize the +outer loop. This results in larger individual tasks, and larger tasks +can often be performed more efficiently than smaller tasks. However, if +the outer loop doesn't have many iterations and the tasks are already +large, parallelizing the outer loop results in a small number of huge +tasks, which may not allow you to use all of your processors, and can +also result in load balancing problems. You could parallelize an inner +loop instead, but that could be inefficient because you're repeatedly +waiting for all the results to be returned every time through the outer +loop. And if the tasks and number of iterations vary in size, then it's +really hard to know which loop to parallelize. + +But in our Monte Carlo example, all of the tasks are completely +independent of each other, and so they can all be executed in parallel. +You really want to think of the loops as specifying a single stream of +tasks. You just need to be careful to process all of the results +correctly, depending on which iteration of the inner loop they came +from. + +That is exactly what the \texttt{\%:\%} operator does: it turns multiple +\texttt{foreach} loops into a single loop. That is why there is only +one \texttt{\%do\%} operator in the example above. 
And when we +parallelize that nested \texttt{foreach} loop by changing the +\texttt{\%do\%} into a \texttt{\%dopar\%}, we are creating a single +stream of tasks that can all be executed in parallel: + +<>= +x <- + foreach(b=bvec, .combine='cbind') %:% + foreach(a=avec, .combine='c') %dopar% { + sim(a, b) + } +x +@ + +Of course, we'll actually only run as many tasks in parallel as we have +processors, but the parallel backend takes care of all that. The point +is that the \texttt{\%:\%} operator makes it easy to specify the stream +of tasks to be executed, and the \texttt{.combine} argument to +\texttt{foreach} allows us to specify how the results should be processed. +The backend handles executing the tasks in parallel. + +\section{Chunking tasks} + +Of course, there has to be a snag to this somewhere. What if the tasks +are quite small, so that you really might want to execute the entire +inner loop as a single task? Well, small tasks are a problem even for a +singly-nested loop. The solution to this problem, whether you have a +single loop or nested loops, is to use {\em task chunking}. + +Task chunking allows you to send multiple tasks to the workers at once. +This can be much more efficient, especially for short tasks. Currently, +only the \texttt{doNWS} backend supports task +chunking. Here's how it's done with \texttt{doNWS}: + +<>= +opts <- list(chunkSize=2) +x <- + foreach(b=bvec, .combine='cbind', .options.nws=opts) %:% + foreach(a=avec, .combine='c') %dopar% { + sim(a, b) + } +x +@ + +If you're not using \texttt{doNWS}, then this argument is ignored, which +allows you to write code that is backend-independent. You can also +specify options for multiple backends, and only the option list that +matches the registered backend will be used. + +It would be nice if the chunk size could be picked automatically, but I +haven't figured out a good, safe way to do that. 
So for now, you need +to specify the chunk size manually.\footnote{In the future, the backend +might decide that it will execute the tasks in parallel. That +could be very useful when running on a cluster with multiprocessor +nodes. Multiple tasks are sent across the network to each node, which +then executes them in parallel on its cores. Maybe in the next +release...} + +The point is that by using the \texttt{\%:\%} operator, you can convert +a nested \texttt{for} loop to a nested \texttt{foreach} loop, use +\texttt{\%dopar\%} to run in parallel, and then tune the size of the +tasks using the ``chunkSize'' option so that they are big enough to be +executed efficiently, but not so big that they cause load balancing +problems. You don't have to worry about which loop to parallelize, +because you're turning the nested loops into a single stream of tasks +that can all be executed in parallel by the parallel backend. + +\section{Another example} + +Now let's imagine that the ``sim'' function returns an object that +includes an error estimate. We want to return the result with the +lowest error for each value of b, along with the arguments that +generated that result. Here's how that might be done with nested +\texttt{for} loops: + +<>= +sim <- function(a, b) { + x <- 10 * a + b + err <- abs(a - b) + list(x=x, err=err) +} +@ + +<>= +n <- length(bvec) +d <- data.frame(x=numeric(n), a=numeric(n), b=numeric(n), err=numeric(n)) + +for (j in 1:n) { + err <- Inf + best <- NULL + for (i in 1:length(avec)) { + obj <- sim(avec[i], bvec[j]) + if (obj$err < err) { + err <- obj$err + best <- data.frame(x=obj$x, a=avec[i], b=bvec[j], err=obj$err) + } + } + d[j,] <- best +} +d +@ + +This is also quite simple to convert to \texttt{foreach}. We just need +to supply the appropriate ``.combine'' functions. For the outer +\texttt{foreach}, we can use the standard ``rbind'' function which can +be used with data frames.
For the inner \texttt{foreach}, we write a +function that compares two data frames, each with a single row, +returning the one with a smaller error estimate: + +<>= +comb <- function(d1, d2) if (d1$err < d2$err) d1 else d2 +@ + +Now we specify it with the ``.combine'' argument to the inner +\texttt{foreach}: + +<>= +opts <- list(chunkSize=2) +d <- + foreach(b=bvec, .combine='rbind', .options.nws=opts) %:% + foreach(a=avec, .combine='comb', .inorder=FALSE) %dopar% { + obj <- sim(a, b) + data.frame(x=obj$x, a=a, b=b, err=obj$err) + } +d +@ + +Note that since the order of the arguments to the ``comb'' function is +unimportant, I have set the ``.inorder'' argument to \texttt{FALSE}. +This reduces the number of results that need to be saved on the master +before they can be combined in case they are returned out of order. +But even with niceties such as parallelization, backend-specific +options, and the ``.inorder'' argument, the nested \texttt{foreach} +version is quite readable. + +But what if we would like to return the indices into ``avec'' and +``bvec'', rather than the data itself? A simple way to do that is to +create a couple of counting iterators that we pass to the +\texttt{foreach} functions:\footnote{It is very important that the call +to icount is passed as the argument to \texttt{foreach}. If the +iterators were created and passed to \texttt{foreach} using a variable, +for example, we would not get the desired effect. This is not a bug or +a limitation, but an important aspect of the design of the +\texttt{foreach} function.} + +<>= +library(iterators) +opts <- list(chunkSize=2) +d <- + foreach(b=bvec, j=icount(), .combine='rbind', .options.nws=opts) %:% + foreach(a=avec, i=icount(), .combine='comb', .inorder=FALSE) %dopar% { + obj <- sim(a, b) + data.frame(x=obj$x, i=i, j=j, err=obj$err) + } +d +@ + +These new iterators are infinite iterators, but that's no problem since +we have ``bvec'' and ``avec'' to control the number of iterations of +the loops. 
Making them infinite means we don't have to keep them in +sync with ``bvec'' and ``avec''. + +\section{Conclusion} + +Nested \texttt{for} loops are a common construct, and are often the most +time consuming part of R scripts, so they are prime candidates for +parallelization. The usual approach is to parallelize the outer loop, +but as we've seen, that can lead to suboptimal performance due to an +imbalance between the size and the number of tasks. By using +the \texttt{\%:\%} operator with \texttt{foreach}, and by using chunking +techniques, many of these problems can be overcome. The resulting code +is often clearer and more readable than the original R code, since +\texttt{foreach} was designed to deal with exactly this kind of problem. + +\end{document}