Test classic tokens v. hashed tokens

This tests the performance of dfm construction from hashed tokens (created with tokens()) versus classic tokenized texts (created with tokenize()).

# Attach quanteda with library() so that a missing installation stops the
# script right here; require() would only return FALSE and the failure would
# surface later, at the first quanteda call.
library(quanteda, quietly = TRUE, warn.conflicts = FALSE)
## quanteda version 0.9.8.9017
# Load the SOTUCorpus data set from the quantedaData package.
data(SOTUCorpus, package = "quantedaData")
# Tokenize the corpus once with each API so the benchmarks below can reuse the
# results: tokenize() gives the "classic" object, tokens() the "hashed" one.
toks <- tokenize(SOTUCorpus)
toksh <- tokens(SOTUCorpus)

When already tokenized:

# Time dfm() alone, on objects that were tokenized beforehand (toksh, toks).
# Each expression is run 20 times; timings are printed in relative units.
microbenchmark::microbenchmark(
    hashed = dfm(toksh, verbose = FALSE),
    classic = dfm(toks, verbose = FALSE),
    times = 20,
    unit = "relative"
)
## Unit: relative
##     expr      min       lq     mean   median       uq      max neval
##   hashed 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000    20
##  classic 4.470103 4.383913 3.709468 4.335245 3.107245 3.122707    20

Including the tokenization step in the timing (as happens when dfm() is called on a character or corpus object):

# Time tokenization and dfm() together, starting from the corpus each time.
# Each expression is run 20 times; timings are printed in relative units.
microbenchmark::microbenchmark(
    hashed = dfm(tokens(SOTUCorpus), verbose = FALSE),
    classic = dfm(tokenize(SOTUCorpus), verbose = FALSE),
    times = 20,
    unit = "relative"
)
## Unit: relative
##     expr      min       lq     mean   median       uq      max neval
##   hashed 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000    20
##  classic 1.239813 1.217851 1.217632 1.229421 1.217117 1.146923    20

Test i, j, x sparseMatrix v. i, p, x

Not much difference, but ipx() could be taking longer because of the transpose operation.

# Build a document-feature matrix from tokenized texts using the triplet
# (i, j, x) form of Matrix::sparseMatrix().
#
# Args:
#   x: a list-like object with one vector of tokens per element (one element
#      per document), such as the result of tokenize() used in this file.
#      names(x), if any, become the document names.
#
# Returns:
#   A "dfmSparse" object with one row per element of x and one column per
#   unique token, in order of first appearance. Every token occurrence
#   contributes an entry of 1; sparseMatrix() sums repeated (i, j) pairs.
ijx <- function(x) {
    # row index: the document number, repeated once per token it contains
    nTokens <- lengths(x)
    i <- rep(seq_along(nTokens), nTokens)
    # column index: position of each token among the unique features
    allFeatures <- unlist(x)
    uniqueFeatures <- unique(allFeatures)
    j <- match(allFeatures, uniqueFeatures)

    # Pass dims explicitly. Otherwise sparseMatrix() infers the number of rows
    # from max(i), which falls short of length(x) whenever the trailing
    # document(s) contain no tokens: that either fails the dimnames length
    # check or, for unnamed input, silently drops those documents.
    new("dfmSparse", Matrix::sparseMatrix(i = i, j = j, x = 1L,
                                  dims = c(length(x), length(uniqueFeatures)),
                                  dimnames = list(docs = names(x),
                                                  features = uniqueFeatures)))
}

# Build a document-feature matrix from tokenized texts using the compressed
# column (i, p, x) form of Matrix::sparseMatrix().
#
# Args:
#   x: a list-like object with one vector of tokens per element (one element
#      per document), such as the result of tokenize() used in this file.
#      names(x), if any, become the document names.
#
# Returns:
#   A "dfmSparse" object with documents as rows and unique tokens as columns,
#   obtained by transposing a features-by-documents sparse matrix.
ipx <- function(x) {
    # row index (before transposing): each token's position in the vocabulary
    tokenStream <- unlist(x)
    vocabulary <- unique(tokenStream)
    featureIndex <- match(tokenStream, vocabulary)
    # zero-based column pointers: running total of token counts per document,
    # starting at 0, so each document occupies one column
    colPointers <- cumsum(c(0, ntoken(x)))

    # assemble as features x documents first, then flip to documents x features
    featuresByDocs <- Matrix::sparseMatrix(i = featureIndex, p = colPointers,
                                           x = 1L,
                                           dimnames = list(features = vocabulary,
                                                           docs = names(x)))
    new("dfmSparse", t(featuresByDocs))
}

# Compare the two sparse-matrix construction strategies on the classic
# tokenized texts. Each expression is run 50 times; timings are printed in
# relative units.
microbenchmark::microbenchmark(
    ijx(toks),
    ipx(toks),
    times = 50,
    unit = "relative"
)
## Unit: relative
##       expr      min       lq    mean   median      uq      max neval
##  ijx(toks) 1.000000 1.000000 1.00000 1.000000 1.00000 1.000000    50
##  ipx(toks) 1.025118 1.024159 1.04169 1.007245 1.08364 1.604885    50