data.table references

Simon

data.table is considered to be the best performing and most fully featured package for working with data in R. However in many instances the syntax may not be as easy to follow compared to dplyr. This document is a rough collection of data.table references.

Overall commentary

Learning data.table

data.table and dplyr

using the packages together

Memory considerations

Create a data.table

# define a data.table like data.frame
DT <- data.table(letters = c("a","b","a","b"), numbers = 1:4)

# alternatively convert data.frame to data.table
DF <- data.frame(letters = c("a","b","a","b"), numbers = 1:4)
DT <- setDT(DF)

Row and column operations

Subset rows

# return data.table
DT[c(1,4)]
DT[letters == "a"]

Subset columns

# return vector
DT$numbers
DT[[1]]
DT[["numbers"]]

# return data.table
DT[, c(2)]
DT[, .(numbers)]
DT[, c("numbers")]
temp_cols <- "numbers"; DT[, ..temp_cols]; rm(temp_cols)
temp_cols <- "numbers"; DT[, temp_cols, with = FALSE]; rm(temp_cols)

References

Create / modify new columns

# creating 1 new column
DT[, numbersx2 := numbers*2]
temp_cols <- "numbersx3"; DT[, (temp_cols) := numbers*3]; rm(temp_cols)

# creating multiple columns
DT[, `:=` (numbersx4 = numbers*4, numbersx5 = numbers*5)]
DT[, c("numbersx6", "numbersx7") := .(numbers*6, numbers*7)]

# Deleting a column
DT[ , numbersx7 := NULL]

References

Aggregations

# table aggregates
DT[, .(count = .N, numbers = sum(numbers), meanx2 = mean(numbersx2))]
##    count numbers meanx2
## 1:     4      10      5
# aggregate by letters
# note using keyby instead of by will also sort by groups
# .(letters) is the same as c("letters")
DT[, .(count = .N, numbers = sum(numbers)), by = c("letters")]
##    letters count numbers
## 1:       a     2       4
## 2:       b     2       6
# SD is the (S)ubset of (D)ata excluding group columns where SDcols must be a character vector
DT[, lapply(.SD, sum), by = .(letters), .SDcols = c("numbers")]
##    letters numbers
## 1:       a       4
## 2:       b       6
# Use grep to find all fields that contain "numbersx"
DT[, lapply(.SD, sum), by = .(letters), .SDcols = grep("numbersx", names(DT), value = TRUE)]
##    letters numbersx2 numbersx3 numbersx4 numbersx5 numbersx6
## 1:       a         8        12        16        20        24
## 2:       b        12        18        24        30        36

References

Sorting data

setorder(DT, letters)
# Note that queries like x[order(.)] are optimised internally to use data.table's fast order.
DT[order(letters, -numbers)]
##    letters numbers numbersx2 numbersx3 numbersx4 numbersx5 numbersx6
## 1:       a       3         6         9        12        15        18
## 2:       a       1         2         3         4         5         6
## 3:       b       4         8        12        16        20        24
## 4:       b       2         4         6         8        10        12

Joining tables

Left joins

X <- data.table(letters = c("a","b","c","d","e"), numbers1 = 11:15, numbers2 = 21:25)
Y <- data.table(letters = c("c","d","e"), numbers2 = 207:209, numbers3 = 307:309, numbers4 = 407:409)

Left outer join

# note that X.numbers1 becomes i.numbers1
Y[X, on = .(letters = letters)]
##    letters numbers2 numbers3 numbers4 numbers1 i.numbers2
## 1:       a       NA       NA       NA       11         21
## 2:       b       NA       NA       NA       12         22
## 3:       c      207      307      407       13         23
## 4:       d      208      308      408       14         24
## 5:       e      209      309      409       15         25
# matching records
Y[X, on = .(letters = letters), .N]
## [1] 5

Left update join

# i.numbers2 and i.numbers3 refers to Y.numbers2 and Y.numbers3
X[Y, on = .(letters = letters), c("numbers2","numbers3") := .(i.numbers2, i.numbers3)]
print(X)
##    letters numbers1 numbers2 numbers3
## 1:       a       11       21       NA
## 2:       b       12       22       NA
## 3:       c       13      207      307
## 4:       d       14      208      308
## 5:       e       15      209      309
# Include all columns in Y
temp_cols <- names(Y); X[Y, on = .(letters = letters), (temp_cols) := mget(paste0("i.", temp_cols))]; rm(temp_cols)
print(X)
##    letters numbers1 numbers2 numbers3 numbers4
## 1:       a       11       21       NA       NA
## 2:       b       12       22       NA       NA
## 3:       c       13      207      307      407
## 4:       d       14      208      308      408
## 5:       e       15      209      309      409

References

Non-equi joins

DT1 <- data.table(letters = c("a","a","b","b","c","d","e"), pos = c(2,3,3,12,20,52,10))
DT2 <- data.table(letters = c("a","b","c","d","e"), start = c(1,5,19,30,7), end = c(3,11,22,39,25))

# find all records where D1.letters = DT2.letters and DT2.start <= DT1.pos <= DT2.end
# x.pos refers to DT1.pos
DT1[DT2, .(letters, pos = x.pos, start, end), on = .(letters = letters, pos >= start, pos <= end), nomatch = 0L]
##    letters pos start end
## 1:       a   2     1   3
## 2:       a   3     1   3
## 3:       c  20    19  22
## 4:       e  10     7  25

References

First and last item by group

DT <- data.table(letters1 = rep(c("A","B"), times = 4),
                 letters2 = rep(c("A","A","B","B"), times = 2),
                 numbers = 1:8)[order(letters1, letters2, numbers)]                 

Method 1: Identify the first and last elements with unique

# First element
unique(DT, by = c("letters1", "letters2"), fromLast = FALSE)
##    letters1 letters2 numbers
## 1:        A        A       1
## 2:        A        B       3
## 3:        B        A       2
## 4:        B        B       4
# Last element
unique(DT, by = c("letters1", "letters2"), fromLast = TRUE)
##    letters1 letters2 numbers
## 1:        A        A       5
## 2:        A        B       7
## 3:        B        A       6
## 4:        B        B       8

Method 2: Merge the elements together mult option

uDT <- unique(DT)
DT[uDT, on = .(letters1 = letters1, letters2 = letters2), first := 1L, mult = "first"]
DT[uDT, on = .(letters1 = letters1, letters2 = letters2), last := 1L, mult = "last"]
print(DT)
##    letters1 letters2 numbers first last
## 1:        A        A       1     1   NA
## 2:        A        A       5    NA    1
## 3:        A        B       3     1   NA
## 4:        A        B       7    NA    1
## 5:        B        A       2     1   NA
## 6:        B        A       6    NA    1
## 7:        B        B       4     1   NA
## 8:        B        B       8    NA    1

References

Other commands

Cubes

Panel graphics

# works with R Base graphics and some glitches with Rmarkdown
# not really recommended
par(mfrow = c(rows, columns))
DT[, plot(x, y), keyby = ...]

Previous row value by group

DT <- data.table(letters = c("a","a","b","b"), numbers = 1:4)
DT[ , B := shift(numbers, 1L, type="lag"), by = .(letters)]
print(DT)
##    letters numbers  B
## 1:       a       1 NA
## 2:       a       2  1
## 3:       b       3 NA
## 4:       b       4  3

References

Row numbers

DT <- data.table(letters1 = rep(c("A","B"), times = 4),
                 letters2 = rep(c("A","A","B","B"), times = 2),
                 numbers = 1:8)[order(letters1, letters2, numbers)]
# row number
DT[, ID := .I]
# group number
DT[ , GRPID := .GRP, by = .(letters1, letters2)]
# row number within group
DT[, byID := rowid(letters1, letters2)]
print(DT)
##    letters1 letters2 numbers ID GRPID byID
## 1:        A        A       1  1     1    1
## 2:        A        A       5  2     1    2
## 3:        A        B       3  3     2    1
## 4:        A        B       7  4     2    2
## 5:        B        A       2  5     3    1
## 6:        B        A       6  6     3    2
## 7:        B        B       4  7     4    1
## 8:        B        B       8  8     4    2

Useful functions

Reference