Data.Table Functions

library(data.table)
library(pastecs)

## 
## Attaching package: 'pastecs'

## The following objects are masked from 'package:data.table':
## 
##     first, last

Create a data table.

DT <- data.table(ID = 1:50, 
                 Capacity = sample(100:100, size = 50, replace = F),
                 Code = sample(LETTERS[1:4], 50, replace = T),
                 State = rep(c("Alabama", "Indiana", "Texas", "Nevada"), 50))
head(DT, 3)

##    ID Capacity Code   State
## 1:  1       38    A Alabama
## 2:  2       74    B Indiana
## 3:  3       36    B   Texas

Find State that’s Code “C” and get the sum of capacity. Group by State.

DT[Code == "C", sum(Capacity), State]

##      State  V1
## 1:   Texas 384
## 2:  Nevada 862
## 3: Indiana 862
## 4: Alabama 384

Find the average capacity per state.

DT[, mean(Capacity), by=State]

##      State    V1
## 1: Alabama 52.36
## 2: Indiana 55.96
## 3:   Texas 52.36
## 4:  Nevada 55.96

Find the average capacity of all states coded as “A”.

DT[Code=="A", mean(Capacity)]

## [1] 51.90909

Use fread() to load data faster.

dt <- fread("GB_full.csv")

Check data.

tail(dt, 6)

##    V1      V2                  V3       V4  V5       V6 V7
## 1: GB ZE3 9JT Shetland South Ward Scotland SCT Shetland   
## 2: GB ZE3 9JU Shetland South Ward Scotland SCT Shetland   
## 3: GB ZE3 9JW Shetland South Ward Scotland SCT Shetland   
## 4: GB ZE3 9JX Shetland South Ward Scotland SCT Shetland   
## 5: GB ZE3 9JY Shetland South Ward Scotland SCT Shetland   
## 6: GB ZE3 9JZ Shetland South Ward Scotland SCT Shetland   
##                  V8        V9      V10       V11 V12
## 1: Shetland Islands S12000027 59.87262 -1.306772   6
## 2: Shetland Islands S12000027 59.88954 -1.307206   6
## 3: Shetland Islands S12000027 59.87365 -1.305697   6
## 4: Shetland Islands S12000027 59.87529 -1.307502   6
## 5: Shetland Islands S12000027 59.89157 -1.313847   6
## 6: Shetland Islands S12000027 59.89239 -1.310899   6

Subset rows that meet conditions on two different columns.

sub_rows <- dt[V4 == "England" & V3 == "Beswick"]
sub_rows

##    V1   V2      V3      V4  V5                       V6       V7 V8 V9 V10
## 1: GB YO25 Beswick England ENG East Riding of Yorkshire 11609011        NA
##    V11 V12
## 1:  NA   4

Get all rows but only 3 columns. Take note of the “.”. If .() is used, the returned value is a data.table. If .() is not used, the result is a vector

sub_columns <- dt[,.(V2, V3,V4)]
head(sub_columns)

##          V2       V3       V4
## 1:     AB10 Aberdeen Scotland
## 2: AB10 1AB Aberdeen Scotland
## 3: AB10 1AF Aberdeen Scotland
## 4: AB10 1AG Aberdeen Scotland
## 5: AB10 1AH Aberdeen Scotland
## 6: AB10 1AL Aberdeen Scotland

Sort by V3 in descending order

dt_order <- dt[order(-V3)]
head(dt_order,10)

##     V1       V2           V3      V4  V5              V6 V7
##  1: GB TS10 3AP Zetland Ward England ENG North Yorkshire   
##  2: GB TS10 3AR Zetland Ward England ENG North Yorkshire   
##  3: GB TS10 3AS Zetland Ward England ENG North Yorkshire   
##  4: GB TS10 3AT Zetland Ward England ENG North Yorkshire   
##  5: GB TS10 3AU Zetland Ward England ENG North Yorkshire   
##  6: GB TS10 3AX Zetland Ward England ENG North Yorkshire   
##  7: GB TS10 3AY Zetland Ward England ENG North Yorkshire   
##  8: GB TS10 3AZ Zetland Ward England ENG North Yorkshire   
##  9: GB TS10 3BA Zetland Ward England ENG North Yorkshire   
## 10: GB TS10 3BB Zetland Ward England ENG North Yorkshire   
##                           V8        V9      V10       V11 V12
##  1: Redcar and Cleveland (B) E06000003 54.61640 -1.055022   6
##  2: Redcar and Cleveland (B) E06000003 54.61688 -1.056931   6
##  3: Redcar and Cleveland (B) E06000003 54.61699 -1.057238   6
##  4: Redcar and Cleveland (B) E06000003 54.61546 -1.053681   6
##  5: Redcar and Cleveland (B) E06000003 54.61419 -1.053803   6
##  6: Redcar and Cleveland (B) E06000003 54.61315 -1.052883   6
##  7: Redcar and Cleveland (B) E06000003 54.61149 -1.051373   6
##  8: Redcar and Cleveland (B) E06000003 54.61133 -1.050897   6
##  9: Redcar and Cleveland (B) E06000003 54.61120 -1.050435   6
## 10: Redcar and Cleveland (B) E06000003 54.61555 -1.053989   6

dt[V8 == "Aberdeen City", V8 := "Abr Cty"]
head(dt)

##    V1       V2       V3       V4  V5               V6 V7      V8        V9
## 1: GB     AB10 Aberdeen Scotland SCT    Aberdeenshire                     
## 2: GB AB10 1AB Aberdeen Scotland SCT City of Aberdeen    Abr Cty S12000033
## 3: GB AB10 1AF Aberdeen Scotland SCT City of Aberdeen    Abr Cty S12000033
## 4: GB AB10 1AG Aberdeen Scotland SCT City of Aberdeen    Abr Cty S12000033
## 5: GB AB10 1AH Aberdeen Scotland SCT City of Aberdeen    Abr Cty S12000033
## 6: GB AB10 1AL Aberdeen Scotland SCT City of Aberdeen    Abr Cty S12000033
##         V10       V11 V12
## 1:       NA        NA   4
## 2: 57.14961 -2.096916   6
## 3: 57.14871 -2.097806   6
## 4: 57.14907 -2.096997   6
## 5: 57.14808 -2.094664   6
## 6: 57.14954 -2.095412   6

All rows, turn columns V6 and V7 into Null

dt[,c("V6", "V7") := NULL]
head(dt)

##    V1       V2       V3       V4  V5      V8        V9      V10       V11
## 1: GB     AB10 Aberdeen Scotland SCT                         NA        NA
## 2: GB AB10 1AB Aberdeen Scotland SCT Abr Cty S12000033 57.14961 -2.096916
## 3: GB AB10 1AF Aberdeen Scotland SCT Abr Cty S12000033 57.14871 -2.097806
## 4: GB AB10 1AG Aberdeen Scotland SCT Abr Cty S12000033 57.14907 -2.096997
## 5: GB AB10 1AH Aberdeen Scotland SCT Abr Cty S12000033 57.14808 -2.094664
## 6: GB AB10 1AL Aberdeen Scotland SCT Abr Cty S12000033 57.14954 -2.095412
##    V12
## 1:   4
## 2:   6
## 3:   6
## 4:   6
## 5:   6
## 6:   6

Create new column V-New by adding V10 and V11

dt[,V_New := V10 + V11]
head(dt)

##    V1       V2       V3       V4  V5      V8        V9      V10       V11
## 1: GB     AB10 Aberdeen Scotland SCT                         NA        NA
## 2: GB AB10 1AB Aberdeen Scotland SCT Abr Cty S12000033 57.14961 -2.096916
## 3: GB AB10 1AF Aberdeen Scotland SCT Abr Cty S12000033 57.14871 -2.097806
## 4: GB AB10 1AG Aberdeen Scotland SCT Abr Cty S12000033 57.14907 -2.096997
## 5: GB AB10 1AH Aberdeen Scotland SCT Abr Cty S12000033 57.14808 -2.094664
## 6: GB AB10 1AL Aberdeen Scotland SCT Abr Cty S12000033 57.14954 -2.095412
##    V12    V_New
## 1:   4       NA
## 2:   6 55.05269
## 3:   6 55.05090
## 4:   6 55.05207
## 5:   6 55.05342
## 6:   6 55.05413

cor.test(dt$V10, dt$V11)

## 
##  Pearson's product-moment correlation
## 
## data:  dt$V10 and dt$V11
## t = -565.78, df = 1693200, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.4000021 -0.3974686
## sample estimates:
##        cor 
## -0.3987361

library(pastecs)
attach(dt)
colnames(dt)

##  [1] "V1"    "V2"    "V3"    "V4"    "V5"    "V8"    "V9"    "V10"  
##  [9] "V11"   "V12"   "V_New"

Get summary stats

stat.desc(dt$V10, desc=F)

##      nbr.val     nbr.null       nbr.na          min          max 
## 1.693242e+06 0.000000e+00 2.743100e+04 4.989517e+01 6.080069e+01 
##        range          sum 
## 1.090552e+01 8.924279e+07

Move decimal places

options(scipen = 100)
options(digits=2)
stat.desc(dt$V10)

##       nbr.val      nbr.null        nbr.na           min           max 
##  1693242.0000        0.0000    27431.0000       49.8952       60.8007 
##         range           sum        median          mean       SE.mean 
##       10.9055 89242786.7103       52.4001       52.7053        0.0012 
##  CI.mean.0.95           var       std.dev      coef.var 
##        0.0024        2.5126        1.5851        0.0301

Filter Scotland and V12 is more than or equal to 4

subrows <- dt[V4 == "Scotland" & V12 >= 4]
head(subrows,5)

##    V1       V2       V3       V4  V5      V8        V9 V10  V11 V12 V_New
## 1: GB     AB10 Aberdeen Scotland SCT                    NA   NA   4    NA
## 2: GB AB10 1AB Aberdeen Scotland SCT Abr Cty S12000033  57 -2.1   6    55
## 3: GB AB10 1AF Aberdeen Scotland SCT Abr Cty S12000033  57 -2.1   6    55
## 4: GB AB10 1AG Aberdeen Scotland SCT Abr Cty S12000033  57 -2.1   6    55
## 5: GB AB10 1AH Aberdeen Scotland SCT Abr Cty S12000033  57 -2.1   6    55

Get the average of V12 grouped by country

dt[,.(average = mean(V12)), by=V4]

##                  V4 average
## 1:         Scotland      NA
## 2:          England     6.0
## 3: Northern Ireland     3.7
## 4:            Wales     5.9
## 5:                      3.2