library(data.table)
library(pastecs)
##
## Attaching package: 'pastecs'
## The following objects are masked from 'package:data.table':
##
## first, last
Create a data table.
DT <- data.table(ID = 1:50,
Capacity = sample(100:100, size = 50, replace = F),
Code = sample(LETTERS[1:4], 50, replace = T),
State = rep(c("Alabama", "Indiana", "Texas", "Nevada"), 50))
head(DT, 3)
## ID Capacity Code State
## 1: 1 38 A Alabama
## 2: 2 74 B Indiana
## 3: 3 36 B Texas
Find State that’s Code “C” and get the sum of capacity. Group by State.
DT[Code == "C", sum(Capacity), State]
## State V1
## 1: Texas 384
## 2: Nevada 862
## 3: Indiana 862
## 4: Alabama 384
Find the average capacity per state.
DT[, mean(Capacity), by=State]
## State V1
## 1: Alabama 52.36
## 2: Indiana 55.96
## 3: Texas 52.36
## 4: Nevada 55.96
Find the average capacity of all states coded as “A”.
DT[Code=="A", mean(Capacity)]
## [1] 51.90909
Use fread() to load data faster.
dt <- fread("GB_full.csv")
Check data.
tail(dt, 6)
## V1 V2 V3 V4 V5 V6 V7
## 1: GB ZE3 9JT Shetland South Ward Scotland SCT Shetland
## 2: GB ZE3 9JU Shetland South Ward Scotland SCT Shetland
## 3: GB ZE3 9JW Shetland South Ward Scotland SCT Shetland
## 4: GB ZE3 9JX Shetland South Ward Scotland SCT Shetland
## 5: GB ZE3 9JY Shetland South Ward Scotland SCT Shetland
## 6: GB ZE3 9JZ Shetland South Ward Scotland SCT Shetland
## V8 V9 V10 V11 V12
## 1: Shetland Islands S12000027 59.87262 -1.306772 6
## 2: Shetland Islands S12000027 59.88954 -1.307206 6
## 3: Shetland Islands S12000027 59.87365 -1.305697 6
## 4: Shetland Islands S12000027 59.87529 -1.307502 6
## 5: Shetland Islands S12000027 59.89157 -1.313847 6
## 6: Shetland Islands S12000027 59.89239 -1.310899 6
Subset rows that meet conditions on two different columns.
sub_rows <- dt[V4 == "England" & V3 == "Beswick"]
sub_rows
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
## 1: GB YO25 Beswick England ENG East Riding of Yorkshire 11609011 NA
## V11 V12
## 1: NA 4
Get all rows but only 3 columns. Take note of the “.”. If .() is used, the returned value is a data.table. If .() is not used, the result is a vector
sub_columns <- dt[,.(V2, V3,V4)]
head(sub_columns)
## V2 V3 V4
## 1: AB10 Aberdeen Scotland
## 2: AB10 1AB Aberdeen Scotland
## 3: AB10 1AF Aberdeen Scotland
## 4: AB10 1AG Aberdeen Scotland
## 5: AB10 1AH Aberdeen Scotland
## 6: AB10 1AL Aberdeen Scotland
Sort by V3 in descending order
dt_order <- dt[order(-V3)]
head(dt_order,10)
## V1 V2 V3 V4 V5 V6 V7
## 1: GB TS10 3AP Zetland Ward England ENG North Yorkshire
## 2: GB TS10 3AR Zetland Ward England ENG North Yorkshire
## 3: GB TS10 3AS Zetland Ward England ENG North Yorkshire
## 4: GB TS10 3AT Zetland Ward England ENG North Yorkshire
## 5: GB TS10 3AU Zetland Ward England ENG North Yorkshire
## 6: GB TS10 3AX Zetland Ward England ENG North Yorkshire
## 7: GB TS10 3AY Zetland Ward England ENG North Yorkshire
## 8: GB TS10 3AZ Zetland Ward England ENG North Yorkshire
## 9: GB TS10 3BA Zetland Ward England ENG North Yorkshire
## 10: GB TS10 3BB Zetland Ward England ENG North Yorkshire
## V8 V9 V10 V11 V12
## 1: Redcar and Cleveland (B) E06000003 54.61640 -1.055022 6
## 2: Redcar and Cleveland (B) E06000003 54.61688 -1.056931 6
## 3: Redcar and Cleveland (B) E06000003 54.61699 -1.057238 6
## 4: Redcar and Cleveland (B) E06000003 54.61546 -1.053681 6
## 5: Redcar and Cleveland (B) E06000003 54.61419 -1.053803 6
## 6: Redcar and Cleveland (B) E06000003 54.61315 -1.052883 6
## 7: Redcar and Cleveland (B) E06000003 54.61149 -1.051373 6
## 8: Redcar and Cleveland (B) E06000003 54.61133 -1.050897 6
## 9: Redcar and Cleveland (B) E06000003 54.61120 -1.050435 6
## 10: Redcar and Cleveland (B) E06000003 54.61555 -1.053989 6
dt[V8 == "Aberdeen City", V8 := "Abr Cty"]
head(dt)
## V1 V2 V3 V4 V5 V6 V7 V8 V9
## 1: GB AB10 Aberdeen Scotland SCT Aberdeenshire
## 2: GB AB10 1AB Aberdeen Scotland SCT City of Aberdeen Abr Cty S12000033
## 3: GB AB10 1AF Aberdeen Scotland SCT City of Aberdeen Abr Cty S12000033
## 4: GB AB10 1AG Aberdeen Scotland SCT City of Aberdeen Abr Cty S12000033
## 5: GB AB10 1AH Aberdeen Scotland SCT City of Aberdeen Abr Cty S12000033
## 6: GB AB10 1AL Aberdeen Scotland SCT City of Aberdeen Abr Cty S12000033
## V10 V11 V12
## 1: NA NA 4
## 2: 57.14961 -2.096916 6
## 3: 57.14871 -2.097806 6
## 4: 57.14907 -2.096997 6
## 5: 57.14808 -2.094664 6
## 6: 57.14954 -2.095412 6
All rows, turn columns V6 and V7 into Null
dt[,c("V6", "V7") := NULL]
head(dt)
## V1 V2 V3 V4 V5 V8 V9 V10 V11
## 1: GB AB10 Aberdeen Scotland SCT NA NA
## 2: GB AB10 1AB Aberdeen Scotland SCT Abr Cty S12000033 57.14961 -2.096916
## 3: GB AB10 1AF Aberdeen Scotland SCT Abr Cty S12000033 57.14871 -2.097806
## 4: GB AB10 1AG Aberdeen Scotland SCT Abr Cty S12000033 57.14907 -2.096997
## 5: GB AB10 1AH Aberdeen Scotland SCT Abr Cty S12000033 57.14808 -2.094664
## 6: GB AB10 1AL Aberdeen Scotland SCT Abr Cty S12000033 57.14954 -2.095412
## V12
## 1: 4
## 2: 6
## 3: 6
## 4: 6
## 5: 6
## 6: 6
Create new column V-New by adding V10 and V11
dt[,V_New := V10 + V11]
head(dt)
## V1 V2 V3 V4 V5 V8 V9 V10 V11
## 1: GB AB10 Aberdeen Scotland SCT NA NA
## 2: GB AB10 1AB Aberdeen Scotland SCT Abr Cty S12000033 57.14961 -2.096916
## 3: GB AB10 1AF Aberdeen Scotland SCT Abr Cty S12000033 57.14871 -2.097806
## 4: GB AB10 1AG Aberdeen Scotland SCT Abr Cty S12000033 57.14907 -2.096997
## 5: GB AB10 1AH Aberdeen Scotland SCT Abr Cty S12000033 57.14808 -2.094664
## 6: GB AB10 1AL Aberdeen Scotland SCT Abr Cty S12000033 57.14954 -2.095412
## V12 V_New
## 1: 4 NA
## 2: 6 55.05269
## 3: 6 55.05090
## 4: 6 55.05207
## 5: 6 55.05342
## 6: 6 55.05413
cor.test(dt$V10, dt$V11)
##
## Pearson's product-moment correlation
##
## data: dt$V10 and dt$V11
## t = -565.78, df = 1693200, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.4000021 -0.3974686
## sample estimates:
## cor
## -0.3987361
library(pastecs)
attach(dt)
colnames(dt)
## [1] "V1" "V2" "V3" "V4" "V5" "V8" "V9" "V10"
## [9] "V11" "V12" "V_New"
Get summary stats
stat.desc(dt$V10, desc=F)
## nbr.val nbr.null nbr.na min max
## 1.693242e+06 0.000000e+00 2.743100e+04 4.989517e+01 6.080069e+01
## range sum
## 1.090552e+01 8.924279e+07
Move decimal places
options(scipen = 100)
options(digits=2)
stat.desc(dt$V10)
## nbr.val nbr.null nbr.na min max
## 1693242.0000 0.0000 27431.0000 49.8952 60.8007
## range sum median mean SE.mean
## 10.9055 89242786.7103 52.4001 52.7053 0.0012
## CI.mean.0.95 var std.dev coef.var
## 0.0024 2.5126 1.5851 0.0301
Filter Scotland and V12 is more than or equal to 4
subrows <- dt[V4 == "Scotland" & V12 >= 4]
head(subrows,5)
## V1 V2 V3 V4 V5 V8 V9 V10 V11 V12 V_New
## 1: GB AB10 Aberdeen Scotland SCT NA NA 4 NA
## 2: GB AB10 1AB Aberdeen Scotland SCT Abr Cty S12000033 57 -2.1 6 55
## 3: GB AB10 1AF Aberdeen Scotland SCT Abr Cty S12000033 57 -2.1 6 55
## 4: GB AB10 1AG Aberdeen Scotland SCT Abr Cty S12000033 57 -2.1 6 55
## 5: GB AB10 1AH Aberdeen Scotland SCT Abr Cty S12000033 57 -2.1 6 55
Get the average of V12 grouped by country
dt[,.(average = mean(V12)), by=V4]
## V4 average
## 1: Scotland NA
## 2: England 6.0
## 3: Northern Ireland 3.7
## 4: Wales 5.9
## 5: 3.2