Using data.table to manipulate large sets of data.
library(data.table)
temp = fread("Temperature.csv")
dim(temp)
## [1] 8528 16
class(temp)
## [1] "data.table" "data.frame"
Extract all winter observations:
setDT(temp)
winter_data = temp[Season == "winter"]
print(winter_data)
## Sample Date DateNr dDay1 dDay2 dDay3 Station Area
## <char> <int> <char> <int> <int> <int> <char> <char>
## 1: DANT.19900110 19900110 10/1/90 7 9 9 DANT WZ
## 2: DANT.19900206 19900206 6/2/90 34 36 36 DANT WZ
## 3: DANT.19901212 19901212 12/12/90 343 345 345 DANT WZ
## 4: DANT.19910116 19910116 1/16/1991 378 380 15 DANT WZ
## 5: DANT.19910226 19910226 2/26/1991 419 421 56 DANT WZ
## ---
## 1702: ZUID.20040216 20040216 2/16/2004 5157 5159 46 ZUID WZ
## 1703: ZUID.20041208 20041208 8/12/04 5453 5455 342 ZUID WZ
## 1704: ZUID.20050119 20050119 1/19/2005 5495 5497 18 ZUID WZ
## 1705: ZUID.20050218 20050218 2/18/2005 5525 5527 48 ZUID WZ
## 1706: ZUID.20051212 20051212 12/12/05 5822 5824 345 ZUID WZ
## 31UE_ED50 31UN_ED50 Year Month Season Salinity Temperature CHLFa
## <num> <num> <int> <int> <char> <num> <num> <num>
## 1: 681379.6 5920571 1990 1 winter 29.19 4.00 1.30
## 2: 681379.6 5920571 1990 2 winter 27.37 6.00 NA
## 3: 681379.6 5920571 1990 12 winter 31.50 4.20 60.50
## 4: 681379.6 5920571 1991 1 winter 20.83 -0.30 2.30
## 5: 681379.6 5920571 1991 2 winter 28.06 3.90 3.52
## ---
## 1702: 733386.3 5928197 2004 2 winter 24.70 4.40 NA
## 1703: 733386.3 5928197 2004 12 winter 28.67 4.72 1.52
## 1704: 733386.3 5928197 2005 1 winter 28.39 3.57 4.12
## 1705: 733386.3 5928197 2005 2 winter 29.31 1.31 6.56
## 1706: 733386.3 5928197 2005 12 winter 29.08 5.13 2.38
Extract all winter observations for zone NC:
winter_nc = temp[tolower(trimws(Season)) == "winter" & Area == "NC"]
print(winter_nc)
## Sample Date DateNr dDay1 dDay2 dDay3 Station Area
## <char> <int> <char> <int> <int> <int> <char> <char>
## 1: T100.19900103 19900103 3/1/90 0 2 2 T100 NC
## 2: T100.19900205 19900205 5/2/90 33 35 35 T100 NC
## 3: T100.19901218 19901218 12/18/1990 349 351 351 T100 NC
## 4: T100.19910116 19910116 1/16/1991 378 380 15 T100 NC
## 5: T100.19910205 19910205 5/2/91 398 400 35 T100 NC
## ---
## 177: T235.20040209 20040209 9/2/04 5150 5152 39 T235 NC
## 178: T235.20041214 20041214 12/14/2004 5459 5461 348 T235 NC
## 179: T235.20050126 20050126 1/26/2005 5502 5504 25 T235 NC
## 180: T235.20050214 20050214 2/14/2005 5521 5523 44 T235 NC
## 181: T235.20051220 20051220 12/20/2005 5830 5832 353 T235 NC
## 31UE_ED50 31UN_ED50 Year Month Season Salinity Temperature CHLFa
## <num> <num> <int> <int> <char> <num> <num> <num>
## 1: 587650.2 6001110 1990 1 winter 34.82 8.50 0.30
## 2: 587650.2 6001110 1990 2 winter NA NA NA
## 3: 587650.2 6001110 1990 12 winter 34.80 9.20 0.40
## 4: 587650.2 6001110 1991 1 winter 34.86 6.10 0.68
## 5: 587650.2 6001110 1991 2 winter 34.53 5.20 0.34
## ---
## 177: 510032.3 6114101 2004 2 winter NA NA NA
## 178: 510032.3 6114101 2004 12 winter 34.96 8.47 0.62
## 179: 510032.3 6114101 2005 1 winter 35.09 6.44 0.62
## 180: 510032.3 6114101 2005 2 winter NA NA NA
## 181: 510032.3 6114101 2005 12 winter 33.87 8.19 1.12
Select only the columns Area, Season and Temperature:
ast = temp[, .(Area, Season, Temperature)]
head(ast)
## Area Season Temperature
## <char> <char> <num>
## 1: WZ winter 4.0
## 2: WZ winter 6.0
## 3: WZ spring 7.3
## 4: WZ spring 8.2
## 5: WZ spring 17.4
## 6: WZ summer 18.1
Select only the columns Area and Temperature but only for winter observations:
ast_winter = temp[tolower(trimws(Season)) == "winter" ,.(Area, Temperature)]
head(ast_winter)
## Area Temperature
## <char> <num>
## 1: WZ 4.0
## 2: WZ 6.0
## 3: WZ 4.2
## 4: WZ -0.3
## 5: WZ 3.9
## 6: WZ 3.9
Find the total number of observations in winter:
winter_num = temp[Season == "winter", .N]
winter_num
## [1] 1706
Calculate the mean temperature and mean salinity in winter:
winter_means = temp[tolower(trimws(Season)) == "winter", .(
mean_Temperature = mean(Temperature, na.rm = TRUE),
mean_Salinity = mean(Salinity, na.rm = TRUE)
)]
Find the number of observations per station in winter:
winter_counts = temp[
tolower(trimws(Season)) == "winter",
.N,
by = Station
]
print(winter_counts)
## Station N
## <char> <int>
## 1: DANT 50
## 2: DREI 52
## 3: G6 101
## 4: GROO 50
## 5: HAMM 55
## 6: HANS 56
## 7: HUIB 50
## 8: LODS 54
## 9: MARS 49
## 10: N02 115
## 11: N10 131
## 12: N20 50
## 13: N70 50
## 14: R03 32
## 15: SOEL 50
## 16: T004 97
## 17: T010 45
## 18: T100 45
## 19: T135 46
## 20: T175 45
## 21: T235 45
## 22: VLIS 84
## 23: W02 99
## 24: W20 47
## 25: W70 47
## 26: WISS 55
## 27: ZIJP 54
## 28: ZUID 52
## Station N
Find the number of observations per station per season:
counts = temp[, .N, by = .(Station, Season)]
print(counts)
## Station Season N
## <char> <char> <int>
## 1: DANT winter 50
## 2: DANT spring 89
## 3: DANT summer 89
## 4: DANT autumn 72
## 5: DREI winter 52
## ---
## 114: ZIJP autumn 61
## 115: ZUID winter 52
## 116: ZUID spring 89
## 117: ZUID summer 89
## 118: ZUID autumn 73
Estimate average temperatures by month:
monthly_avg_temp = temp[, .(mean_Temperature = mean(Temperature, na.rm = TRUE)), by = Month]
Estimate average temperatures by month by area:
monthly_station_avg = temp[, .(mean_Temperature = mean(Temperature, na.rm = TRUE)),
by = .(Month, Station)]
wide_avg = dcast(monthly_station_avg, Month ~ Station, value.var = "mean_Temperature")
print(wide_avg)
## Key: <Month>
## Month DANT DREI G6 GROO HAMM HANS HUIB
## <int> <num> <num> <num> <num> <num> <num> <num>
## 1: 1 2.916250 4.308750 5.118788 2.687333 5.055556 4.841765 3.485333
## 2: 2 4.133750 3.904444 5.209000 3.214444 4.548333 5.245882 3.398889
## 3: 3 5.931786 6.343548 6.219500 6.086000 6.031613 6.642941 5.210000
## 4: 4 9.610000 9.867778 8.399286 10.622414 9.183929 9.264737 8.356000
## 5: 5 13.765806 14.420303 12.306250 15.199355 12.948621 13.601667 12.850968
## 6: 6 16.896129 17.867407 15.327692 17.528929 16.610345 17.288710 16.217857
## 7: 7 19.141034 19.901481 18.125625 19.255000 18.758276 19.694063 18.246250
## 8: 8 19.918571 20.576061 19.801250 19.854615 20.318710 20.635000 19.630741
## 9: 9 15.577143 17.200000 17.581429 15.716552 17.513929 17.808621 16.383448
## 10: 10 11.760769 12.950000 13.955238 11.766818 13.854706 14.346250 12.739545
## 11: 11 7.500625 8.612667 10.096667 6.691875 9.640625 9.705556 8.242500
## 12: 12 4.060000 5.149412 7.387500 3.608125 6.426471 6.112353 4.851250
## LODS MARS N02 N10 N20 N70 R03
## <num> <num> <num> <num> <num> <num> <num>
## 1: 4.537778 4.464667 5.062727 5.439750 5.912632 7.606111 2.833333
## 2: 4.173889 4.461250 4.685238 5.191923 5.215455 6.614545 4.256000
## 3: 6.250968 5.927083 6.360345 6.299500 6.208462 6.811600 3.660000
## 4: 9.761786 8.538571 8.745000 8.506481 8.271579 8.203889 7.926154
## 5: 13.750345 12.800000 12.123125 11.479661 11.193871 10.450000 12.238333
## 6: 17.531724 15.977586 15.526522 14.861404 14.631200 13.286800 15.416923
## 7: 19.299310 18.163103 18.287692 17.542222 17.675769 16.219615 18.280000
## 8: 20.710000 19.619310 19.488214 19.276719 19.182759 17.908621 19.678000
## 9: 17.210714 16.658148 17.782500 17.879302 17.855625 17.470000 16.393333
## 10: 13.247647 13.435200 14.123333 14.475758 14.671875 15.242353 12.122308
## 11: 8.709375 9.843529 10.134583 10.610588 11.442143 12.648571 7.975000
## 12: 5.342353 6.540714 6.968000 7.348214 7.680667 9.411765 4.700000
## R50 R70 SOEL T004 T010 T100 T135
## <num> <num> <num> <num> <num> <num> <num>
## 1: NA NA 4.310000 5.032963 5.661538 7.036923 6.926154
## 2: NA NA 4.695333 4.442500 4.966667 5.748750 5.788750
## 3: NA NA 6.844687 5.763333 6.141176 5.769412 5.845000
## 4: 6.907692 6.660769 10.812500 8.438095 7.998571 7.086667 6.961905
## 5: 9.857500 9.781667 14.866129 11.817500 11.103913 10.075417 9.855833
## 6: 13.426923 13.733077 18.454815 15.243636 14.738636 13.430000 13.302083
## 7: 16.523571 16.847857 19.950714 17.936538 17.494615 16.517692 16.362000
## 8: 18.396000 18.375333 20.635556 19.153448 18.939310 18.004643 17.827143
## 9: 17.137500 16.865833 17.245000 16.987692 17.198462 16.289167 16.007500
## 10: 14.532727 14.711818 13.335294 13.000588 14.193077 14.323846 13.946154
## 11: NA NA 8.467222 9.445714 10.690769 11.574615 11.300769
## 12: NA NA 5.138125 6.625000 7.473333 9.140000 9.022500
## T175 T235 VLIS W02 W20 W70 WISS
## <num> <num> <num> <num> <num> <num> <num>
## 1: 6.737692 6.458462 5.071667 4.616061 6.314706 8.668235 5.178824
## 2: 5.410000 5.797143 5.237083 4.904000 5.542727 7.332000 4.726111
## 3: 5.816667 5.911053 5.877600 6.233000 6.192500 7.406471 6.114194
## 4: 6.773000 7.103500 8.539259 8.492143 8.265714 8.343846 9.018571
## 5: 9.674583 9.610833 12.591053 12.478750 11.700625 10.738667 12.618276
## 6: 13.108750 12.875833 16.382000 15.743077 14.773077 12.902308 16.259643
## 7: 16.120385 15.741923 18.898780 18.433750 17.649375 15.754667 18.400333
## 8: 17.647778 17.141429 19.942564 19.900625 19.367500 17.716250 20.102581
## 9: 15.555000 15.296667 17.507778 17.500714 18.096154 17.593571 17.501071
## 10: 13.540769 12.860714 14.301429 13.770526 14.768571 15.770000 13.982353
## 11: 11.009231 10.028462 10.011071 9.865600 11.737143 13.219286 9.901875
## 12: 8.690000 7.949091 6.498696 7.012857 8.111875 10.470000 6.649412
## ZIJP ZUID
## <num> <num>
## 1: 4.717222 2.783333
## 2: 4.314444 3.265000
## 3: 5.992581 5.604444
## 4: 9.501429 9.626897
## 5: 13.320000 13.648065
## 6: 17.183103 16.722333
## 7: 18.841667 18.935926
## 8: 20.427812 19.135667
## 9: 17.297500 15.492222
## 10: 13.540588 11.691600
## 11: 9.211250 7.395882
## 12: 5.881765 4.520000
Plot the output of the previous question using ggplot2 using the geom_line() geometry
library(ggplot2)
ggplot(monthly_station_avg, aes(x = Month, y = mean_Temperature, color = Station, group = Station)) +
geom_line(linewidth = 1) +
geom_point() +
labs(
title = "Average Temperature by Month and Station",
x = "Month",
y = "Average Temperature"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
The end.