Module 11: Working with large data sets

Using data.table to manipulate large sets of data.

library(data.table)
temp = fread("Temperature.csv")
dim(temp)
## [1] 8528   16
class(temp)
## [1] "data.table" "data.frame"

Extract all winter observations:

setDT(temp)
winter_data = temp[Season == "winter"]
print(winter_data)
##              Sample     Date    DateNr dDay1 dDay2 dDay3 Station   Area
##              <char>    <int>    <char> <int> <int> <int>  <char> <char>
##    1: DANT.19900110 19900110   10/1/90     7     9     9    DANT     WZ
##    2: DANT.19900206 19900206    6/2/90    34    36    36    DANT     WZ
##    3: DANT.19901212 19901212  12/12/90   343   345   345    DANT     WZ
##    4: DANT.19910116 19910116 1/16/1991   378   380    15    DANT     WZ
##    5: DANT.19910226 19910226 2/26/1991   419   421    56    DANT     WZ
##   ---                                                                  
## 1702: ZUID.20040216 20040216 2/16/2004  5157  5159    46    ZUID     WZ
## 1703: ZUID.20041208 20041208   8/12/04  5453  5455   342    ZUID     WZ
## 1704: ZUID.20050119 20050119 1/19/2005  5495  5497    18    ZUID     WZ
## 1705: ZUID.20050218 20050218 2/18/2005  5525  5527    48    ZUID     WZ
## 1706: ZUID.20051212 20051212  12/12/05  5822  5824   345    ZUID     WZ
##       31UE_ED50 31UN_ED50  Year Month Season Salinity Temperature CHLFa
##           <num>     <num> <int> <int> <char>    <num>       <num> <num>
##    1:  681379.6   5920571  1990     1 winter    29.19        4.00  1.30
##    2:  681379.6   5920571  1990     2 winter    27.37        6.00    NA
##    3:  681379.6   5920571  1990    12 winter    31.50        4.20 60.50
##    4:  681379.6   5920571  1991     1 winter    20.83       -0.30  2.30
##    5:  681379.6   5920571  1991     2 winter    28.06        3.90  3.52
##   ---                                                                  
## 1702:  733386.3   5928197  2004     2 winter    24.70        4.40    NA
## 1703:  733386.3   5928197  2004    12 winter    28.67        4.72  1.52
## 1704:  733386.3   5928197  2005     1 winter    28.39        3.57  4.12
## 1705:  733386.3   5928197  2005     2 winter    29.31        1.31  6.56
## 1706:  733386.3   5928197  2005    12 winter    29.08        5.13  2.38

Extract all winter observations for zone NC:

winter_nc = temp[tolower(trimws(Season)) == "winter" & Area == "NC"]
print(winter_nc)
##             Sample     Date     DateNr dDay1 dDay2 dDay3 Station   Area
##             <char>    <int>     <char> <int> <int> <int>  <char> <char>
##   1: T100.19900103 19900103     3/1/90     0     2     2    T100     NC
##   2: T100.19900205 19900205     5/2/90    33    35    35    T100     NC
##   3: T100.19901218 19901218 12/18/1990   349   351   351    T100     NC
##   4: T100.19910116 19910116  1/16/1991   378   380    15    T100     NC
##   5: T100.19910205 19910205     5/2/91   398   400    35    T100     NC
##  ---                                                                   
## 177: T235.20040209 20040209     9/2/04  5150  5152    39    T235     NC
## 178: T235.20041214 20041214 12/14/2004  5459  5461   348    T235     NC
## 179: T235.20050126 20050126  1/26/2005  5502  5504    25    T235     NC
## 180: T235.20050214 20050214  2/14/2005  5521  5523    44    T235     NC
## 181: T235.20051220 20051220 12/20/2005  5830  5832   353    T235     NC
##      31UE_ED50 31UN_ED50  Year Month Season Salinity Temperature CHLFa
##          <num>     <num> <int> <int> <char>    <num>       <num> <num>
##   1:  587650.2   6001110  1990     1 winter    34.82        8.50  0.30
##   2:  587650.2   6001110  1990     2 winter       NA          NA    NA
##   3:  587650.2   6001110  1990    12 winter    34.80        9.20  0.40
##   4:  587650.2   6001110  1991     1 winter    34.86        6.10  0.68
##   5:  587650.2   6001110  1991     2 winter    34.53        5.20  0.34
##  ---                                                                  
## 177:  510032.3   6114101  2004     2 winter       NA          NA    NA
## 178:  510032.3   6114101  2004    12 winter    34.96        8.47  0.62
## 179:  510032.3   6114101  2005     1 winter    35.09        6.44  0.62
## 180:  510032.3   6114101  2005     2 winter       NA          NA    NA
## 181:  510032.3   6114101  2005    12 winter    33.87        8.19  1.12

Select only the columns Area, Season and Temperature:

ast = temp[, .(Area, Season, Temperature)]
head(ast)
##      Area Season Temperature
##    <char> <char>       <num>
## 1:     WZ winter         4.0
## 2:     WZ winter         6.0
## 3:     WZ spring         7.3
## 4:     WZ spring         8.2
## 5:     WZ spring        17.4
## 6:     WZ summer        18.1

Select only the columns Area and Temperature but only for winter observations:

ast_winter = temp[tolower(trimws(Season)) == "winter" ,.(Area, Temperature)]
head(ast_winter)
##      Area Temperature
##    <char>       <num>
## 1:     WZ         4.0
## 2:     WZ         6.0
## 3:     WZ         4.2
## 4:     WZ        -0.3
## 5:     WZ         3.9
## 6:     WZ         3.9

Find the total number of observations in winter:

winter_num = temp[Season == "winter", .N]
winter_num
## [1] 1706

Calculate the mean temperature and mean salinity in winter:

winter_means = temp[tolower(trimws(Season)) == "winter", .(
  mean_Temperature = mean(Temperature, na.rm = TRUE),
  mean_Salinity = mean(Salinity, na.rm = TRUE)
)]

Find the number of observations per station in winter:

winter_counts = temp[
  tolower(trimws(Season)) == "winter",
  .N,
  by = Station
]
print(winter_counts)
##     Station     N
##      <char> <int>
##  1:    DANT    50
##  2:    DREI    52
##  3:      G6   101
##  4:    GROO    50
##  5:    HAMM    55
##  6:    HANS    56
##  7:    HUIB    50
##  8:    LODS    54
##  9:    MARS    49
## 10:     N02   115
## 11:     N10   131
## 12:     N20    50
## 13:     N70    50
## 14:     R03    32
## 15:    SOEL    50
## 16:    T004    97
## 17:    T010    45
## 18:    T100    45
## 19:    T135    46
## 20:    T175    45
## 21:    T235    45
## 22:    VLIS    84
## 23:     W02    99
## 24:     W20    47
## 25:     W70    47
## 26:    WISS    55
## 27:    ZIJP    54
## 28:    ZUID    52
##     Station     N

Find the number of observations per station per season:

counts = temp[, .N, by = .(Station, Season)]
print(counts)
##      Station Season     N
##       <char> <char> <int>
##   1:    DANT winter    50
##   2:    DANT spring    89
##   3:    DANT summer    89
##   4:    DANT autumn    72
##   5:    DREI winter    52
##  ---                     
## 114:    ZIJP autumn    61
## 115:    ZUID winter    52
## 116:    ZUID spring    89
## 117:    ZUID summer    89
## 118:    ZUID autumn    73

Estimate average temperatures by month:

monthly_avg_temp = temp[, .(mean_Temperature = mean(Temperature, na.rm = TRUE)), by = Month]

Estimate average temperatures by month by area:

monthly_station_avg = temp[, .(mean_Temperature = mean(Temperature, na.rm = TRUE)), 
                            by = .(Month, Station)]
wide_avg = dcast(monthly_station_avg, Month ~ Station, value.var = "mean_Temperature")
print(wide_avg)
## Key: <Month>
##     Month      DANT      DREI        G6      GROO      HAMM      HANS      HUIB
##     <int>     <num>     <num>     <num>     <num>     <num>     <num>     <num>
##  1:     1  2.916250  4.308750  5.118788  2.687333  5.055556  4.841765  3.485333
##  2:     2  4.133750  3.904444  5.209000  3.214444  4.548333  5.245882  3.398889
##  3:     3  5.931786  6.343548  6.219500  6.086000  6.031613  6.642941  5.210000
##  4:     4  9.610000  9.867778  8.399286 10.622414  9.183929  9.264737  8.356000
##  5:     5 13.765806 14.420303 12.306250 15.199355 12.948621 13.601667 12.850968
##  6:     6 16.896129 17.867407 15.327692 17.528929 16.610345 17.288710 16.217857
##  7:     7 19.141034 19.901481 18.125625 19.255000 18.758276 19.694063 18.246250
##  8:     8 19.918571 20.576061 19.801250 19.854615 20.318710 20.635000 19.630741
##  9:     9 15.577143 17.200000 17.581429 15.716552 17.513929 17.808621 16.383448
## 10:    10 11.760769 12.950000 13.955238 11.766818 13.854706 14.346250 12.739545
## 11:    11  7.500625  8.612667 10.096667  6.691875  9.640625  9.705556  8.242500
## 12:    12  4.060000  5.149412  7.387500  3.608125  6.426471  6.112353  4.851250
##          LODS      MARS       N02       N10       N20       N70       R03
##         <num>     <num>     <num>     <num>     <num>     <num>     <num>
##  1:  4.537778  4.464667  5.062727  5.439750  5.912632  7.606111  2.833333
##  2:  4.173889  4.461250  4.685238  5.191923  5.215455  6.614545  4.256000
##  3:  6.250968  5.927083  6.360345  6.299500  6.208462  6.811600  3.660000
##  4:  9.761786  8.538571  8.745000  8.506481  8.271579  8.203889  7.926154
##  5: 13.750345 12.800000 12.123125 11.479661 11.193871 10.450000 12.238333
##  6: 17.531724 15.977586 15.526522 14.861404 14.631200 13.286800 15.416923
##  7: 19.299310 18.163103 18.287692 17.542222 17.675769 16.219615 18.280000
##  8: 20.710000 19.619310 19.488214 19.276719 19.182759 17.908621 19.678000
##  9: 17.210714 16.658148 17.782500 17.879302 17.855625 17.470000 16.393333
## 10: 13.247647 13.435200 14.123333 14.475758 14.671875 15.242353 12.122308
## 11:  8.709375  9.843529 10.134583 10.610588 11.442143 12.648571  7.975000
## 12:  5.342353  6.540714  6.968000  7.348214  7.680667  9.411765  4.700000
##           R50       R70      SOEL      T004      T010      T100      T135
##         <num>     <num>     <num>     <num>     <num>     <num>     <num>
##  1:        NA        NA  4.310000  5.032963  5.661538  7.036923  6.926154
##  2:        NA        NA  4.695333  4.442500  4.966667  5.748750  5.788750
##  3:        NA        NA  6.844687  5.763333  6.141176  5.769412  5.845000
##  4:  6.907692  6.660769 10.812500  8.438095  7.998571  7.086667  6.961905
##  5:  9.857500  9.781667 14.866129 11.817500 11.103913 10.075417  9.855833
##  6: 13.426923 13.733077 18.454815 15.243636 14.738636 13.430000 13.302083
##  7: 16.523571 16.847857 19.950714 17.936538 17.494615 16.517692 16.362000
##  8: 18.396000 18.375333 20.635556 19.153448 18.939310 18.004643 17.827143
##  9: 17.137500 16.865833 17.245000 16.987692 17.198462 16.289167 16.007500
## 10: 14.532727 14.711818 13.335294 13.000588 14.193077 14.323846 13.946154
## 11:        NA        NA  8.467222  9.445714 10.690769 11.574615 11.300769
## 12:        NA        NA  5.138125  6.625000  7.473333  9.140000  9.022500
##          T175      T235      VLIS       W02       W20       W70      WISS
##         <num>     <num>     <num>     <num>     <num>     <num>     <num>
##  1:  6.737692  6.458462  5.071667  4.616061  6.314706  8.668235  5.178824
##  2:  5.410000  5.797143  5.237083  4.904000  5.542727  7.332000  4.726111
##  3:  5.816667  5.911053  5.877600  6.233000  6.192500  7.406471  6.114194
##  4:  6.773000  7.103500  8.539259  8.492143  8.265714  8.343846  9.018571
##  5:  9.674583  9.610833 12.591053 12.478750 11.700625 10.738667 12.618276
##  6: 13.108750 12.875833 16.382000 15.743077 14.773077 12.902308 16.259643
##  7: 16.120385 15.741923 18.898780 18.433750 17.649375 15.754667 18.400333
##  8: 17.647778 17.141429 19.942564 19.900625 19.367500 17.716250 20.102581
##  9: 15.555000 15.296667 17.507778 17.500714 18.096154 17.593571 17.501071
## 10: 13.540769 12.860714 14.301429 13.770526 14.768571 15.770000 13.982353
## 11: 11.009231 10.028462 10.011071  9.865600 11.737143 13.219286  9.901875
## 12:  8.690000  7.949091  6.498696  7.012857  8.111875 10.470000  6.649412
##          ZIJP      ZUID
##         <num>     <num>
##  1:  4.717222  2.783333
##  2:  4.314444  3.265000
##  3:  5.992581  5.604444
##  4:  9.501429  9.626897
##  5: 13.320000 13.648065
##  6: 17.183103 16.722333
##  7: 18.841667 18.935926
##  8: 20.427812 19.135667
##  9: 17.297500 15.492222
## 10: 13.540588 11.691600
## 11:  9.211250  7.395882
## 12:  5.881765  4.520000

Plot the output of the previous question using ggplot2 using the geom_line() geometry

library(ggplot2)

ggplot(monthly_station_avg, aes(x = Month, y = mean_Temperature, color = Station, group = Station)) +
  geom_line(linewidth = 1) +
  geom_point() +
  labs(
    title = "Average Temperature by Month and Station",
    x = "Month",
    y = "Average Temperature"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

The end.