STAT 585X: Week 1: Scratch pad and lab notes

Ian Lyttle

library(knitr)
library(ggplot2)
library(plyr)
library(reshape2)
library(stringr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## 
## The following object is masked from 'package:plyr':
## 
##     here
library(maps)

options(width = 100, str = strOptions(vec.len = 2))

opts_chunk$set(comment = NA, tidy = FALSE, fig.align = "center", fig.width = 10, 
    fig.height = 6, dev = "png")

Messing around with US Census API

library(cbapi)
Loading required package: rjson
Loading required package: XML

# as Heike says, my key - get your own :)
setkey("94779290e91e4722bea263a56d4bf2b9e3e53313")
key is saved, now you will be able to access data through the API. 

url <- "http://api.census.gov/data/2010/sf1?key=%s&get=P0010001,NAME&for=state:*"

popstate <- read.census(sprintf(url, getkey()))
head(popstate)
  P0010001       NAME state
1  4779736    Alabama    01
2   710231     Alaska    02
3  6392017    Arizona    04
4  2915918   Arkansas    05
5 37253956 California    06
6  5029196   Colorado    08

SAS

library(foreign)

fname <- file.path("..", "..", "data", "DEMO_F.XPT")

demo <- read.xport(fname)

data_small <- summarize(
  demo,
  age = RIDAGEEX,
  gender = mapvalues(as.factor(RIAGENDR), from = c(1, 2), to = c("male", "female"))
)

summary(data_small)
      age         gender    
 Min.   :  0   male  :5225  
 1st Qu.:120   female:5312  
 Median :326                
 Mean   :374                
 3rd Qu.:609                
 Max.   :959                
 NA's   :682                

ggplot(data_small) + geom_boxplot(aes(x = gender, y = age)) + labs(y = "age (months)")
Warning: Removed 682 rows containing non-finite values (stat_boxplot).

plot of chunk unnamed-chunk-2

USHCN

First, let's parse the stations' data.

fname <- file.path("..", "..", "data", "ushcn-stations.txt")
#file.exists(fname)

stations <- read.fwf(
  file = fname,
  widths = c(6, 9, 10, -7, 3, 31, -7, -7, -7, -3),
  col.names = c("id", "lat", "long", "state", "name"),
  strip.white = TRUE
)

head(stations)
     id   lat   long state             name
1 11084 31.06 -87.05    AL    BREWTON 3 SSE
2 12813 30.55 -87.88    AL    FAIRHOPE 2 NE
3 13160 32.83 -88.13    AL GAINESVILLE LOCK
4 13511 32.70 -87.58    AL       GREENSBORO
5 13816 31.87 -86.25    AL    HIGHLAND HOME
6 15749 34.74 -87.60    AL MUSCLE SHOALS AP
summary(stations)
       id              lat            long            state           name     
 Min.   : 11084   Min.   :24.6   Min.   :-124.3   NY     : 57   ASHLAND :   3  
 1st Qu.:135835   1st Qu.:35.8   1st Qu.:-107.1   CA     : 54   COLUMBUS:   3  
 Median :257292   Median :39.9   Median : -94.6   TX     : 49   FAIRMONT:   3  
 Mean   :259101   Mean   :39.5   Mean   : -95.7   NE     : 46   MIAMI   :   3  
 3rd Qu.:368559   3rd Qu.:43.2   3rd Qu.: -83.6   MT     : 44   ABERDEEN:   2  
 Max.   :489905   Max.   :49.0   Max.   : -67.0   OK     : 44   ADA     :   2  
                                                  (Other):924   (Other) :1202  

So, there are three Miami's. Next, let's look at the station data for Iowa.

fname <- file.path("..", "..", "data", "state13_IA.txt")
#file.exists(fname)

wx_data <- read.fwf(
  file = fname,
  widths = c(6, 4, 2, 4, rep(c(5, -1, -1, -1), times = 31)),
  col.names = c("id", "year", "month", "measurement", 
                str_join(rep("day_", 31) ,seq(1, 31))),
  strip.white = TRUE
)

summary(wx_data)
       id              year          month       measurement      day_1           day_2      
 Min.   :130112   Min.   :1893   Min.   : 1.00   PRCP:31665   Min.   :-9999   Min.   :-9999  
 1st Qu.:131635   1st Qu.:1926   1st Qu.: 3.00   SNOW:30149   1st Qu.:    0   1st Qu.:    0  
 Median :134063   Median :1956   Median : 6.00   SNWD:27347   Median :    0   Median :    0  
 Mean   :134164   Mean   :1955   Mean   : 6.49   TMAX:31625   Mean   : -212   Mean   : -220  
 3rd Qu.:135952   3rd Qu.:1984   3rd Qu.:10.00   TMIN:31618   3rd Qu.:   44   3rd Qu.:   44  
 Max.   :138688   Max.   :2012   Max.   :12.00                Max.   :  641   Max.   :  835  
     day_3           day_4           day_5           day_6           day_7           day_8      
 Min.   :-9999   Min.   :-9999   Min.   :-9999   Min.   :-9999   Min.   :-9999   Min.   :-9999  
 1st Qu.:    0   1st Qu.:    0   1st Qu.:    0   1st Qu.:    0   1st Qu.:    0   1st Qu.:    0  
 Median :    0   Median :    0   Median :    0   Median :    0   Median :    0   Median :    0  
 Mean   : -219   Mean   : -214   Mean   : -221   Mean   : -216   Mean   : -213   Mean   : -203  
 3rd Qu.:   44   3rd Qu.:   44   3rd Qu.:   44   3rd Qu.:   43   3rd Qu.:   43   3rd Qu.:   43  
 Max.   : 1010   Max.   :  600   Max.   :  611   Max.   :  557   Max.   : 7000   Max.   :  576  
     day_9           day_10          day_11          day_12          day_13          day_14     
 Min.   :-9999   Min.   :-9999   Min.   :-9999   Min.   :-9999   Min.   :-9999   Min.   :-9999  
 1st Qu.:    0   1st Qu.:    0   1st Qu.:    0   1st Qu.:    0   1st Qu.:    0   1st Qu.:    0  
 Median :    0   Median :    0   Median :    0   Median :    0   Median :    0   Median :    0  
 Mean   : -209   Mean   : -211   Mean   : -215   Mean   : -211   Mean   : -209   Mean   : -205  
 3rd Qu.:   43   3rd Qu.:   43   3rd Qu.:   43   3rd Qu.:   44   3rd Qu.:   43   3rd Qu.:   44  
 Max.   : 1015   Max.   :  720   Max.   :  550   Max.   :  651   Max.   :  650   Max.   :  638  
     day_15          day_16          day_17          day_18          day_19          day_20     
 Min.   :-9999   Min.   :-9999   Min.   :-9999   Min.   :-9999   Min.   :-9999   Min.   :-9999  
 1st Qu.:    0   1st Qu.:    0   1st Qu.:    0   1st Qu.:    0   1st Qu.:    0   1st Qu.:    0  
 Median :    0   Median :    0   Median :    0   Median :    0   Median :    0   Median :    0  
 Mean   : -202   Mean   : -206   Mean   : -208   Mean   : -208   Mean   : -205   Mean   : -207  
 3rd Qu.:   44   3rd Qu.:   44   3rd Qu.:   44   3rd Qu.:   44   3rd Qu.:   44   3rd Qu.:   44  
 Max.   : 6000   Max.   : 1060   Max.   :  713   Max.   : 2520   Max.   : 1121   Max.   : 3000  
     day_21          day_22          day_23          day_24          day_25          day_26     
 Min.   :-9999   Min.   :-9999   Min.   :-9999   Min.   :-9999   Min.   :-9999   Min.   :-9999  
 1st Qu.:    0   1st Qu.:    0   1st Qu.:    0   1st Qu.:    0   1st Qu.:    0   1st Qu.:    0  
 Median :    0   Median :    0   Median :    0   Median :    0   Median :    0   Median :    0  
 Mean   : -205   Mean   : -203   Mean   : -209   Mean   : -210   Mean   : -213   Mean   : -212  
 3rd Qu.:   44   3rd Qu.:   44   3rd Qu.:   44   3rd Qu.:   44   3rd Qu.:   44   3rd Qu.:   44  
 Max.   :  710   Max.   :  625   Max.   : 3333   Max.   : 3333   Max.   :  658   Max.   :  890  
     day_27          day_28          day_29          day_30          day_31     
 Min.   :-9999   Min.   :-9999   Min.   :-9999   Min.   :-9999   Min.   :-9999  
 1st Qu.:    0   1st Qu.:    0   1st Qu.:    0   1st Qu.:    0   1st Qu.:-9999  
 Median :    0   Median :    0   Median :    0   Median :    0   Median :    0  
 Mean   : -210   Mean   : -206   Mean   : -831   Mean   :-1027   Mean   :-4312  
 3rd Qu.:   43   3rd Qu.:   43   3rd Qu.:   42   3rd Qu.:   42   3rd Qu.:    8  
 Max.   : 3000   Max.   :  578   Max.   :  876   Max.   : 1010   Max.   : 1093  

It seems we should melt the days and cast the measurements.

wx_data_melt <- melt(
  data = wx_data,
  id.vars = c("id", "year", "month", "measurement"),
  variable.name = "char_day"
)

summary(wx_data_melt)
       id              year          month       measurement      char_day           value      
 Min.   :130112   Min.   :1893   Min.   : 1.00   PRCP:981615   day_1  : 152404   Min.   :-9999  
 1st Qu.:131635   1st Qu.:1926   1st Qu.: 3.00   SNOW:934619   day_2  : 152404   1st Qu.:    0  
 Median :134063   Median :1956   Median : 6.00   SNWD:847757   day_3  : 152404   Median :    0  
 Mean   :134164   Mean   :1955   Mean   : 6.49   TMAX:980375   day_4  : 152404   Mean   : -389  
 3rd Qu.:135952   3rd Qu.:1984   3rd Qu.:10.00   TMIN:980158   day_5  : 152404   3rd Qu.:   43  
 Max.   :138688   Max.   :2012   Max.   :12.00                 day_6  : 152404   Max.   : 7000  
                                                               (Other):3810100                  

Before casting, let's deal with the NA values properly.

wx_data_melt$value[wx_data_melt$value == -9999] <- NA

Now, let's cast.

wx_data_cast <- dcast(
  data = wx_data_melt,
  formula = id + year + month + char_day ~ measurement,
  value.var = "value"
)

summary(wx_data_cast)
       id              year          month         char_day           PRCP            SNOW      
 Min.   :130112   Min.   :1893   Min.   : 1.0   day_1  : 31740   Min.   :   0    Min.   :   0   
 1st Qu.:131635   1st Qu.:1924   1st Qu.: 4.0   day_2  : 31740   1st Qu.:   0    1st Qu.:   0   
 Median :134063   Median :1954   Median : 7.0   day_3  : 31740   Median :   0    Median :   0   
 Mean   :134138   Mean   :1954   Mean   : 6.5   day_4  : 31740   Mean   :   9    Mean   :   1   
 3rd Qu.:135952   3rd Qu.:1984   3rd Qu.:10.0   day_5  : 31740   3rd Qu.:   1    3rd Qu.:   0   
 Max.   :138688   Max.   :2012   Max.   :12.0   day_6  : 31740   Max.   :7000    Max.   :2520   
                                                (Other):793500   NA's   :40629   NA's   :82212  
      SNWD             TMAX            TMIN      
 Min.   :  0      Min.   : -89    Min.   :-91    
 1st Qu.:  0      1st Qu.:  40    1st Qu.: 23    
 Median :  0      Median :  63    Median : 39    
 Mean   :  2      Mean   :  59    Mean   : 38    
 3rd Qu.:  0      3rd Qu.:  79    3rd Qu.: 55    
 Max.   :604      Max.   :1093    Max.   :118    
 NA's   :209184   NA's   :28291   NA's   :28921  

We can make proper dates (although, if we wanted to skip right to the station low, we don't need this):

fn_make_date <- function(year, month, char_day){

  str_date <- str_join(
    as.character(year),
    as.character(month),
    str_replace(char_day, pattern = "day_", replacement = ""),
    sep = "-"
  )

  ymd(str_date) # lubridate will catch the "silly" dates and parse to NA

}

wx_data_tidy <- summarize(
  wx_data_cast,
  id,
  date = suppressWarnings(fn_make_date(year, month, char_day)),
  prcp = PRCP,
  snow = SNOW,
  snwd = SNWD,
  tmax = TMAX,
  tmin = TMIN
)

# get rid of NA date
wx_data_tidy <- wx_data_tidy[!is.na(wx_data_tidy$date), ]

summary(wx_data_tidy)
       id              date                          prcp            snow            snwd       
 Min.   :130112   Min.   :1893-01-01 00:00:00   Min.   :   0    Min.   :   0    Min.   :  0     
 1st Qu.:131635   1st Qu.:1924-12-17 00:00:00   1st Qu.:   0    1st Qu.:   0    1st Qu.:  0     
 Median :134063   Median :1954-11-27 00:00:00   Median :   0    Median :   0    Median :  0     
 Mean   :134138   Mean   :1954-05-29 17:51:59   Mean   :   9    Mean   :   1    Mean   :  2     
 3rd Qu.:135952   3rd Qu.:1984-01-10 00:00:00   3rd Qu.:   1    3rd Qu.:   0    3rd Qu.:  0     
 Max.   :138688   Max.   :2012-12-31 00:00:00   Max.   :7000    Max.   :2520    Max.   :604     
                                                NA's   :22783   NA's   :64366   NA's   :191338  
      tmax            tmin      
 Min.   : -89    Min.   :-91    
 1st Qu.:  40    1st Qu.: 23    
 Median :  63    Median : 39    
 Mean   :  59    Mean   : 38    
 3rd Qu.:  79    3rd Qu.: 55    
 Max.   :1093    Max.   :118    
 NA's   :10445   NA's   :11075  

Some of those temperatures look suspicious

qplot(x = tmin, data = wx_data_tidy)
stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-9


head(arrange(wx_data_tidy, tmin) )
      id       date prcp snow snwd tmax tmin
1 132999 1916-12-16    0    0    1   37  -91
2 132999 1953-03-21    0    0    0   73  -45
3 132864 1996-02-03    0    0   18  -20  -40
4 137147 1899-02-09    0    0   NA  -18  -40
5 137147 1912-01-12    0    0   NA  -18  -40
6 132864 1996-02-04    0    0   17  -15  -39

I'm going to go out on a limb and discard the -45 on 1953-03-21, and the -91 on 1916-12-16.

wx_data_tidy$tmin[wx_data_tidy$tmin < -41] <- NA

summary(wx_data_tidy)
       id              date                          prcp            snow            snwd       
 Min.   :130112   Min.   :1893-01-01 00:00:00   Min.   :   0    Min.   :   0    Min.   :  0     
 1st Qu.:131635   1st Qu.:1924-12-17 00:00:00   1st Qu.:   0    1st Qu.:   0    1st Qu.:  0     
 Median :134063   Median :1954-11-27 00:00:00   Median :   0    Median :   0    Median :  0     
 Mean   :134138   Mean   :1954-05-29 17:51:59   Mean   :   9    Mean   :   1    Mean   :  2     
 3rd Qu.:135952   3rd Qu.:1984-01-10 00:00:00   3rd Qu.:   1    3rd Qu.:   0    3rd Qu.:  0     
 Max.   :138688   Max.   :2012-12-31 00:00:00   Max.   :7000    Max.   :2520    Max.   :604     
                                                NA's   :22783   NA's   :64366   NA's   :191338  
      tmax            tmin      
 Min.   : -89    Min.   :-40    
 1st Qu.:  40    1st Qu.: 23    
 Median :  63    Median : 39    
 Mean   :  59    Mean   : 38    
 3rd Qu.:  79    3rd Qu.: 55    
 Max.   :1093    Max.   :118    
 NA's   :10445   NA's   :11077  
qplot(x = tmin, data = wx_data_tidy)
stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-10

Now, let's get the min for each station

wx_tmin <- ddply(
  .data = wx_data_tidy,
  .variables = .(id),
  .fun = summarize,
  tmin_hist = min(tmin, na.rm = TRUE),
  date = max(date[tmin == tmin_hist], na.rm = TRUE)
)

wx_tmin
       id tmin_hist       date
1  130112       -31 1996-02-03
2  130133       -36 1899-02-11
3  130600       -38 2009-01-15
4  131402       -34 2009-01-16
5  131533       -31 1912-01-13
6  131635       -29 1996-02-03
7  132724       -38 1899-02-09
8  132789       -31 1996-02-03
9  132864       -40 1996-02-03
10 132977       -36 1912-01-12
11 132999       -35 1912-01-13
12 134063       -35 1996-02-04
13 134142       -34 1912-01-12
14 134735       -37 1912-01-12
15 134894       -35 1912-01-12
16 135769       -30 1912-01-12
17 135796       -27 2009-01-16
18 135952       -34 1963-01-15
19 137147       -40 1912-01-12
20 137161       -35 1912-01-12
21 137979       -34 1905-02-03
22 138296       -34 1996-02-03
23 138688       -32 2009-01-16

This all looks plausible. Now, let's join in the station data.

wx_tmin_station <- join(wx_tmin, stations, by = "id")

wx_tmin_station
       id tmin_hist       date   lat   long state              name
1  130112       -31 1996-02-03 41.07 -92.79    IA       ALBIA 3 NNE
2  130133       -36 1899-02-11 43.07 -94.31    IA        ALGONA 3 W
3  130600       -38 2009-01-15 41.88 -92.28    IA      BELLE PLAINE
4  131402       -34 2009-01-16 43.08 -92.67    IA      CHARLES CITY
5  131533       -31 1912-01-13 40.72 -95.02    IA          CLARINDA
6  131635       -29 1996-02-03 41.79 -90.26    IA           CLINTON
7  132724       -38 1899-02-09 43.43 -94.82    IA   ESTHERVILLE 2 N
8  132789       -31 1996-02-03 41.02 -91.96    IA         FAIRFIELD
9  132864       -40 1996-02-03 42.85 -91.82    IA           FAYETTE
10 132977       -36 1912-01-12 43.28 -93.63    IA FOREST CITY 2 NNE
11 132999       -35 1912-01-13 42.58 -94.20    IA   FORT DODGE 5NNW
12 134063       -35 1996-02-04 41.37 -93.65    IA      INDIANOLA 2W
13 134142       -34 1912-01-12 42.52 -93.25    IA        IOWA FALLS
14 134735       -37 1912-01-12 42.78 -96.15    IA           LE MARS
15 134894       -35 1912-01-12 41.64 -95.79    IA             LOGAN
16 135769       -30 1912-01-12 40.71 -94.24    IA            MT AYR
17 135796       -27 2009-01-16 40.95 -91.56    IA MT PLEASANT 1 SSW
18 135952       -34 1963-01-15 43.05 -92.31    IA       NEW HAMPTON
19 137147       -40 1912-01-12 43.43 -96.17    IA       ROCK RAPIDS
20 137161       -35 1912-01-12 42.40 -94.63    IA     ROCKWELL CITY
21 137979       -34 1905-02-03 42.63 -95.17    IA    STORM LAKE 2 E
22 138296       -34 1996-02-03 42.04 -92.58    IA         TOLEDO 3N
23 138688       -32 2009-01-16 41.28 -91.71    IA        WASHINGTON

It remains to map the data.

iowa_map_data <- map_data('state', region = c("iowa"))

ggplot(wx_tmin_station, mapping = aes(x = long, y = lat)) +
  geom_text(mapping = aes(label = tmin_hist), color = "blue") + 
geom_polygon(data = iowa_map_data, alpha = 0.5)

plot of chunk unnamed-chunk-13

Finally!

NETCDF

library(ncdf)

fo <- open.ncdf("../../data/sweden.temp.01-10.nc")
sweden.temp <- get.var.ncdf(fo, "tasol")
dim(sweden.temp)
[1]   40   66 3287

##