Ian Lyttle
library(knitr)
library(ggplot2)
library(plyr)
library(reshape2)
library(stringr)
library(lubridate)
##
## Attaching package: 'lubridate'
##
## The following object is masked from 'package:plyr':
##
## here
library(maps)
options(width = 100, str = strOptions(vec.len = 2))
opts_chunk$set(comment = NA, tidy = FALSE, fig.align = "center", fig.width = 10,
fig.height = 6, dev = "png")
library(cbapi)
Loading required package: rjson
Loading required package: XML
# as Heike says, my key - get your own :)
setkey("94779290e91e4722bea263a56d4bf2b9e3e53313")
key is saved, now you will be able to access data through the API.
url <- "http://api.census.gov/data/2010/sf1?key=%s&get=P0010001,NAME&for=state:*"
popstate <- read.census(sprintf(url, getkey()))
head(popstate)
P0010001 NAME state
1 4779736 Alabama 01
2 710231 Alaska 02
3 6392017 Arizona 04
4 2915918 Arkansas 05
5 37253956 California 06
6 5029196 Colorado 08
library(foreign)
fname <- file.path("..", "..", "data", "DEMO_F.XPT")
demo <- read.xport(fname)
data_small <- summarize(
demo,
age = RIDAGEEX,
gender = mapvalues(as.factor(RIAGENDR), from = c(1, 2), to = c("male", "female"))
)
summary(data_small)
age gender
Min. : 0 male :5225
1st Qu.:120 female:5312
Median :326
Mean :374
3rd Qu.:609
Max. :959
NA's :682
ggplot(data_small) + geom_boxplot(aes(x = gender, y = age)) + labs(y = "age (months)")
Warning: Removed 682 rows containing non-finite values (stat_boxplot).
First, let's parse the stations' data.
fname <- file.path("..", "..", "data", "ushcn-stations.txt")
#file.exists(fname)
stations <- read.fwf(
file = fname,
widths = c(6, 9, 10, -7, 3, 31, -7, -7, -7, -3),
col.names = c("id", "lat", "long", "state", "name"),
strip.white = TRUE
)
head(stations)
id lat long state name
1 11084 31.06 -87.05 AL BREWTON 3 SSE
2 12813 30.55 -87.88 AL FAIRHOPE 2 NE
3 13160 32.83 -88.13 AL GAINESVILLE LOCK
4 13511 32.70 -87.58 AL GREENSBORO
5 13816 31.87 -86.25 AL HIGHLAND HOME
6 15749 34.74 -87.60 AL MUSCLE SHOALS AP
summary(stations)
id lat long state name
Min. : 11084 Min. :24.6 Min. :-124.3 NY : 57 ASHLAND : 3
1st Qu.:135835 1st Qu.:35.8 1st Qu.:-107.1 CA : 54 COLUMBUS: 3
Median :257292 Median :39.9 Median : -94.6 TX : 49 FAIRMONT: 3
Mean :259101 Mean :39.5 Mean : -95.7 NE : 46 MIAMI : 3
3rd Qu.:368559 3rd Qu.:43.2 3rd Qu.: -83.6 MT : 44 ABERDEEN: 2
Max. :489905 Max. :49.0 Max. : -67.0 OK : 44 ADA : 2
(Other):924 (Other) :1202
So, there are three Miami's. Next, let's look at the station data for Iowa.
fname <- file.path("..", "..", "data", "state13_IA.txt")
#file.exists(fname)
wx_data <- read.fwf(
file = fname,
widths = c(6, 4, 2, 4, rep(c(5, -1, -1, -1), times = 31)),
col.names = c("id", "year", "month", "measurement",
str_join(rep("day_", 31) ,seq(1, 31))),
strip.white = TRUE
)
summary(wx_data)
id year month measurement day_1 day_2
Min. :130112 Min. :1893 Min. : 1.00 PRCP:31665 Min. :-9999 Min. :-9999
1st Qu.:131635 1st Qu.:1926 1st Qu.: 3.00 SNOW:30149 1st Qu.: 0 1st Qu.: 0
Median :134063 Median :1956 Median : 6.00 SNWD:27347 Median : 0 Median : 0
Mean :134164 Mean :1955 Mean : 6.49 TMAX:31625 Mean : -212 Mean : -220
3rd Qu.:135952 3rd Qu.:1984 3rd Qu.:10.00 TMIN:31618 3rd Qu.: 44 3rd Qu.: 44
Max. :138688 Max. :2012 Max. :12.00 Max. : 641 Max. : 835
day_3 day_4 day_5 day_6 day_7 day_8
Min. :-9999 Min. :-9999 Min. :-9999 Min. :-9999 Min. :-9999 Min. :-9999
1st Qu.: 0 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0
Median : 0 Median : 0 Median : 0 Median : 0 Median : 0 Median : 0
Mean : -219 Mean : -214 Mean : -221 Mean : -216 Mean : -213 Mean : -203
3rd Qu.: 44 3rd Qu.: 44 3rd Qu.: 44 3rd Qu.: 43 3rd Qu.: 43 3rd Qu.: 43
Max. : 1010 Max. : 600 Max. : 611 Max. : 557 Max. : 7000 Max. : 576
day_9 day_10 day_11 day_12 day_13 day_14
Min. :-9999 Min. :-9999 Min. :-9999 Min. :-9999 Min. :-9999 Min. :-9999
1st Qu.: 0 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0
Median : 0 Median : 0 Median : 0 Median : 0 Median : 0 Median : 0
Mean : -209 Mean : -211 Mean : -215 Mean : -211 Mean : -209 Mean : -205
3rd Qu.: 43 3rd Qu.: 43 3rd Qu.: 43 3rd Qu.: 44 3rd Qu.: 43 3rd Qu.: 44
Max. : 1015 Max. : 720 Max. : 550 Max. : 651 Max. : 650 Max. : 638
day_15 day_16 day_17 day_18 day_19 day_20
Min. :-9999 Min. :-9999 Min. :-9999 Min. :-9999 Min. :-9999 Min. :-9999
1st Qu.: 0 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0
Median : 0 Median : 0 Median : 0 Median : 0 Median : 0 Median : 0
Mean : -202 Mean : -206 Mean : -208 Mean : -208 Mean : -205 Mean : -207
3rd Qu.: 44 3rd Qu.: 44 3rd Qu.: 44 3rd Qu.: 44 3rd Qu.: 44 3rd Qu.: 44
Max. : 6000 Max. : 1060 Max. : 713 Max. : 2520 Max. : 1121 Max. : 3000
day_21 day_22 day_23 day_24 day_25 day_26
Min. :-9999 Min. :-9999 Min. :-9999 Min. :-9999 Min. :-9999 Min. :-9999
1st Qu.: 0 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0
Median : 0 Median : 0 Median : 0 Median : 0 Median : 0 Median : 0
Mean : -205 Mean : -203 Mean : -209 Mean : -210 Mean : -213 Mean : -212
3rd Qu.: 44 3rd Qu.: 44 3rd Qu.: 44 3rd Qu.: 44 3rd Qu.: 44 3rd Qu.: 44
Max. : 710 Max. : 625 Max. : 3333 Max. : 3333 Max. : 658 Max. : 890
day_27 day_28 day_29 day_30 day_31
Min. :-9999 Min. :-9999 Min. :-9999 Min. :-9999 Min. :-9999
1st Qu.: 0 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0 1st Qu.:-9999
Median : 0 Median : 0 Median : 0 Median : 0 Median : 0
Mean : -210 Mean : -206 Mean : -831 Mean :-1027 Mean :-4312
3rd Qu.: 43 3rd Qu.: 43 3rd Qu.: 42 3rd Qu.: 42 3rd Qu.: 8
Max. : 3000 Max. : 578 Max. : 876 Max. : 1010 Max. : 1093
It seems we should melt the days and cast the measurements.
wx_data_melt <- melt(
data = wx_data,
id.vars = c("id", "year", "month", "measurement"),
variable.name = "char_day"
)
summary(wx_data_melt)
id year month measurement char_day value
Min. :130112 Min. :1893 Min. : 1.00 PRCP:981615 day_1 : 152404 Min. :-9999
1st Qu.:131635 1st Qu.:1926 1st Qu.: 3.00 SNOW:934619 day_2 : 152404 1st Qu.: 0
Median :134063 Median :1956 Median : 6.00 SNWD:847757 day_3 : 152404 Median : 0
Mean :134164 Mean :1955 Mean : 6.49 TMAX:980375 day_4 : 152404 Mean : -389
3rd Qu.:135952 3rd Qu.:1984 3rd Qu.:10.00 TMIN:980158 day_5 : 152404 3rd Qu.: 43
Max. :138688 Max. :2012 Max. :12.00 day_6 : 152404 Max. : 7000
(Other):3810100
Before casting, let's deal with the NA values properly.
wx_data_melt$value[wx_data_melt$value == -9999] <- NA
Now, let's cast.
wx_data_cast <- dcast(
data = wx_data_melt,
formula = id + year + month + char_day ~ measurement,
value.var = "value"
)
summary(wx_data_cast)
id year month char_day PRCP SNOW
Min. :130112 Min. :1893 Min. : 1.0 day_1 : 31740 Min. : 0 Min. : 0
1st Qu.:131635 1st Qu.:1924 1st Qu.: 4.0 day_2 : 31740 1st Qu.: 0 1st Qu.: 0
Median :134063 Median :1954 Median : 7.0 day_3 : 31740 Median : 0 Median : 0
Mean :134138 Mean :1954 Mean : 6.5 day_4 : 31740 Mean : 9 Mean : 1
3rd Qu.:135952 3rd Qu.:1984 3rd Qu.:10.0 day_5 : 31740 3rd Qu.: 1 3rd Qu.: 0
Max. :138688 Max. :2012 Max. :12.0 day_6 : 31740 Max. :7000 Max. :2520
(Other):793500 NA's :40629 NA's :82212
SNWD TMAX TMIN
Min. : 0 Min. : -89 Min. :-91
1st Qu.: 0 1st Qu.: 40 1st Qu.: 23
Median : 0 Median : 63 Median : 39
Mean : 2 Mean : 59 Mean : 38
3rd Qu.: 0 3rd Qu.: 79 3rd Qu.: 55
Max. :604 Max. :1093 Max. :118
NA's :209184 NA's :28291 NA's :28921
We can make proper dates (although, if we wanted to skip right to the station low, we don't need this):
fn_make_date <- function(year, month, char_day){
str_date <- str_join(
as.character(year),
as.character(month),
str_replace(char_day, pattern = "day_", replacement = ""),
sep = "-"
)
ymd(str_date) # lubridate will catch the "silly" dates and parse to NA
}
wx_data_tidy <- summarize(
wx_data_cast,
id,
date = suppressWarnings(fn_make_date(year, month, char_day)),
prcp = PRCP,
snow = SNOW,
snwd = SNWD,
tmax = TMAX,
tmin = TMIN
)
# get rid of NA date
wx_data_tidy <- wx_data_tidy[!is.na(wx_data_tidy$date), ]
summary(wx_data_tidy)
id date prcp snow snwd
Min. :130112 Min. :1893-01-01 00:00:00 Min. : 0 Min. : 0 Min. : 0
1st Qu.:131635 1st Qu.:1924-12-17 00:00:00 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0
Median :134063 Median :1954-11-27 00:00:00 Median : 0 Median : 0 Median : 0
Mean :134138 Mean :1954-05-29 17:51:59 Mean : 9 Mean : 1 Mean : 2
3rd Qu.:135952 3rd Qu.:1984-01-10 00:00:00 3rd Qu.: 1 3rd Qu.: 0 3rd Qu.: 0
Max. :138688 Max. :2012-12-31 00:00:00 Max. :7000 Max. :2520 Max. :604
NA's :22783 NA's :64366 NA's :191338
tmax tmin
Min. : -89 Min. :-91
1st Qu.: 40 1st Qu.: 23
Median : 63 Median : 39
Mean : 59 Mean : 38
3rd Qu.: 79 3rd Qu.: 55
Max. :1093 Max. :118
NA's :10445 NA's :11075
Some of those temperatures look suspicious
qplot(x = tmin, data = wx_data_tidy)
stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
head(arrange(wx_data_tidy, tmin) )
id date prcp snow snwd tmax tmin
1 132999 1916-12-16 0 0 1 37 -91
2 132999 1953-03-21 0 0 0 73 -45
3 132864 1996-02-03 0 0 18 -20 -40
4 137147 1899-02-09 0 0 NA -18 -40
5 137147 1912-01-12 0 0 NA -18 -40
6 132864 1996-02-04 0 0 17 -15 -39
I'm going to go out on a limb and discard the -45 on 1953-03-21, and the -91 on 1916-12-16.
wx_data_tidy$tmin[wx_data_tidy$tmin < -41] <- NA
summary(wx_data_tidy)
id date prcp snow snwd
Min. :130112 Min. :1893-01-01 00:00:00 Min. : 0 Min. : 0 Min. : 0
1st Qu.:131635 1st Qu.:1924-12-17 00:00:00 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0
Median :134063 Median :1954-11-27 00:00:00 Median : 0 Median : 0 Median : 0
Mean :134138 Mean :1954-05-29 17:51:59 Mean : 9 Mean : 1 Mean : 2
3rd Qu.:135952 3rd Qu.:1984-01-10 00:00:00 3rd Qu.: 1 3rd Qu.: 0 3rd Qu.: 0
Max. :138688 Max. :2012-12-31 00:00:00 Max. :7000 Max. :2520 Max. :604
NA's :22783 NA's :64366 NA's :191338
tmax tmin
Min. : -89 Min. :-40
1st Qu.: 40 1st Qu.: 23
Median : 63 Median : 39
Mean : 59 Mean : 38
3rd Qu.: 79 3rd Qu.: 55
Max. :1093 Max. :118
NA's :10445 NA's :11077
qplot(x = tmin, data = wx_data_tidy)
stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
Now, let's get the min for each station
wx_tmin <- ddply(
.data = wx_data_tidy,
.variables = .(id),
.fun = summarize,
tmin_hist = min(tmin, na.rm = TRUE),
date = max(date[tmin == tmin_hist], na.rm = TRUE)
)
wx_tmin
id tmin_hist date
1 130112 -31 1996-02-03
2 130133 -36 1899-02-11
3 130600 -38 2009-01-15
4 131402 -34 2009-01-16
5 131533 -31 1912-01-13
6 131635 -29 1996-02-03
7 132724 -38 1899-02-09
8 132789 -31 1996-02-03
9 132864 -40 1996-02-03
10 132977 -36 1912-01-12
11 132999 -35 1912-01-13
12 134063 -35 1996-02-04
13 134142 -34 1912-01-12
14 134735 -37 1912-01-12
15 134894 -35 1912-01-12
16 135769 -30 1912-01-12
17 135796 -27 2009-01-16
18 135952 -34 1963-01-15
19 137147 -40 1912-01-12
20 137161 -35 1912-01-12
21 137979 -34 1905-02-03
22 138296 -34 1996-02-03
23 138688 -32 2009-01-16
This all looks plausible. Now, let's join in the station data.
wx_tmin_station <- join(wx_tmin, stations, by = "id")
wx_tmin_station
id tmin_hist date lat long state name
1 130112 -31 1996-02-03 41.07 -92.79 IA ALBIA 3 NNE
2 130133 -36 1899-02-11 43.07 -94.31 IA ALGONA 3 W
3 130600 -38 2009-01-15 41.88 -92.28 IA BELLE PLAINE
4 131402 -34 2009-01-16 43.08 -92.67 IA CHARLES CITY
5 131533 -31 1912-01-13 40.72 -95.02 IA CLARINDA
6 131635 -29 1996-02-03 41.79 -90.26 IA CLINTON
7 132724 -38 1899-02-09 43.43 -94.82 IA ESTHERVILLE 2 N
8 132789 -31 1996-02-03 41.02 -91.96 IA FAIRFIELD
9 132864 -40 1996-02-03 42.85 -91.82 IA FAYETTE
10 132977 -36 1912-01-12 43.28 -93.63 IA FOREST CITY 2 NNE
11 132999 -35 1912-01-13 42.58 -94.20 IA FORT DODGE 5NNW
12 134063 -35 1996-02-04 41.37 -93.65 IA INDIANOLA 2W
13 134142 -34 1912-01-12 42.52 -93.25 IA IOWA FALLS
14 134735 -37 1912-01-12 42.78 -96.15 IA LE MARS
15 134894 -35 1912-01-12 41.64 -95.79 IA LOGAN
16 135769 -30 1912-01-12 40.71 -94.24 IA MT AYR
17 135796 -27 2009-01-16 40.95 -91.56 IA MT PLEASANT 1 SSW
18 135952 -34 1963-01-15 43.05 -92.31 IA NEW HAMPTON
19 137147 -40 1912-01-12 43.43 -96.17 IA ROCK RAPIDS
20 137161 -35 1912-01-12 42.40 -94.63 IA ROCKWELL CITY
21 137979 -34 1905-02-03 42.63 -95.17 IA STORM LAKE 2 E
22 138296 -34 1996-02-03 42.04 -92.58 IA TOLEDO 3N
23 138688 -32 2009-01-16 41.28 -91.71 IA WASHINGTON
It remains to map the data.
iowa_map_data <- map_data('state', region = c("iowa"))
ggplot(wx_tmin_station, mapping = aes(x = long, y = lat)) +
geom_text(mapping = aes(label = tmin_hist), color = "blue") +
geom_polygon(data = iowa_map_data, alpha = 0.5)
Finally!
library(ncdf)
fo <- open.ncdf("../../data/sweden.temp.01-10.nc")
sweden.temp <- get.var.ncdf(fo, "tasol")
dim(sweden.temp)
[1] 40 66 3287
##