Load up datasets from nycflights13
data(flights)
data(airports)
data(planes)airports %>%
semi_join(flights, c("faa" = "dest")) %>%
ggplot(aes(lon, lat)) +
borders("state") +
geom_point() +
coord_quickmap()# compute the average delay by destination
flights %>%
group_by(dest) %>%
filter(!is.na(arr_delay)) %>%
summarise(mean_delay = mean(arr_delay)) %>%
left_join(airports, by = c("dest" = "faa")) %>%
filter(!is.na(lat) & !is.na(lon)) %>%
filter(lon > (-140)) %>% # remove info out of cont US
# if yes(mean_delay < 0), change to zero / if no(mean_delay >= 0), leave it without changing
mutate(mean_delay = ifelse(mean_delay < 0, 0, mean_delay)) %>%
ggplot(aes(x = lon, y = lat, col = mean_delay, size = mean_delay)) +
borders("state") +
geom_point() +
coord_quickmap() +
xlab("Longitude") +
ylab("Latitude")flights %>%
left_join(airports %>% select(faa, lat, lon),
by = c("origin" = "faa")) %>%
left_join(airports %>% select(faa,lat,lon),
by = c("dest" = "faa"), suffix = c("", " .dest"))## # A tibble: 336,776 x 23
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ... with 336,766 more rows, and 15 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## # lat <dbl>, lon <dbl>, lat .dest <dbl>, lon .dest <dbl>
flights %>% select(year, month, day, carrier, tailnum, dep_delay, arr_delay) %>%
left_join(planes %>% select(tailnum, year),
by = "tailnum", suffix = c("", "_built")) %>%
mutate(plane_age = (year - year_built)) %>%
filter(!is.na(plane_age) & !is.na(dep_delay) &
!is.na(arr_delay) & dep_delay < 500) %>% #dep_delay < 500 removes extreme outliers
select(tailnum, dep_delay, arr_delay, plane_age) %>%
gather(key = type, value = delay, -c(1,4)) %>% # -c(1,4) changes the order of the table (check columns)
ggplot(aes(jitter(plane_age), delay, color = delay)) +
geom_point(size = 0.2, alpha = 0.1) +
facet_wrap(type~.) +
geom_smooth(method = "gam",formula = y ~ s(x, bs = "cs")) +
xlab("Plane Age") +
ylab("Delay (in minutes)")# without geom_point
flights %>% select(year, month, day, carrier, tailnum, dep_delay, arr_delay) %>%
left_join(planes %>% select(tailnum, year),
by = "tailnum", suffix = c("", "_built")) %>%
mutate(plane_age = (year - year_built)) %>%
filter(!is.na(plane_age) & !is.na(dep_delay) &
!is.na(arr_delay) & dep_delay < 500) %>% #dep_delay < 500 removes extreme outliers
select(tailnum, dep_delay, arr_delay, plane_age) %>%
gather(key = type, value = delay, -c(1,4)) %>% # -c(1,4) changes the order of the table (check columns)
ggplot(aes(jitter(plane_age), delay, color = delay)) +
facet_wrap(type~.) +
geom_smooth(method = "gam", formula = y ~ s(x, bs = "cs")) +
xlab("Plane Age") +
ylab("Delay (in minutes)")state.x77 matrix#your code here
library(datasets)
my_fn <- function(the_matrix){
apply(the_matrix, MARGIN = 2, FUN = function(a_col){
output = (a_col - mean(a_col))/sd(a_col)
return(output)
})
}
my_fn(state.x77)## Population Income Illiteracy Life Exp Murder
## Alabama -0.14143156 -1.32113867 1.525758 -1.362193670 2.091810096
## Alaska -0.86939802 3.05824562 0.541398 -1.168509784 1.062429318
## Arizona -0.45568908 0.15330286 1.033578 -0.244786635 0.114315444
## Arkansas -0.47853603 -1.72148373 1.197638 -0.162843452 0.737361704
## California 3.79697895 1.10371551 -0.114842 0.619341473 0.791539640
## Colorado -0.38199648 0.72940916 -0.771082 0.880069781 -0.156574234
## Connecticut -0.25678625 1.48453153 -0.114842 1.192943751 -1.158866044
## Delaware -0.82146423 0.60735274 -0.442962 -0.609806266 -0.319108041
## Florida 0.90280832 0.61711725 0.213278 -0.162843452 0.899895511
## Georgia 0.15333885 -0.56113404 1.361698 -1.742112063 1.766742482
## Hawaii -0.75673121 0.85797525 1.197638 2.027274338 -0.319108041
## Idaho -0.76905064 -0.51556631 -0.935142 0.738531557 -0.562908752
## Illinois 1.55685818 1.09232357 -0.442962 -0.550211224 0.791539640
## Indiana 0.23890291 0.03612870 -0.771082 0.001042913 -0.075307331
## Iowa -0.31031978 0.31278991 -1.099202 1.252538793 -1.375577787
## Kansas -0.44045778 0.37951409 -0.935142 1.267437554 -0.779620494
## Kentucky -0.19250121 -1.17792581 0.705458 -0.580008745 0.872806544
## Louisiana -0.09864953 -1.44970477 2.674178 -1.578225697 1.577119707
## Maine -0.71417317 -1.20721935 -0.771082 -0.363976718 -1.267221915
## Maryland -0.02786880 1.40478801 -0.442962 -0.490616183 0.303938219
## Massachusetts 0.35112174 0.51947212 -0.114842 0.708734036 -1.104688108
## Michigan 1.08961571 0.51296244 -0.442962 -0.185191593 1.008251383
## Minnesota -0.07289072 0.38927860 -0.935142 1.550514003 -1.375577787
## Mississippi -0.42679441 -2.17716102 2.017938 -2.077334173 1.387496932
## Missouri 0.11660455 -0.29586476 -0.607022 -0.140495311 0.520649962
## Montana -0.78405795 -0.14451480 -0.935142 -0.237337254 -0.644175655
## Nebraska -0.60531419 0.11749965 -0.935142 1.282336314 -1.213043980
## Nevada -0.81900034 1.16067517 -1.099202 -1.377092431 1.116607254
## New Hampshire -0.76927463 -0.25192445 -0.771082 0.261771222 -1.104688108
## New Jersey 0.69136206 1.30388803 -0.114842 0.038289814 -0.589997719
## New Mexico -0.69491006 -1.35856931 1.689818 -0.416122380 0.629005833
## New York 3.09768318 0.76033012 0.377338 -0.244786635 0.954073447
## North Carolina 0.26757359 -0.91265653 1.033578 -1.243003587 1.008251383
## North Dakota -0.80847282 1.05977520 -0.607022 1.416425159 -1.619378497
## Ohio 1.45337495 0.20375285 -0.607022 -0.043653368 0.005959573
## Oklahoma -0.34302227 -0.73689528 -0.114842 0.403309446 -0.264930105
## Oregon -0.43956182 0.36486732 -0.935142 0.932215443 -0.860887398
## Pennsylvania 1.70536334 0.02148193 -0.278902 -0.334179197 -0.346197009
## Rhode Island -0.74261986 0.19887059 0.213278 0.760879698 -1.348488819
## South Carolina -0.32039932 -1.30323706 1.853878 -2.174176116 1.143696222
## South Dakota -0.79861728 -0.43745020 -1.099202 0.894968542 -1.538111594
## Tennessee -0.01644532 -1.00053715 0.869518 -0.572559365 0.981162415
## Texas 1.78980744 -0.40327441 1.689818 0.015941674 1.306230029
## Utah -0.68169467 -0.67342595 -0.935142 1.505817721 -0.779620494
## Vermont -0.84543112 -0.86057912 -0.935142 0.567195811 -0.508730816
## Virginia 0.16453834 0.43159150 0.377338 -0.594907506 0.574827897
## Washington -0.15397498 0.69686078 -0.935142 0.626790853 -0.833798430
## West Virginia -0.54819682 -1.33253061 0.377338 -1.041870320 -0.183663202
## Wisconsin 0.07673438 0.05240289 -0.771082 1.192943751 -1.185955012
## Wyoming -0.86693413 0.21188994 -0.935142 -0.438470521 -0.129485266
## HS Grad Frost Area
## Alabama -1.46192933 -1.62482920 -0.23471832
## Alaska 1.68280347 0.91456761 5.80934967
## Arizona 0.61805142 -1.72101848 0.50020474
## Arkansas -1.63526106 -0.75912574 -0.22022120
## California 1.17518912 -1.62482920 1.00349033
## Colorado 1.33614002 1.18389757 0.38709909
## Connecticut 0.35805383 0.66447550 -0.77201412
## Delaware 0.18472210 -0.02808727 -0.80576651
## Florida -0.06289466 -1.79796989 -0.19508270
## Georgia -1.54859519 -0.85531502 -0.14840362
## Hawaii 1.08852326 -2.00958630 -0.75369642
## Idaho 0.79138315 0.41438339 0.13994490
## Illinois -0.06289466 0.43362124 -0.17565164
## Indiana -0.02575214 0.33743197 -0.40595308
## Iowa 0.72947896 0.68371335 -0.17338976
## Kansas 0.84090650 0.18352913 0.12951447
## Kentucky -1.80859279 -0.18199010 -0.36431342
## Louisiana -1.35050179 -1.77873204 -0.30243404
## Maine 0.19710294 1.08770830 -0.46662534
## Maryland -0.10003717 -0.06656298 -0.71307636
## Massachusetts 0.66757477 -0.02808727 -0.73727729
## Michigan -0.03813298 0.39514553 -0.16312341
## Minnesota 0.55614723 1.06847045 0.10023896
## Mississippi -1.49907184 -1.04769356 -0.27470552
## Missouri -0.53336649 0.06810201 -0.02040238
## Montana 0.75424064 0.97228117 0.87722359
## Nebraska 0.76662148 0.66447550 0.06735382
## Nevada 1.49709091 1.60713038 0.45885807
## New Hampshire 0.55614723 1.33780041 -0.72320207
## New Jersey -0.07527549 0.20276699 -0.74085176
## New Mexico 0.25900713 0.29895626 0.59390277
## New York -0.05051382 -0.43208222 -0.26843554
## North Carolina -1.80859279 -0.47055792 -0.25710271
## North Dakota -0.34765393 1.56865467 -0.01714434
## Ohio 0.01139037 0.37590768 -0.34878497
## Oklahoma -0.18670303 -0.43208222 -0.02289865
## Oregon 0.85328734 -1.16312069 0.29824124
## Pennsylvania -0.36003476 0.41438339 -0.30201214
## Rhode Island -0.83050660 0.43362124 -0.81670087
## South Carolina -1.89525865 -0.75912574 -0.47477044
## South Dakota 0.02377121 1.29932470 0.06116589
## Tennessee -1.40002514 -0.66293647 -0.34464796
## Texas -0.70669822 -1.33626138 2.24310532
## Utah 1.75708850 0.62599979 0.13313582
## Vermont 0.49424305 1.22237328 -0.72038937
## Virginia -0.65717487 -0.37436865 -0.36278987
## Washington 1.28661666 -1.39397495 -0.04882236
## West Virginia -1.42478681 -0.08580083 -0.54690445
## Wisconsin 0.17234126 0.85685405 -0.19069958
## Wyoming 1.21233164 1.31856256 0.31018349
apply and lapply write a function to obtain 10 subsamples of from state.x77 by selecting 20 rows at random each time, and calculate the column means for each of these subsamples.#your code here
lapply(1:10, function(X){
my_sample <- state.x77[sample(nrow(state.x77), 20),]
apply(my_sample, 2, mean)
})## [[1]]
## Population Income Illiteracy Life Exp Murder HS Grad Frost
## 4246.150 4313.900 1.145 70.675 7.505 52.875 107.950
## Area
## 61828.700
##
## [[2]]
## Population Income Illiteracy Life Exp Murder HS Grad Frost
## 3626.400 4537.600 1.215 70.675 7.650 53.490 99.250
## Area
## 77004.150
##
## [[3]]
## Population Income Illiteracy Life Exp Murder HS Grad Frost
## 4880.3500 4318.5000 1.1950 70.9845 7.1950 51.4350 110.3500
## Area
## 60000.9500
##
## [[4]]
## Population Income Illiteracy Life Exp Murder HS Grad Frost
## 5520.1000 4597.1000 1.0150 71.0145 7.3900 53.8650 106.6000
## Area
## 54259.2500
##
## [[5]]
## Population Income Illiteracy Life Exp Murder HS Grad Frost
## 4763.5000 4596.5500 1.2500 70.7645 7.2900 52.5850 97.0000
## Area
## 90984.1000
##
## [[6]]
## Population Income Illiteracy Life Exp Murder HS Grad Frost
## 4355.7500 4676.7000 1.1350 70.8155 7.7600 55.1100 106.9000
## Area
## 84311.1000
##
## [[7]]
## Population Income Illiteracy Life Exp Murder HS Grad Frost
## 4552.100 4523.200 1.185 70.587 7.860 53.455 99.850
## Area
## 80996.700
##
## [[8]]
## Population Income Illiteracy Life Exp Murder HS Grad Frost
## 2851.650 4572.150 1.000 71.288 5.775 56.210 118.600
## Area
## 58020.800
##
## [[9]]
## Population Income Illiteracy Life Exp Murder HS Grad Frost
## 3327.6000 4597.2500 1.1250 70.9225 7.0850 54.3850 105.0500
## Area
## 83554.0000
##
## [[10]]
## Population Income Illiteracy Life Exp Murder HS Grad Frost
## 3478.3500 4321.9000 1.3250 70.9245 7.2300 52.1750 98.3500
## Area
## 49230.0500
lapply by sapply#your code here
sapply(1:10, function(X){
my_sample <- state.x77[sample(nrow(state.x77), 20),]
apply(my_sample, 2, mean)
})## [,1] [,2] [,3] [,4] [,5] [,6]
## Population 4912.000 4381.8000 3661.1500 5878.300 4265.100 3243.5500
## Income 4386.350 4434.5000 4428.2500 4407.650 4469.100 4190.9500
## Illiteracy 1.285 1.0900 1.3700 1.200 1.220 1.3050
## Life Exp 70.808 71.0705 70.5865 70.679 70.888 70.7855
## Murder 7.790 7.1500 8.4650 7.925 7.720 8.0750
## HS Grad 51.395 53.4500 51.5400 51.055 54.145 50.5100
## Frost 95.000 109.6000 96.7000 99.450 103.800 98.7500
## Area 82496.150 99756.8000 82027.2500 54633.100 104766.900 51598.9500
## [,7] [,8] [,9] [,10]
## Population 5171.650 4021.400 4475.200 5014.150
## Income 4428.550 4534.350 4263.000 4454.450
## Illiteracy 1.110 1.090 1.220 1.095
## Life Exp 70.741 71.223 70.731 70.866
## Murder 7.815 6.440 7.665 7.260
## HS Grad 53.225 55.060 51.635 52.690
## Frost 101.300 116.850 101.800 102.050
## Area 65799.500 92610.350 60990.500 56734.250
Fun, let’s do this 1000 times and plot the bootstrap distribution of the means for each variable.
#your code here
sapply(1:1000, function(X){
my_sample <- state.x77[sample(nrow(state.x77), 20),]
apply(my_sample, 2, mean)
}) %>%
t() %>% # calculate transpose of a matrix of df
as_tibble() %>% # look more nicely w tibble
pivot_longer(cols = Population:Area, # pivot_longer makes datasets longer by increasing the # of rows and decreasing the # of columns
names_to = "Variable", # (can check that rows are increased from 1000 to 8000)
values_to = "Mean") %>%
arrange(Variable) %>%
ggplot() +
geom_density(aes(Mean)) +
facet_wrap(Variable~., scales = "free")