## https://mondpanther.github.io/datastorieshub/posts/exercises/exercises2/
df <- read.csv("/Users/isaacharris/Downloads/auto.csv", header = TRUE)
df
## X make price mpg rep78 headroom trunk weight length turn
## 1 1 AMC Concord 4099 22 3 2.5 11 2930 186 40
## 2 2 AMC Pacer 4749 17 3 3.0 11 3350 173 40
## 3 3 AMC Spirit 3799 22 NA 3.0 12 2640 168 35
## 4 4 Buick Century 4816 20 3 4.5 16 3250 196 40
## 5 5 Buick Electra 7827 15 4 4.0 20 4080 222 43
## 6 6 Buick LeSabre 5788 18 3 4.0 21 3670 218 43
## 7 7 Buick Opel 4453 26 NA 3.0 10 2230 170 34
## 8 8 Buick Regal 5189 20 3 2.0 16 3280 200 42
## 9 9 Buick Riviera 10372 16 3 3.5 17 3880 207 43
## 10 10 Buick Skylark 4082 19 3 3.5 13 3400 200 42
## 11 11 Cad. Deville 11385 14 3 4.0 20 4330 221 44
## 12 12 Cad. Eldorado 14500 14 2 3.5 16 3900 204 43
## 13 13 Cad. Seville 15906 21 3 3.0 13 4290 204 45
## 14 14 Chev. Chevette 3299 29 3 2.5 9 2110 163 34
## 15 15 Chev. Impala 5705 16 4 4.0 20 3690 212 43
## 16 16 Chev. Malibu 4504 22 3 3.5 17 3180 193 31
## 17 17 Chev. Monte Carlo 5104 22 2 2.0 16 3220 200 41
## 18 18 Chev. Monza 3667 24 2 2.0 7 2750 179 40
## 19 19 Chev. Nova 3955 19 3 3.5 13 3430 197 43
## 20 20 Dodge Colt 3984 30 5 2.0 8 2120 163 35
## 21 21 Dodge Diplomat 4010 18 2 4.0 17 3600 206 46
## 22 22 Dodge Magnum 5886 16 2 4.0 17 3600 206 46
## 23 23 Dodge St. Regis 6342 17 2 4.5 21 3740 220 46
## 24 24 Ford Fiesta 4389 28 4 1.5 9 1800 147 33
## 25 25 Ford Mustang 4187 21 3 2.0 10 2650 179 43
## 26 26 Linc. Continental 11497 12 3 3.5 22 4840 233 51
## 27 27 Linc. Mark V 13594 12 3 2.5 18 4720 230 48
## 28 28 Linc. Versailles 13466 14 3 3.5 15 3830 201 41
## 29 29 Merc. Bobcat 3829 22 4 3.0 9 2580 169 39
## 30 30 Merc. Cougar 5379 14 4 3.5 16 4060 221 48
## 31 31 Merc. Marquis 6165 15 3 3.5 23 3720 212 44
## 32 32 Merc. Monarch 4516 18 3 3.0 15 3370 198 41
## 33 33 Merc. XR-7 6303 14 4 3.0 16 4130 217 45
## 34 34 Merc. Zephyr 3291 20 3 3.5 17 2830 195 43
## 35 35 Olds 98 8814 21 4 4.0 20 4060 220 43
## 36 36 Olds Cutl Supr 5172 19 3 2.0 16 3310 198 42
## 37 37 Olds Cutlass 4733 19 3 4.5 16 3300 198 42
## 38 38 Olds Delta 88 4890 18 4 4.0 20 3690 218 42
## 39 39 Olds Omega 4181 19 3 4.5 14 3370 200 43
## 40 40 Olds Starfire 4195 24 1 2.0 10 2730 180 40
## 41 41 Olds Toronado 10371 16 3 3.5 17 4030 206 43
## 42 42 Plym. Arrow 4647 28 3 2.0 11 3260 170 37
## 43 43 Plym. Champ 4425 34 5 2.5 11 1800 157 37
## 44 44 Plym. Horizon 4482 25 3 4.0 17 2200 165 36
## 45 45 Plym. Sapporo 6486 26 NA 1.5 8 2520 182 38
## 46 46 Plym. Volare 4060 18 2 5.0 16 3330 201 44
## 47 47 Pont. Catalina 5798 18 4 4.0 20 3700 214 42
## 48 48 Pont. Firebird 4934 18 1 1.5 7 3470 198 42
## 49 49 Pont. Grand Prix 5222 19 3 2.0 16 3210 201 45
## 50 50 Pont. Le Mans 4723 19 3 3.5 17 3200 199 40
## 51 51 Pont. Phoenix 4424 19 NA 3.5 13 3420 203 43
## 52 52 Pont. Sunbird 4172 24 2 2.0 7 2690 179 41
## 53 53 Audi 5000 9690 17 5 3.0 15 2830 189 37
## 54 54 Audi Fox 6295 23 3 2.5 11 2070 174 36
## 55 55 BMW 320i 9735 25 4 2.5 12 2650 177 34
## 56 56 Datsun 200 6229 23 4 1.5 6 2370 170 35
## 57 57 Datsun 210 4589 35 5 2.0 8 2020 165 32
## 58 58 Datsun 510 5079 24 4 2.5 8 2280 170 34
## 59 59 Datsun 810 8129 21 4 2.5 8 2750 184 38
## 60 60 Fiat Strada 4296 21 3 2.5 16 2130 161 36
## 61 61 Honda Accord 5799 25 5 3.0 10 2240 172 36
## 62 62 Honda Civic 4499 28 4 2.5 5 1760 149 34
## 63 63 Mazda GLC 3995 30 4 3.5 11 1980 154 33
## 64 64 Peugeot 604 12990 14 NA 3.5 14 3420 192 38
## 65 65 Renault Le Car 3895 26 3 3.0 10 1830 142 34
## 66 66 Subaru 3798 35 5 2.5 11 2050 164 36
## 67 67 Toyota Celica 5899 18 5 2.5 14 2410 174 36
## 68 68 Toyota Corolla 3748 31 5 3.0 9 2200 165 35
## 69 69 Toyota Corona 5719 18 5 2.0 11 2670 175 36
## 70 70 VW Dasher 7140 23 4 2.5 12 2160 172 36
## 71 71 VW Diesel 5397 41 5 3.0 15 2040 155 35
## 72 72 VW Rabbit 4697 25 4 3.0 15 1930 155 35
## 73 73 VW Scirocco 6850 25 4 2.0 16 1990 156 36
## 74 74 Volvo 260 11995 17 5 2.5 14 3170 193 37
## displacement gear_ratio foreign
## 1 121 3.58 0
## 2 258 2.53 0
## 3 121 3.08 0
## 4 196 2.93 0
## 5 350 2.41 0
## 6 231 2.73 0
## 7 304 2.87 0
## 8 196 2.93 0
## 9 231 2.93 0
## 10 231 3.08 0
## 11 425 2.28 0
## 12 350 2.19 0
## 13 350 2.24 0
## 14 231 2.93 0
## 15 250 2.56 0
## 16 200 2.73 0
## 17 200 2.73 0
## 18 151 2.73 0
## 19 250 2.56 0
## 20 98 3.54 0
## 21 318 2.47 0
## 22 318 2.47 0
## 23 225 2.94 0
## 24 98 3.15 0
## 25 140 3.08 0
## 26 400 2.47 0
## 27 400 2.47 0
## 28 302 2.47 0
## 29 140 2.73 0
## 30 302 2.75 0
## 31 302 2.26 0
## 32 250 2.43 0
## 33 302 2.75 0
## 34 140 3.08 0
## 35 350 2.41 0
## 36 231 2.93 0
## 37 231 2.93 0
## 38 231 2.73 0
## 39 231 3.08 0
## 40 151 2.73 0
## 41 350 2.41 0
## 42 156 3.05 0
## 43 86 2.97 0
## 44 105 3.37 0
## 45 119 3.54 0
## 46 225 3.23 0
## 47 231 2.73 0
## 48 231 3.08 0
## 49 231 2.93 0
## 50 231 2.93 0
## 51 231 3.08 0
## 52 151 2.73 0
## 53 131 3.20 1
## 54 97 3.70 1
## 55 121 3.64 1
## 56 119 3.89 1
## 57 85 3.70 1
## 58 119 3.54 1
## 59 146 3.55 1
## 60 105 3.37 1
## 61 107 3.05 1
## 62 91 3.30 1
## 63 86 3.73 1
## 64 163 3.58 1
## 65 79 3.72 1
## 66 97 3.81 1
## 67 134 3.06 1
## 68 97 3.21 1
## 69 134 3.05 1
## 70 97 3.74 1
## 71 90 3.78 1
## 72 89 3.78 1
## 73 97 3.78 1
## 74 163 2.98 1
## this gives us summary of each variable - including mean
summary(df)
## X make price mpg
## Min. : 1.00 Length:74 Min. : 3291 Min. :12.00
## 1st Qu.:19.25 Class :character 1st Qu.: 4220 1st Qu.:18.00
## Median :37.50 Mode :character Median : 5006 Median :20.00
## Mean :37.50 Mean : 6165 Mean :21.30
## 3rd Qu.:55.75 3rd Qu.: 6332 3rd Qu.:24.75
## Max. :74.00 Max. :15906 Max. :41.00
##
## rep78 headroom trunk weight length
## Min. :1.000 Min. :1.500 Min. : 5.00 Min. :1760 Min. :142.0
## 1st Qu.:3.000 1st Qu.:2.500 1st Qu.:10.25 1st Qu.:2250 1st Qu.:170.0
## Median :3.000 Median :3.000 Median :14.00 Median :3190 Median :192.5
## Mean :3.406 Mean :2.993 Mean :13.76 Mean :3019 Mean :187.9
## 3rd Qu.:4.000 3rd Qu.:3.500 3rd Qu.:16.75 3rd Qu.:3600 3rd Qu.:203.8
## Max. :5.000 Max. :5.000 Max. :23.00 Max. :4840 Max. :233.0
## NA's :5
## turn displacement gear_ratio foreign
## Min. :31.00 Min. : 79.0 Min. :2.190 Min. :0.0000
## 1st Qu.:36.00 1st Qu.:119.0 1st Qu.:2.730 1st Qu.:0.0000
## Median :40.00 Median :196.0 Median :2.955 Median :0.0000
## Mean :39.65 Mean :197.3 Mean :3.015 Mean :0.2973
## 3rd Qu.:43.00 3rd Qu.:245.2 3rd Qu.:3.353 3rd Qu.:1.0000
## Max. :51.00 Max. :425.0 Max. :3.890 Max. :1.0000
##
## number of rows (number of observations) and number of columns (number variables)
nrow(df)
## [1] 74
ncol(df)
## [1] 13
##lists all the variables / headers
names(df)
## [1] "X" "make" "price" "mpg" "rep78"
## [6] "headroom" "trunk" "weight" "length" "turn"
## [11] "displacement" "gear_ratio" "foreign"
(.packages())
## [1] "stats" "graphics" "grDevices" "utils" "datasets" "methods"
## [7] "base"
## have to reload packages each time we start up R
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## this is known as piping - basically the %>% is a 'then' it is for consecutive functions
df_summary <- df %>% select(mpg, weight) %>%
summarise_all(list(mmm=mean, p50=median, sd=sd))
df_summary
## mpg_mmm weight_mmm mpg_p50 weight_p50 mpg_sd weight_sd
## 1 21.2973 3019.459 20 3190 5.785503 777.1936
## reshape data to look nice :)
## gather function will gather piped pair
step_1 <- df_summary %>% gather(stat, val)
step_1
## stat val
## 1 mpg_mmm 21.297297
## 2 weight_mmm 3019.459459
## 3 mpg_p50 20.000000
## 4 weight_p50 3190.000000
## 5 mpg_sd 5.785503
## 6 weight_sd 777.193567
## here separate will split stat into variable and stat as we asked
step_2 <- step_1 %>% separate(stat, into = c("var", "stat"), sep = "_")
step_2
## var stat val
## 1 mpg mmm 21.297297
## 2 weight mmm 3019.459459
## 3 mpg p50 20.000000
## 4 weight p50 3190.000000
## 5 mpg sd 5.785503
## 6 weight sd 777.193567
## spread will no create separate columns for stat and val again (creates new columns for every stat category)
step_3 <- step_2 %>% spread(stat, val)
step_3
## var mmm p50 sd
## 1 mpg 21.2973 20 5.785503
## 2 weight 3019.4595 3190 777.193567
## **Exercise 2.2**
covid=read.csv("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv")
nrow(covid)
## [1] 14699
names(covid)
## [1] "date" "state" "fips" "cases" "deaths"
## filter() this command will select specific rows in the dataframe N.B. as date is a factor variable we had to convert it to a character string - using as.character() so we can compare like with like
## mutate() can change previous variables or add new ones
## head() will give a snapshot of a large dataframe
## So using all of these we have created 3 new variables using only data from july 31st
covid_july <- covid %>% dplyr::filter(as.character(date) == '2020-07-31') %>%
mutate(deathrate = deaths/cases, death_sh = deaths/sum(deaths), cases_sh = cases/sum(cases))
head(covid_july, 10)
## date state fips cases deaths deathrate death_sh
## 1 2020-07-31 Alabama 1 87723 1580 0.018011240 0.0102689423
## 2 2020-07-31 Alaska 2 3675 21 0.005714286 0.0001364859
## 3 2020-07-31 Arizona 4 174108 3695 0.021222460 0.0240150265
## 4 2020-07-31 Arkansas 5 42511 453 0.010656065 0.0029441967
## 5 2020-07-31 California 6 502273 9222 0.018360533 0.0599368265
## 6 2020-07-31 Colorado 8 46948 1841 0.039213598 0.0119652676
## 7 2020-07-31 Connecticut 9 49810 4432 0.088978117 0.0288050331
## 8 2020-07-31 Delaware 10 14788 585 0.039559102 0.0038021084
## 9 2020-07-31 District of Columbia 11 12126 585 0.048243444 0.0038021084
## 10 2020-07-31 Florida 12 470378 6842 0.014545748 0.0444684198
## cases_sh
## 1 0.019188397
## 2 0.000803864
## 3 0.038084122
## 4 0.009298792
## 5 0.109866440
## 6 0.010269335
## 7 0.010895364
## 8 0.003234705
## 9 0.002652423
## 10 0.102889776
##Here we filtered for the maximum deathrate
covid_july %>% filter(deathrate == max(deathrate))
## date state fips cases deaths deathrate death_sh cases_sh
## 1 2020-07-31 Connecticut 9 49810 4432 0.08897812 0.02880503 0.01089536
##Here we used group_by() to group the rows by one variable (date), we then created a summary using summarise and sum
usdaily <- covid %>% group_by(date) %>% summarise(deaths = sum(deaths), cases=sum(cases))
## `summarise()` ungrouping output (override with `.groups` argument)
head(usdaily)
## # A tibble: 6 x 3
## date deaths cases
## <chr> <int> <int>
## 1 2020-01-21 0 1
## 2 2020-01-22 0 1
## 3 2020-01-23 0 1
## 4 2020-01-24 0 2
## 5 2020-01-25 0 3
## 6 2020-01-26 0 5
##showing how deaths change over time in a figure
## as.Date(date) tells R that it is a date making it look better
library(ggplot2)
usdaily <- usdaily %>% mutate(date=as.Date(date), deathrate=deaths/cases)
ggplot(usdaily,aes(x=date, y=deathrate)) +
geom_point() +
scale_x_date(date_breaks = "1 month", date_labels = "%b")

## merging two datasets
pop=read.csv("https://www.dropbox.com/s/tp4kiq8if372rcz/populationdata.csv?dl=1")
##this is a dataset of population of american states
covid_julyb=covid_july %>% merge(pop,by="state", all.x=TRUE)
## merge them by variable 'state' the all.x tells R to keep everything from covid_july even if there is no matching data in pop
##Can now compare covid per capita (per 1000 pop) and pop density - geom_smooth(method=lm) adds a regression line
covid_julyb <- covid_julyb %>% mutate(CovidPerCapita = cases/pop*1000)
ggplot(covid_julyb, aes(x=density, y=CovidPerCapita)) +
geom_point() + geom_smooth(method=lm)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Warning: Removed 3 rows containing missing values (geom_point).

##remove outlier of washington DC
ggplot(covid_julyb %>% filter(density<6000), aes(x=density, y=CovidPerCapita)) +
geom_point() + geom_smooth(method=lm)
## `geom_smooth()` using formula 'y ~ x'

##we can also look at parameters of regression line directly with lm()
summary(lm(CovidPerCapita~density,covid_julyb %>% filter(density<6000) ))
##
## Call:
## lm(formula = CovidPerCapita ~ density, data = covid_julyb %>%
## filter(density < 6000))
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.6365 -3.1810 -0.4811 3.3123 14.2090
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.103760 0.970901 10.407 5.27e-14 ***
## density 0.006921 0.002751 2.516 0.0152 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.466 on 49 degrees of freedom
## Multiple R-squared: 0.1144, Adjusted R-squared: 0.0963
## F-statistic: 6.328 on 1 and 49 DF, p-value: 0.01521
##We have used a for loop here to create a graph for each of the states as below (instead of typing code 4 times)
states <- c("California","Texas","New York","Alaska")
covid <- covid %>% mutate(deathrate = deaths/cases, date=as.Date(date))
for(s in states){
print(s)
p=ggplot(covid %>% filter(state==s), aes(x=date, y=deathrate)) +
geom_point() + ggtitle(s)
scale_x_date(date_breaks="1 month",date_labels = "%b")
print(p)
}
## [1] "California"

## [1] "Texas"

## [1] "New York"

## [1] "Alaska"
