## https://mondpanther.github.io/datastorieshub/posts/exercises/exercises2/

df <- read.csv("/Users/isaacharris/Downloads/auto.csv", header = TRUE)
df
##     X              make price mpg rep78 headroom trunk weight length turn
## 1   1       AMC Concord  4099  22     3      2.5    11   2930    186   40
## 2   2         AMC Pacer  4749  17     3      3.0    11   3350    173   40
## 3   3        AMC Spirit  3799  22    NA      3.0    12   2640    168   35
## 4   4     Buick Century  4816  20     3      4.5    16   3250    196   40
## 5   5     Buick Electra  7827  15     4      4.0    20   4080    222   43
## 6   6     Buick LeSabre  5788  18     3      4.0    21   3670    218   43
## 7   7        Buick Opel  4453  26    NA      3.0    10   2230    170   34
## 8   8       Buick Regal  5189  20     3      2.0    16   3280    200   42
## 9   9     Buick Riviera 10372  16     3      3.5    17   3880    207   43
## 10 10     Buick Skylark  4082  19     3      3.5    13   3400    200   42
## 11 11      Cad. Deville 11385  14     3      4.0    20   4330    221   44
## 12 12     Cad. Eldorado 14500  14     2      3.5    16   3900    204   43
## 13 13      Cad. Seville 15906  21     3      3.0    13   4290    204   45
## 14 14    Chev. Chevette  3299  29     3      2.5     9   2110    163   34
## 15 15      Chev. Impala  5705  16     4      4.0    20   3690    212   43
## 16 16      Chev. Malibu  4504  22     3      3.5    17   3180    193   31
## 17 17 Chev. Monte Carlo  5104  22     2      2.0    16   3220    200   41
## 18 18       Chev. Monza  3667  24     2      2.0     7   2750    179   40
## 19 19        Chev. Nova  3955  19     3      3.5    13   3430    197   43
## 20 20        Dodge Colt  3984  30     5      2.0     8   2120    163   35
## 21 21    Dodge Diplomat  4010  18     2      4.0    17   3600    206   46
## 22 22      Dodge Magnum  5886  16     2      4.0    17   3600    206   46
## 23 23   Dodge St. Regis  6342  17     2      4.5    21   3740    220   46
## 24 24       Ford Fiesta  4389  28     4      1.5     9   1800    147   33
## 25 25      Ford Mustang  4187  21     3      2.0    10   2650    179   43
## 26 26 Linc. Continental 11497  12     3      3.5    22   4840    233   51
## 27 27      Linc. Mark V 13594  12     3      2.5    18   4720    230   48
## 28 28  Linc. Versailles 13466  14     3      3.5    15   3830    201   41
## 29 29      Merc. Bobcat  3829  22     4      3.0     9   2580    169   39
## 30 30      Merc. Cougar  5379  14     4      3.5    16   4060    221   48
## 31 31     Merc. Marquis  6165  15     3      3.5    23   3720    212   44
## 32 32     Merc. Monarch  4516  18     3      3.0    15   3370    198   41
## 33 33        Merc. XR-7  6303  14     4      3.0    16   4130    217   45
## 34 34      Merc. Zephyr  3291  20     3      3.5    17   2830    195   43
## 35 35           Olds 98  8814  21     4      4.0    20   4060    220   43
## 36 36    Olds Cutl Supr  5172  19     3      2.0    16   3310    198   42
## 37 37      Olds Cutlass  4733  19     3      4.5    16   3300    198   42
## 38 38     Olds Delta 88  4890  18     4      4.0    20   3690    218   42
## 39 39        Olds Omega  4181  19     3      4.5    14   3370    200   43
## 40 40     Olds Starfire  4195  24     1      2.0    10   2730    180   40
## 41 41     Olds Toronado 10371  16     3      3.5    17   4030    206   43
## 42 42       Plym. Arrow  4647  28     3      2.0    11   3260    170   37
## 43 43       Plym. Champ  4425  34     5      2.5    11   1800    157   37
## 44 44     Plym. Horizon  4482  25     3      4.0    17   2200    165   36
## 45 45     Plym. Sapporo  6486  26    NA      1.5     8   2520    182   38
## 46 46      Plym. Volare  4060  18     2      5.0    16   3330    201   44
## 47 47    Pont. Catalina  5798  18     4      4.0    20   3700    214   42
## 48 48    Pont. Firebird  4934  18     1      1.5     7   3470    198   42
## 49 49  Pont. Grand Prix  5222  19     3      2.0    16   3210    201   45
## 50 50     Pont. Le Mans  4723  19     3      3.5    17   3200    199   40
## 51 51     Pont. Phoenix  4424  19    NA      3.5    13   3420    203   43
## 52 52     Pont. Sunbird  4172  24     2      2.0     7   2690    179   41
## 53 53         Audi 5000  9690  17     5      3.0    15   2830    189   37
## 54 54          Audi Fox  6295  23     3      2.5    11   2070    174   36
## 55 55          BMW 320i  9735  25     4      2.5    12   2650    177   34
## 56 56        Datsun 200  6229  23     4      1.5     6   2370    170   35
## 57 57        Datsun 210  4589  35     5      2.0     8   2020    165   32
## 58 58        Datsun 510  5079  24     4      2.5     8   2280    170   34
## 59 59        Datsun 810  8129  21     4      2.5     8   2750    184   38
## 60 60       Fiat Strada  4296  21     3      2.5    16   2130    161   36
## 61 61      Honda Accord  5799  25     5      3.0    10   2240    172   36
## 62 62       Honda Civic  4499  28     4      2.5     5   1760    149   34
## 63 63         Mazda GLC  3995  30     4      3.5    11   1980    154   33
## 64 64       Peugeot 604 12990  14    NA      3.5    14   3420    192   38
## 65 65    Renault Le Car  3895  26     3      3.0    10   1830    142   34
## 66 66            Subaru  3798  35     5      2.5    11   2050    164   36
## 67 67     Toyota Celica  5899  18     5      2.5    14   2410    174   36
## 68 68    Toyota Corolla  3748  31     5      3.0     9   2200    165   35
## 69 69     Toyota Corona  5719  18     5      2.0    11   2670    175   36
## 70 70         VW Dasher  7140  23     4      2.5    12   2160    172   36
## 71 71         VW Diesel  5397  41     5      3.0    15   2040    155   35
## 72 72         VW Rabbit  4697  25     4      3.0    15   1930    155   35
## 73 73       VW Scirocco  6850  25     4      2.0    16   1990    156   36
## 74 74         Volvo 260 11995  17     5      2.5    14   3170    193   37
##    displacement gear_ratio foreign
## 1           121       3.58       0
## 2           258       2.53       0
## 3           121       3.08       0
## 4           196       2.93       0
## 5           350       2.41       0
## 6           231       2.73       0
## 7           304       2.87       0
## 8           196       2.93       0
## 9           231       2.93       0
## 10          231       3.08       0
## 11          425       2.28       0
## 12          350       2.19       0
## 13          350       2.24       0
## 14          231       2.93       0
## 15          250       2.56       0
## 16          200       2.73       0
## 17          200       2.73       0
## 18          151       2.73       0
## 19          250       2.56       0
## 20           98       3.54       0
## 21          318       2.47       0
## 22          318       2.47       0
## 23          225       2.94       0
## 24           98       3.15       0
## 25          140       3.08       0
## 26          400       2.47       0
## 27          400       2.47       0
## 28          302       2.47       0
## 29          140       2.73       0
## 30          302       2.75       0
## 31          302       2.26       0
## 32          250       2.43       0
## 33          302       2.75       0
## 34          140       3.08       0
## 35          350       2.41       0
## 36          231       2.93       0
## 37          231       2.93       0
## 38          231       2.73       0
## 39          231       3.08       0
## 40          151       2.73       0
## 41          350       2.41       0
## 42          156       3.05       0
## 43           86       2.97       0
## 44          105       3.37       0
## 45          119       3.54       0
## 46          225       3.23       0
## 47          231       2.73       0
## 48          231       3.08       0
## 49          231       2.93       0
## 50          231       2.93       0
## 51          231       3.08       0
## 52          151       2.73       0
## 53          131       3.20       1
## 54           97       3.70       1
## 55          121       3.64       1
## 56          119       3.89       1
## 57           85       3.70       1
## 58          119       3.54       1
## 59          146       3.55       1
## 60          105       3.37       1
## 61          107       3.05       1
## 62           91       3.30       1
## 63           86       3.73       1
## 64          163       3.58       1
## 65           79       3.72       1
## 66           97       3.81       1
## 67          134       3.06       1
## 68           97       3.21       1
## 69          134       3.05       1
## 70           97       3.74       1
## 71           90       3.78       1
## 72           89       3.78       1
## 73           97       3.78       1
## 74          163       2.98       1
## this gives us summary of each variable - including mean 
summary(df)
##        X             make               price            mpg       
##  Min.   : 1.00   Length:74          Min.   : 3291   Min.   :12.00  
##  1st Qu.:19.25   Class :character   1st Qu.: 4220   1st Qu.:18.00  
##  Median :37.50   Mode  :character   Median : 5006   Median :20.00  
##  Mean   :37.50                      Mean   : 6165   Mean   :21.30  
##  3rd Qu.:55.75                      3rd Qu.: 6332   3rd Qu.:24.75  
##  Max.   :74.00                      Max.   :15906   Max.   :41.00  
##                                                                    
##      rep78          headroom         trunk           weight         length     
##  Min.   :1.000   Min.   :1.500   Min.   : 5.00   Min.   :1760   Min.   :142.0  
##  1st Qu.:3.000   1st Qu.:2.500   1st Qu.:10.25   1st Qu.:2250   1st Qu.:170.0  
##  Median :3.000   Median :3.000   Median :14.00   Median :3190   Median :192.5  
##  Mean   :3.406   Mean   :2.993   Mean   :13.76   Mean   :3019   Mean   :187.9  
##  3rd Qu.:4.000   3rd Qu.:3.500   3rd Qu.:16.75   3rd Qu.:3600   3rd Qu.:203.8  
##  Max.   :5.000   Max.   :5.000   Max.   :23.00   Max.   :4840   Max.   :233.0  
##  NA's   :5                                                                     
##       turn        displacement     gear_ratio       foreign      
##  Min.   :31.00   Min.   : 79.0   Min.   :2.190   Min.   :0.0000  
##  1st Qu.:36.00   1st Qu.:119.0   1st Qu.:2.730   1st Qu.:0.0000  
##  Median :40.00   Median :196.0   Median :2.955   Median :0.0000  
##  Mean   :39.65   Mean   :197.3   Mean   :3.015   Mean   :0.2973  
##  3rd Qu.:43.00   3rd Qu.:245.2   3rd Qu.:3.353   3rd Qu.:1.0000  
##  Max.   :51.00   Max.   :425.0   Max.   :3.890   Max.   :1.0000  
## 
## number of rows (number of observations) and number of columns (number variables)
nrow(df)
## [1] 74
ncol(df)
## [1] 13
##lists all the variables / headers 
names(df)
##  [1] "X"            "make"         "price"        "mpg"          "rep78"       
##  [6] "headroom"     "trunk"        "weight"       "length"       "turn"        
## [11] "displacement" "gear_ratio"   "foreign"
(.packages())
## [1] "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"  
## [7] "base"
## have to reload packages each time we start up R
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)

## this is known as piping - basically the %>% is a 'then' it is for consecutive functions 
df_summary <- df %>% select(mpg, weight) %>%
  summarise_all(list(mmm=mean, p50=median, sd=sd))
  df_summary
##   mpg_mmm weight_mmm mpg_p50 weight_p50   mpg_sd weight_sd
## 1 21.2973   3019.459      20       3190 5.785503  777.1936
## reshape data to look nice :) 
## gather function will gather piped pair
step_1 <- df_summary %>% gather(stat, val)
step_1
##         stat         val
## 1    mpg_mmm   21.297297
## 2 weight_mmm 3019.459459
## 3    mpg_p50   20.000000
## 4 weight_p50 3190.000000
## 5     mpg_sd    5.785503
## 6  weight_sd  777.193567
## here separate will split stat into variable and stat as we asked 
step_2 <- step_1 %>% separate(stat, into = c("var", "stat"), sep = "_")
step_2
##      var stat         val
## 1    mpg  mmm   21.297297
## 2 weight  mmm 3019.459459
## 3    mpg  p50   20.000000
## 4 weight  p50 3190.000000
## 5    mpg   sd    5.785503
## 6 weight   sd  777.193567
## spread will no create separate columns for stat and val again (creates new columns for every stat category)
step_3 <- step_2 %>% spread(stat, val)
step_3
##      var       mmm  p50         sd
## 1    mpg   21.2973   20   5.785503
## 2 weight 3019.4595 3190 777.193567
## **Exercise 2.2**

covid=read.csv("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv")
nrow(covid)
## [1] 14699
names(covid)
## [1] "date"   "state"  "fips"   "cases"  "deaths"
## filter() this command will select specific rows in the dataframe N.B. as date is a factor variable we had to convert it to a character string - using as.character() so we can compare like with like 
## mutate() can change previous variables or add new ones 
## head() will give a snapshot of a large dataframe 
## So using all of these we have created 3 new variables using only data from july 31st 
covid_july <- covid %>% dplyr::filter(as.character(date) == '2020-07-31') %>%
  mutate(deathrate = deaths/cases, death_sh = deaths/sum(deaths), cases_sh = cases/sum(cases))
head(covid_july, 10) 
##          date                state fips  cases deaths   deathrate     death_sh
## 1  2020-07-31              Alabama    1  87723   1580 0.018011240 0.0102689423
## 2  2020-07-31               Alaska    2   3675     21 0.005714286 0.0001364859
## 3  2020-07-31              Arizona    4 174108   3695 0.021222460 0.0240150265
## 4  2020-07-31             Arkansas    5  42511    453 0.010656065 0.0029441967
## 5  2020-07-31           California    6 502273   9222 0.018360533 0.0599368265
## 6  2020-07-31             Colorado    8  46948   1841 0.039213598 0.0119652676
## 7  2020-07-31          Connecticut    9  49810   4432 0.088978117 0.0288050331
## 8  2020-07-31             Delaware   10  14788    585 0.039559102 0.0038021084
## 9  2020-07-31 District of Columbia   11  12126    585 0.048243444 0.0038021084
## 10 2020-07-31              Florida   12 470378   6842 0.014545748 0.0444684198
##       cases_sh
## 1  0.019188397
## 2  0.000803864
## 3  0.038084122
## 4  0.009298792
## 5  0.109866440
## 6  0.010269335
## 7  0.010895364
## 8  0.003234705
## 9  0.002652423
## 10 0.102889776
##Here we filtered for the maximum deathrate
covid_july %>% filter(deathrate == max(deathrate))
##         date       state fips cases deaths  deathrate   death_sh   cases_sh
## 1 2020-07-31 Connecticut    9 49810   4432 0.08897812 0.02880503 0.01089536
##Here we used group_by() to group the rows by one variable (date), we then created a summary using summarise and sum
usdaily <- covid %>% group_by(date) %>% summarise(deaths = sum(deaths), cases=sum(cases))
## `summarise()` ungrouping output (override with `.groups` argument)
head(usdaily)
## # A tibble: 6 x 3
##   date       deaths cases
##   <chr>       <int> <int>
## 1 2020-01-21      0     1
## 2 2020-01-22      0     1
## 3 2020-01-23      0     1
## 4 2020-01-24      0     2
## 5 2020-01-25      0     3
## 6 2020-01-26      0     5
##showing how deaths change over time in a figure 
## as.Date(date) tells R that it is a date making it look better 
library(ggplot2)
usdaily <- usdaily %>% mutate(date=as.Date(date), deathrate=deaths/cases)
ggplot(usdaily,aes(x=date, y=deathrate)) + 
  geom_point() + 
  scale_x_date(date_breaks = "1 month", date_labels = "%b")

## merging two datasets 
pop=read.csv("https://www.dropbox.com/s/tp4kiq8if372rcz/populationdata.csv?dl=1")
##this is a dataset of population of american states 
covid_julyb=covid_july %>% merge(pop,by="state", all.x=TRUE)
## merge them by variable 'state' the all.x tells R to keep everything from covid_july even if there is no matching data in pop

##Can now compare covid per capita (per 1000 pop) and pop density - geom_smooth(method=lm) adds a regression line
covid_julyb <- covid_julyb %>% mutate(CovidPerCapita = cases/pop*1000)
ggplot(covid_julyb, aes(x=density, y=CovidPerCapita)) +
         geom_point() + geom_smooth(method=lm)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Warning: Removed 3 rows containing missing values (geom_point).

##remove outlier of washington DC
ggplot(covid_julyb %>% filter(density<6000), aes(x=density, y=CovidPerCapita)) + 
  geom_point() + geom_smooth(method=lm)
## `geom_smooth()` using formula 'y ~ x'

##we can also look at parameters of regression line directly with lm()
summary(lm(CovidPerCapita~density,covid_julyb  %>% filter(density<6000) ))
## 
## Call:
## lm(formula = CovidPerCapita ~ density, data = covid_julyb %>% 
##     filter(density < 6000))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.6365  -3.1810  -0.4811   3.3123  14.2090 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.103760   0.970901  10.407 5.27e-14 ***
## density      0.006921   0.002751   2.516   0.0152 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.466 on 49 degrees of freedom
## Multiple R-squared:  0.1144, Adjusted R-squared:  0.0963 
## F-statistic: 6.328 on 1 and 49 DF,  p-value: 0.01521
##We have used a for loop here to create a graph for each of the states as below (instead of typing code 4 times)
states <- c("California","Texas","New York","Alaska")
covid <- covid %>% mutate(deathrate = deaths/cases, date=as.Date(date))

for(s in states){
  print(s)
  p=ggplot(covid  %>% filter(state==s), aes(x=date, y=deathrate)) +
    geom_point() + ggtitle(s)
  scale_x_date(date_breaks="1 month",date_labels = "%b")
  
  print(p)
}
## [1] "California"

## [1] "Texas"

## [1] "New York"

## [1] "Alaska"