This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Print per-column summary statistics (min, quartiles, mean, max) for the
# built-in `cars` data set.
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
library(tidyverse)
library(gridExtra)
library(MASS)
# Bring the built-in mtcars data set into the workspace.
data("mtcars")

# Read the two CSV inputs used by the exercises.
# NOTE(review): both paths are absolute and machine-specific, so this chunk
# only runs on a machine where these files exist at these locations.
college <- read_csv(
  "/Users/farre/Documents/data_set/college_history.csv"
)
states <- read_csv(
  "/Users/farre/Documents/data_set/introductory_state_example.csv",
  col_names = FALSE
)
library(corrplot)
library(nycflights13)
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.
# Scatter plots of highway mileage (hwy) against engine displacement (displ).

# Plain points.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point()

# Point transparency mapped to the cylinder count.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(alpha = cyl))

# Point shape mapped to the cylinder count, treated as a factor.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(shape = as.factor(cyl)))
# Scatter plot of hwy against displ, colored by cylinder count, with custom
# axis labels, title, caption and panel tag.
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy, color = cyl)) +
  labs(
    x = "Engine size[liters]",
    y = "Mileage[mpg]",
    title = "Efficiency", # spelling fixed (was "Efficeny")
    caption = "(Based on data from mpg)",
    tag = "a)"
  )
# hwy vs displ, one panel per vehicle class, wrapped onto two rows.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  facet_wrap(vars(class), nrow = 2)

# hwy vs displ on a grid: class down the rows, cylinder count across columns.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  facet_grid(rows = vars(class), cols = vars(cyl))
# Smoothed trend of hwy over displ (geom_smooth picks the method itself).
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Mapping an aesthetic such as linetype to a variable works well when that variable is categorical.
# One smoothed trend per drive type (drv), told apart by line type.
ggplot(mpg, aes(x = displ, y = hwy, linetype = drv)) +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Raw points with a smoothed trend on top; both layers use the same
# displ/hwy mapping.
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy)) +
  geom_smooth(aes(x = displ, y = hwy))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Points colored by vehicle class, with a single smoothed trend over all data.
ggplot(mpg, aes(displ, hwy)) +
  geom_point(aes(color = class)) +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Points for every class, but the smoothed trend is fitted to the "suv" rows
# only and drawn without its confidence band (se = FALSE).
ggplot(mpg, aes(displ, hwy)) +
  geom_point(aes(color = class)) +
  geom_smooth(
    data = mpg %>% filter(class == "suv"),
    se = FALSE
  )
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Bar chart of the number of rows in each vehicle class.
ggplot(mpg, aes(x = class)) +
  geom_bar()
# Bar chart of the proportion of rows in each vehicle class.
# `group = 1` makes the proportions relative to the whole data set rather
# than to each bar. `after_stat()` replaces the deprecated `stat()` helper.
ggplot(data = mpg) +
  geom_bar(mapping = aes(x = class, y = after_stat(prop), group = 1))
# For each vehicle class, summarise hwy as its median with a range from the
# minimum to the maximum.
# Uses fun / fun.min / fun.max in place of the deprecated fun.y / fun.ymin /
# fun.ymax, so the three deprecation warnings are no longer emitted.
ggplot(data = mpg) +
  stat_summary(
    mapping = aes(x = class, y = hwy),
    fun.min = min,
    fun.max = max,
    fun = median
  )
# Class counts with the bar outline colored by class.
ggplot(mpg, aes(x = class)) +
  geom_bar(aes(color = class))

# Class counts with each bar stacked by cylinder count.
ggplot(mpg, aes(x = class, fill = as.factor(cyl))) +
  geom_bar()

# Same data with the cylinder groups placed side by side.
ggplot(mpg, aes(x = class, fill = as.factor(cyl))) +
  geom_bar(position = "dodge")
# Build one hwy histogram for a given bin width, titled with `heading`.
hwy_histogram <- function(width, heading) {
  ggplot(mpg, aes(x = hwy)) +
    geom_histogram(binwidth = width) +
    labs(title = heading)
}

# Four histograms of the same variable that differ only in bin width.
plot1 <- hwy_histogram(.5, "binwidth = .5")
plot2 <- hwy_histogram(1, "binwidth = 1")
plot3 <- hwy_histogram(2, "binwidth = 2")
plot4 <- hwy_histogram(3, "binwidth = 3")

# Lay the four plots out side by side in a single row.
grid.arrange(plot1, plot2, plot3, plot4, ncol = 4)
# Density estimate of highway mileage.
ggplot(mpg, aes(x = hwy)) +
  geom_density()
This type of plot helps show where points overlap.
# Scatter plot with jittered positions so overlapping points are separated.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(position = "jitter")

# Box plot of hwy per vehicle class, with the axes flipped.
ggplot(mpg) +
  geom_boxplot(aes(x = class, y = hwy)) +
  coord_flip()
# Histogram of lwt from birthwt, filled by race (as a factor), drawn
# semi-transparent and with the axes flipped.
ggplot(birthwt) +
  geom_histogram(
    aes(x = lwt, fill = as.factor(race)),
    binwidth = 10,
    alpha = .5
  ) +
  scale_fill_discrete(name = "Race") +
  coord_flip()

# Histogram of lwt filled by smoke (as a factor), with one panel per smoke
# value placed in a single row.
ggplot(birthwt) +
  geom_histogram(
    aes(x = lwt, fill = as.factor(smoke)),
    binwidth = 10
  ) +
  scale_fill_discrete(name = "Smoker") +
  facet_wrap(~ smoke, nrow = 1)
# Show the first six rows of mtcars; the car models appear as row names.
head(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
# Keep the car model names, which mtcars stores as its row names, and
# print them.
carnames <- rownames(mtcars)
print(carnames)
## [1] "Mazda RX4" "Mazda RX4 Wag" "Datsun 710"
## [4] "Hornet 4 Drive" "Hornet Sportabout" "Valiant"
## [7] "Duster 360" "Merc 240D" "Merc 230"
## [10] "Merc 280" "Merc 280C" "Merc 450SE"
## [13] "Merc 450SL" "Merc 450SLC" "Cadillac Fleetwood"
## [16] "Lincoln Continental" "Chrysler Imperial" "Fiat 128"
## [19] "Honda Civic" "Toyota Corolla" "Toyota Corona"
## [22] "Dodge Challenger" "AMC Javelin" "Camaro Z28"
## [25] "Pontiac Firebird" "Fiat X1-9" "Porsche 914-2"
## [28] "Lotus Europa" "Ford Pantera L" "Ferrari Dino"
## [31] "Maserati Bora" "Volvo 142E"
A tibble does not convert strings to factors, and tibble column names are more flexible.
# Replace mtcars with its tibble version and print it. The printed tibble
# shows numbered rows, not the car-model row names.
mtcars <- mtcars %>% as_tibble()
print(mtcars)
## # A tibble: 32 x 11
## mpg cyl disp hp drat wt qsec vs am gear carb
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4
## 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4
## 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
## 4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1
## 5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2
## 6 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1
## 7 14.3 8 360 245 3.21 3.57 15.8 0 0 3 4
## 8 24.4 4 147. 62 3.69 3.19 20 1 0 4 2
## 9 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2
## 10 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 4
## # ... with 22 more rows
Wrapping a name in backticks lets a tibble use column headers that are not valid R names.
# Build a four-row tibble whose column names (`$$` and `1`) are not valid R
# names, hence the backticks. The single value 20 is repeated for every row.
tibble(`$$` = c("USD","AUD","PES","Yen"), `1` = 20)
## # A tibble: 4 x 2
## `$$` `1`
## <chr> <dbl>
## 1 USD 20
## 2 AUD 20
## 3 PES 20
## 4 Yen 20
# Convert the built-in iris data set to a tibble and print it.
tib_iris <- iris %>% as_tibble()
print(tib_iris)
## # A tibble: 150 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 8 5 3.4 1.5 0.2 setosa
## 9 4.4 2.9 1.4 0.2 setosa
## 10 4.9 3.1 1.5 0.1 setosa
## # ... with 140 more rows
filter() keeps the rows that match the given criteria; here the result is assigned to a variable and printed. Each criterion must use == (not =) to test for equality.
# Keep only the cars with exactly one carburetor and four gears, then print.
car_reduced <- mtcars %>%
  filter(carb == 1, gear == 4)
print(car_reduced)
## # A tibble: 4 x 11
## mpg cyl disp hp drat wt qsec vs am gear carb
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
## 2 32.4 4 78.7 66 4.08 2.2 19.5 1 1 4 1
## 3 33.9 4 71.1 65 4.22 1.84 19.9 1 1 4 1
## 4 27.3 4 79 66 4.08 1.94 18.9 1 1 4 1
# Rows where either condition holds: six cylinders OR five gears.
mtcars %>%
  filter(cyl == 6 | gear == 5)
## # A tibble: 11 x 11
## mpg cyl disp hp drat wt qsec vs am gear carb
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4
## 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4
## 3 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1
## 4 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1
## 5 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 4
## 6 17.8 6 168. 123 3.92 3.44 18.9 1 0 4 4
## 7 26 4 120. 91 4.43 2.14 16.7 0 1 5 2
## 8 30.4 4 95.1 113 3.77 1.51 16.9 1 1 5 2
## 9 15.8 8 351 264 4.22 3.17 14.5 0 1 5 4
## 10 19.7 6 145 175 3.62 2.77 15.5 0 1 5 6
## 11 15 8 301 335 3.54 3.57 14.6 0 1 5 8
# Rows where both conditions hold: six cylinders AND five gears.
# Comma-separated conditions in filter() are combined with AND.
mtcars %>%
  filter(cyl == 6, gear == 5)
## # A tibble: 1 x 11
## mpg cyl disp hp drat wt qsec vs am gear carb
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 19.7 6 145 175 3.62 2.77 15.5 0 1 5 6
# Rows whose mpg lies between 20 and 25 (between() includes both ends).
filter(mtcars, between(mpg, 20, 25))
## # A tibble: 8 x 11
## mpg cyl disp hp drat wt qsec vs am gear carb
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4
## 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4
## 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
## 4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1
## 5 24.4 4 147. 62 3.69 3.19 20 1 0 4 2
## 6 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2
## 7 21.5 4 120. 97 3.7 2.46 20.0 1 0 3 1
## 8 21.4 4 121 109 4.11 2.78 18.6 1 1 4 2
# Sort by cylinders, then gears, then carburetors, all ascending.
mtcars %>%
  arrange(cyl, gear, carb)
## # A tibble: 32 x 11
## mpg cyl disp hp drat wt qsec vs am gear carb
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 21.5 4 120. 97 3.7 2.46 20.0 1 0 3 1
## 2 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
## 3 32.4 4 78.7 66 4.08 2.2 19.5 1 1 4 1
## 4 33.9 4 71.1 65 4.22 1.84 19.9 1 1 4 1
## 5 27.3 4 79 66 4.08 1.94 18.9 1 1 4 1
## 6 24.4 4 147. 62 3.69 3.19 20 1 0 4 2
## 7 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2
## 8 30.4 4 75.7 52 4.93 1.62 18.5 1 1 4 2
## 9 21.4 4 121 109 4.11 2.78 18.6 1 1 4 2
## 10 26 4 120. 91 4.43 2.14 16.7 0 1 5 2
## # ... with 22 more rows
# Sort by cylinders ascending, gears descending, then carburetors ascending.
mtcars %>%
  arrange(cyl, desc(gear), carb)
## # A tibble: 32 x 11
## mpg cyl disp hp drat wt qsec vs am gear carb
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 26 4 120. 91 4.43 2.14 16.7 0 1 5 2
## 2 30.4 4 95.1 113 3.77 1.51 16.9 1 1 5 2
## 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
## 4 32.4 4 78.7 66 4.08 2.2 19.5 1 1 4 1
## 5 33.9 4 71.1 65 4.22 1.84 19.9 1 1 4 1
## 6 27.3 4 79 66 4.08 1.94 18.9 1 1 4 1
## 7 24.4 4 147. 62 3.69 3.19 20 1 0 4 2
## 8 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2
## 9 30.4 4 75.7 52 4.93 1.62 18.5 1 1 4 2
## 10 21.4 4 121 109 4.11 2.78 18.6 1 1 4 2
## # ... with 22 more rows
Using !is.na() keeps only the entries that hold a value, dropping those that are NA.
# Colleges whose sponsorship is recorded as "Secular".
secular_cols <- college %>%
  filter(sponsorship == "Secular")
print(secular_cols)
## # A tibble: 12 x 6
## college original_name city state established sponsorship
## <chr> <chr> <chr> <chr> <dbl> <chr>
## 1 Georgia, Univ. of <NA> Athens GA 1785 Secular
## 2 North Carolina, Univ. of <NA> Chapel~ NC 1789 Secular
## 3 Tennessee, Univ. of Blount College Knoxvi~ TN 1794 Secular
## 4 U.S. Military Academy <NA> West P~ NY 1802 Secular
## 5 Ohio Univ. <NA> Athens OH 1804 Secular
## 6 Miami Univ. <NA> Oxford OH 1809 Secular
## 7 Maryland, Univ. of <NA> Baltim~ MD 1812 Secular
## 8 Missouri, Univ. of <NA> Columb~ MO 1839 Secular
## 9 Mississipps, Univ. of <NA> Oxford MI 1844 Secular
## 10 Louisiana, Univ. of <NA> New Or~ LA 1845 Secular
## 11 U.S. Naval Academy <NA> Annapo~ MD 1845 Secular
## 12 Wisconsin, Univ. of <NA> Madison WI 1848 Secular
# Secular colleges ordered by founding year (earliest first).
Secular_cols_srtd <- college %>%
  filter(sponsorship == "Secular") %>%
  arrange(established)
print(Secular_cols_srtd)
## # A tibble: 12 x 6
## college original_name city state established sponsorship
## <chr> <chr> <chr> <chr> <dbl> <chr>
## 1 Georgia, Univ. of <NA> Athens GA 1785 Secular
## 2 North Carolina, Univ. of <NA> Chapel~ NC 1789 Secular
## 3 Tennessee, Univ. of Blount College Knoxvi~ TN 1794 Secular
## 4 U.S. Military Academy <NA> West P~ NY 1802 Secular
## 5 Ohio Univ. <NA> Athens OH 1804 Secular
## 6 Miami Univ. <NA> Oxford OH 1809 Secular
## 7 Maryland, Univ. of <NA> Baltim~ MD 1812 Secular
## 8 Missouri, Univ. of <NA> Columb~ MO 1839 Secular
## 9 Mississipps, Univ. of <NA> Oxford MI 1844 Secular
## 10 Louisiana, Univ. of <NA> New Or~ LA 1845 Secular
## 11 U.S. Naval Academy <NA> Annapo~ MD 1845 Secular
## 12 Wisconsin, Univ. of <NA> Madison WI 1848 Secular
# Pull the original_name column, then keep only the entries that are not NA.
original_name <- Secular_cols_srtd$original_name
original_name <- original_name[!is.na(original_name)]
print(original_name)
## [1] "Blount College"
# Add a column holding mpg divided by the number of cylinders. The result is
# only printed; mtcars itself is left unchanged.
mtcars %>%
  mutate(MileagePerCylinder = mpg / cyl)
## # A tibble: 32 x 12
## mpg cyl disp hp drat wt qsec vs am gear carb
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4
## 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4
## 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
## 4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1
## 5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2
## 6 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1
## 7 14.3 8 360 245 3.21 3.57 15.8 0 0 3 4
## 8 24.4 4 147. 62 3.69 3.19 20 1 0 4 2
## 9 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2
## 10 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 4
## # ... with 22 more rows, and 1 more variable: MileagePerCylinder <dbl>
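The lag function shifts the values one position later, so the first element becomes NA and the last value is dropped; the lead function shifts them one position earlier, so the last element becomes NA and the first value is dropped.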
# Integers 1 to 10, used to demonstrate lag() and lead().
a <- 1:10
# Shift the values one position later; the first element becomes NA.
lag(a)
## [1] NA 1 2 3 4 5 6 7 8 9
# Shift the values of `a` one position earlier; the last element becomes NA.
lead(a)
## [1] 2 3 4 5 6 7 8 9 10 NA
# Keep six columns of mtcars. select() is called with an explicit dplyr::
# prefix.
# NOTE(review): presumably because MASS, loaded after tidyverse, masks
# select() - confirm.
dplyr::select(mtcars, mpg, disp, hp, drat, wt, qsec)
## # A tibble: 32 x 6
## mpg disp hp drat wt qsec
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 21 160 110 3.9 2.62 16.5
## 2 21 160 110 3.9 2.88 17.0
## 3 22.8 108 93 3.85 2.32 18.6
## 4 21.4 258 110 3.08 3.22 19.4
## 5 18.7 360 175 3.15 3.44 17.0
## 6 18.1 225 105 2.76 3.46 20.2
## 7 14.3 360 245 3.21 3.57 15.8
## 8 24.4 147. 62 3.69 3.19 20
## 9 22.8 141. 95 3.92 3.15 22.9
## 10 19.2 168. 123 3.92 3.44 18.3
## # ... with 22 more rows
# Take six columns of mtcars, compute their pairwise correlation matrix, and
# draw it as a correlation plot.
my_data <- dplyr::select(mtcars, mpg, disp, hp, drat, wt, qsec)
rc <- cor(my_data)
corrplot(rc)
The code then goes on to compute the mean mpg of the efficient cars.
# Flag cars with mpg above 23 as efficient (1) and the rest as 0.
eff_mtcars <- mtcars %>%
  mutate(efficient = if_else(mpg > 23, 1, 0))
# Keep only the flagged cars and report their mean mpg.
eff_cars <- eff_mtcars %>%
  filter(efficient == 1)
mean(eff_cars$mpg)
## [1] 29.25714
summarise() then computes the average, minimum, and maximum mpg for each group.
# Group the cars by cylinders, gears and carburetors, then report the mean,
# minimum and maximum mpg of each group.
grp_data <- group_by(mtcars, cyl, gear, carb)
# `.groups = "drop_last"` spells out summarise()'s default (the result stays
# grouped by cyl and gear), which also silences the "grouped output"
# message that summarise() otherwise prints.
summarise(
  grp_data,
  avempg = mean(mpg),
  minmpg = min(mpg),
  maxmpg = max(mpg),
  .groups = "drop_last"
)
## # A tibble: 12 x 6
## # Groups: cyl, gear [8]
## cyl gear carb avempg minmpg maxmpg
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 4 3 1 21.5 21.5 21.5
## 2 4 4 1 29.1 22.8 33.9
## 3 4 4 2 24.8 21.4 30.4
## 4 4 5 2 28.2 26 30.4
## 5 6 3 1 19.8 18.1 21.4
## 6 6 4 4 19.8 17.8 21
## 7 6 5 6 19.7 19.7 19.7
## 8 8 3 2 17.2 15.2 19.2
## 9 8 3 3 16.3 15.2 17.3
## 10 8 3 4 12.6 10.4 14.7
## 11 8 5 4 15.8 15.8 15.8
## 12 8 5 8 15 15 15
# Draw five rows of mtcars at random. No seed is set, so the rows differ on
# every run. slice_sample() replaces the superseded sample_n().
slice_sample(mtcars, n = 5)
## # A tibble: 5 x 11
## mpg cyl disp hp drat wt qsec vs am gear carb
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 16.4 8 276. 180 3.07 4.07 17.4 0 0 3 3
## 2 14.7 8 440 230 3.23 5.34 17.4 0 0 3 4
## 3 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1
## 4 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2
## 5 26 4 120. 91 4.43 2.14 16.7 0 1 5 2
# Draw a random half of the rows of mtcars. No seed is set, so the rows
# differ on every run. slice_sample() replaces the superseded sample_frac().
slice_sample(mtcars, prop = .5)
## # A tibble: 16 x 11
## mpg cyl disp hp drat wt qsec vs am gear carb
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 24.4 4 147. 62 3.69 3.19 20 1 0 4 2
## 2 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 4
## 3 27.3 4 79 66 4.08 1.94 18.9 1 1 4 1
## 4 21.5 4 120. 97 3.7 2.46 20.0 1 0 3 1
## 5 21.4 4 121 109 4.11 2.78 18.6 1 1 4 2
## 6 14.7 8 440 230 3.23 5.34 17.4 0 0 3 4
## 7 19.7 6 145 175 3.62 2.77 15.5 0 1 5 6
## 8 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1
## 9 15.2 8 276. 180 3.07 3.78 18 0 0 3 3
## 10 15.5 8 318 150 2.76 3.52 16.9 0 0 3 2
## 11 10.4 8 472 205 2.93 5.25 18.0 0 0 3 4
## 12 15.2 8 304 150 3.15 3.44 17.3 0 0 3 2
## 13 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2
## 14 30.4 4 75.7 52 4.93 1.62 18.5 1 1 4 2
## 15 17.8 6 168. 123 3.92 3.44 18.9 1 0 4 4
## 16 10.4 8 460 215 3 5.42 17.8 0 0 3 4
# Flights with month 1 and day 1, i.e. those on January 1.
jan1flights <- flights %>%
  filter(month == 1, day == 1)
print(jan1flights)
## # A tibble: 842 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ... with 832 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>