R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
# Open the documentation for the mpg dataset (this starts the help server).
?mpg #access information about the mpg dataset
## starting httpd help server ... done
# Printing the tibble shows only its first rows plus a count of the rest.
mpg #display dataset
## # A tibble: 234 × 11
##    manufacturer model      displ  year   cyl trans drv     cty   hwy fl    class
##    <chr>        <chr>      <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
##  1 audi         a4           1.8  1999     4 auto… f        18    29 p     comp…
##  2 audi         a4           1.8  1999     4 manu… f        21    29 p     comp…
##  3 audi         a4           2    2008     4 manu… f        20    31 p     comp…
##  4 audi         a4           2    2008     4 auto… f        21    30 p     comp…
##  5 audi         a4           2.8  1999     6 auto… f        16    26 p     comp…
##  6 audi         a4           2.8  1999     6 manu… f        18    26 p     comp…
##  7 audi         a4           3.1  2008     6 auto… f        18    27 p     comp…
##  8 audi         a4 quattro   1.8  1999     4 manu… 4        18    26 p     comp…
##  9 audi         a4 quattro   1.8  1999     4 auto… 4        16    25 p     comp…
## 10 audi         a4 quattro   2    2008     4 manu… 4        20    28 p     comp…
## # ℹ 224 more rows
head(mpg) #display the first 6 rows in the dataset
## # A tibble: 6 × 11
##   manufacturer model displ  year   cyl trans      drv     cty   hwy fl    class 
##   <chr>        <chr> <dbl> <int> <int> <chr>      <chr> <int> <int> <chr> <chr> 
## 1 audi         a4      1.8  1999     4 auto(l5)   f        18    29 p     compa…
## 2 audi         a4      1.8  1999     4 manual(m5) f        21    29 p     compa…
## 3 audi         a4      2    2008     4 manual(m6) f        20    31 p     compa…
## 4 audi         a4      2    2008     4 auto(av)   f        21    30 p     compa…
## 5 audi         a4      2.8  1999     6 auto(l5)   f        16    26 p     compa…
## 6 audi         a4      2.8  1999     6 manual(m5) f        18    26 p     compa…
# Template: ggplot(data = <DATA>) + <GEOM_FUNCTION>(mapping = aes(<MAPPINGS>))
# ggplot() takes the dataset as its first argument and `+` adds another layer.
# geom_point() creates a scatter plot; its mapping defines what is plotted.
# aes stands for aesthetic: x is displ (engine size) and y is hwy
# (highway mileage, i.e. fuel efficiency).
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy))

# Same x variable, with cty on the y axis.
ggplot(mpg) +
  geom_point(aes(x = displ, y = cty))

# Same x variable, with cyl on the y axis.
ggplot(mpg) +
  geom_point(aes(x = displ, y = cyl))

# A third variable, such as class, can be added to a two-dimensional scatter
# plot by mapping it to an aesthetic. An aesthetic is a visual property of the
# objects in the plot: a point's size, shape, color, or alpha can each vary.
# Mapping aesthetics to variables in the dataset conveys information about
# the data.
# Scaling: an aesthetic is mapped to a variable by associating the name of the
# aesthetic with the name of the variable inside aes().

# color = class: each class value is displayed in a different color, which
# reveals the class of each car.
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy, color = class))

# size = class: not advisable, because class is discrete (see the warning).
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy, size = class))
## Warning: Using size for a discrete variable is not advised.

# shape = class: points are displayed as different shapes (stars, crosses,
# etc.). Only 6 shapes are available for the 7 values, so rows are dropped
# (see the warnings).
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy, shape = class))
## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 7 values. Consider specifying shapes manually if you need
##   that many have them.
## Warning: Removed 62 rows containing missing values (`geom_point()`).

# alpha = class: each class is displayed with a different transparency.
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy, alpha = class))
## Warning: Using alpha for a discrete variable is not advised.

# Setting an aesthetic manually: make all of the points blue.
# Important: the fixed value is an argument of the geom and does NOT go
# inside aes().
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy), color = "blue")

# For comparison: color mapped to a variable inside aes().
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy, color = class))

# Counter-example: inside aes(), "blue" is treated as a data value, so the
# legend entry is named "blue" and the points are not colored blue.
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy, color = "blue"))

# Facets: subplots that each display one subset of the data.
# Useful for bringing an additional categorical variable into the plot.
# If the variable is continuous and doesn't take specific values, faceting
# on it will not work.
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy)) +
  facet_wrap(~class, nrow = 2) # lays the subplots out in 2 rows

# linetype = drv: a separate line, with its own linetype, is drawn for each
# unique value of the variable mapped to linetype.
ggplot(mpg) +
  geom_smooth(aes(x = displ, y = hwy, linetype = drv))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# No grouping aesthetic.
ggplot(mpg) +
  geom_smooth(aes(x = displ, y = hwy))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# group = drv.
ggplot(mpg) +
  geom_smooth(aes(x = displ, y = hwy, group = drv))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# color = drv, with the legend switched off.
ggplot(mpg) +
  geom_smooth(aes(x = displ, y = hwy, color = drv), show.legend = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Two layers, each repeating the same mapping.
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy)) +
  geom_smooth(aes(x = displ, y = hwy))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Passing the mapping to ggplot() makes it global, so it does not have to be
# retyped in every layer.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Global x/y mapping; the points add color = class, while the smooth layer is
# given its own data: only the rows where class is "subcompact".
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(color = class)) +
  geom_smooth(
    data = filter(mpg, class == "subcompact"),
    se = FALSE
  )
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Same plot with the smooth layer restricted to the "minivan" rows; the loess
# fit emits the warnings recorded below.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(color = class)) +
  geom_smooth(
    data = filter(mpg, class == "minivan"),
    se = FALSE
  )
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 4.008
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 0.708
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : There are other near singularities as well. 0.25

# Open the documentation for the diamonds dataset.
?diamonds
view(diamonds) #shows table/dataset
# First step after loading a dataset: look at it. str() reports its size and
# the type of every column.
str(diamonds) #str shows size & columns...first step to loading a dataset is to look at/analyze it
## tibble [53,940 × 10] (S3: tbl_df/tbl/data.frame)
##  $ carat  : num [1:53940] 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
##  $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
##  $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
##  $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
##  $ depth  : num [1:53940] 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
##  $ table  : num [1:53940] 55 61 65 58 58 57 57 55 61 61 ...
##  $ price  : int [1:53940] 326 326 327 334 335 336 336 337 337 338 ...
##  $ x      : num [1:53940] 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
##  $ y      : num [1:53940] 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
##  $ z      : num [1:53940] 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# Second step: summary() on a single column reports its minimum, quartiles,
# median, mean and maximum.
summary(diamonds$carat) #gives statistics/interquartile values on the specific attribute...second step
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.2000  0.4000  0.7000  0.7979  1.0400  5.0100
mean(diamonds$price)
## [1] 3932.8
summary(diamonds$price)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     326     950    2401    3933    5324   18823
# Mean of a small hand-entered numeric vector.
Val <- c(46, 34, 87, 22, 91)
mean(Val)
## [1] 56
# Scatter plot of price against carat, with color mapped to cut.
ggplot(diamonds) +
  geom_point(aes(x = carat, y = price, color = cut))

# The same mapping drawn as smoothed lines instead of points.
ggplot(diamonds) +
  geom_smooth(aes(x = carat, y = price, color = cut))
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# Histogram of carat: main sets the title and xlab the x-axis label;
# "Frequency" is the automatic label for the y axis.
hist(
  diamonds$carat,
  main = "Histogram of diamonds carat weight",
  xlab = "Carat"
)

sd(diamonds$carat)
## [1] 0.4740112
var(diamonds$carat)
## [1] 0.2246867
sd(diamonds$price)
## [1] 3989.44
var(diamonds$price)
## [1] 15915629
table(diamonds$cut)
## 
##      Fair      Good Very Good   Premium     Ideal 
##      1610      4906     12082     13791     21551
# Bar chart with one bar per cut value.
ggplot(diamonds) +
  geom_bar(aes(x = cut))

# Depth summarised per cut: fun gives the plotted point (the median), and
# fun.min / fun.max give the two ends of the range line (minimum and maximum).
# fun.min must be supplied together with fun.max; with only fun.max the lower
# end of every range is missing and the range segments are removed with a
# "missing values" warning.
ggplot(diamonds) +
  stat_summary(
    aes(x = cut, y = depth),
    fun.min = min,
    fun.max = max,
    fun = median
  )

# Bars per cut, with fill mapped to clarity.
ggplot(diamonds) +
  geom_bar(aes(x = cut, fill = clarity))

# The same bars with position = "dodge".
ggplot(diamonds) +
  geom_bar(aes(x = cut, fill = clarity), position = "dodge")

# Polygon data for the "nz" map.
nz <- map_data("nz")

# Outline drawn without coord_quickmap().
ggplot(nz, aes(x = long, y = lat, group = group)) +
  geom_polygon(fill = "white", colour = "black")

# The same outline with coord_quickmap(), which sets the aspect ratio.
ggplot(nz, aes(x = long, y = lat, group = group)) +
  geom_polygon(fill = "white", colour = "black") +
  coord_quickmap()