Introduction to R!

Welcome to R! R is the most popular statistical programming language. We are going to use it in this class to model data and learn about different statistical learning algorithms.

First, we need to install the tidyverse package. Once you’ve installed a package you shouldn’t need to do it again, but you will need to load the library into the environment in each R session.

Into the tidyverse!

#install.packages("tidyverse")
library(tidyverse)

Coding basics

Commenting

The first thing that I like to know when I’m learning a new programming language is how to comment. Commenting your code is useful because it allows you to leave comments to your future self! This will help especially when code can get messy and long. In R, we use the ‘#’ sign and everything that follows it will be commented out (this means it is not executable).

Data types and Object Assignment

# numerics: numbers are stored as double-precision values by default
my_numeric <- 42.5
class(my_numeric)
## [1] "numeric"
# integers: the L suffix is required; a bare 2 would be stored as a numeric
my_integer <- 2L
class(my_integer)
## [1] "integer"
# character strings
my_character <- "hello world"
class(my_character)
## [1] "character"
# logic/booleans
my_logical <- TRUE
class(my_logical)
## [1] "logical"

Basic operators

# Addition with `+`
3 + 3
## [1] 6
# Subtraction with `-`
4 - 3
## [1] 1
# Multiplication with `*`
4 * 3
## [1] 12
# Division with `/` (`*` and `/` share precedence and evaluate left to right)
6 * 4 / 3
## [1] 8

Vectors and Concatenation

# concatenation: c() combines its arguments into a single vector
x <- c(1, 3, 2, 5)
x
## [1] 1 3 2 5
# or: `=` also assigns at the top level, but `<-` is the preferred R idiom
x = c(1, 6, 2)
x
## [1] 1 6 2
y <- c(1, 4, 3)

# length function of a vector (not to be used with matrices)
length(x)
## [1] 3
length(y)
## [1] 3
# logical comparison, applied to every element
x > 2
## [1] FALSE  TRUE FALSE
# selection: keep only the elements where the condition is TRUE
x[x > 2]
## [1] 6
# element-wise addition (x and y have the same length)
x + y
## [1]  2 10  5

Matrices

# matrices can also be created
# use "?" before a function to learn about its inputs
?matrix

# byrow = FALSE fills the matrix column by column
x1 <- matrix(
  data = c(1, 2, 3, 4),
  nrow = 2,
  ncol = 2,
  byrow = FALSE
)

x1
##      [,1] [,2]
## [1,]    1    3
## [2,]    2    4
# sequences of integers
x2 <- matrix(
  data = 1:4,
  nrow = 2,
  ncol = 2,
  byrow = FALSE
)

x2
##      [,1] [,2]
## [1,]    1    3
## [2,]    2    4
# byrow = TRUE fills the matrix row by row
x3 <- matrix(
  data = 1:4,
  nrow = 2,
  ncol = 2,
  byrow = TRUE
)

x3
##      [,1] [,2]
## [1,]    1    2
## [2,]    3    4
# basic operations done elementwise
sqrt(x1)
##          [,1]     [,2]
## [1,] 1.000000 1.732051
## [2,] 1.414214 2.000000
x1^2
##      [,1] [,2]
## [1,]    1    9
## [2,]    4   16
# operations on the matrix as a whole (not elementwise)
solve(x1) # inverse
##      [,1] [,2]
## [1,]   -2  1.5
## [2,]    1 -0.5
t(x1) # transpose
##      [,1] [,2]
## [1,]    1    2
## [2,]    3    4
# Indexing Data
# 1:16 is filled column by column into a 4 x 4 matrix
A <- matrix(1:16, nrow = 4, ncol = 4)
A
##      [,1] [,2] [,3] [,4]
## [1,]    1    5    9   13
## [2,]    2    6   10   14
## [3,]    3    7   11   15
## [4,]    4    8   12   16
# call single elements: A[row, column]
A[2, 3]
## [1] 10
# call subsets
A[c(1, 3), c(2, 4)]
##      [,1] [,2]
## [1,]    5   13
## [2,]    7   15
A[1:3, 2:4]
##      [,1] [,2] [,3]
## [1,]    5    9   13
## [2,]    6   10   14
## [3,]    7   11   15
# an empty index selects every row (or every column)
A[1:2, ]
##      [,1] [,2] [,3] [,4]
## [1,]    1    5    9   13
## [2,]    2    6   10   14
A[, 1:2]
##      [,1] [,2]
## [1,]    1    5
## [2,]    2    6
## [3,]    3    7
## [4,]    4    8
# a single row comes back as a plain vector
A[1, ]
## [1]  1  5  9 13
A[-c(1, 3), ] # use negative sign for the opposite
##      [,1] [,2] [,3] [,4]
## [1,]    2    6   10   14
## [2,]    4    8   12   16
# only one column is left here, so the result is again a plain vector
A[-c(1, 3), -c(1, 3, 4)]
## [1] 6 8
# dimension
dim(A)
## [1] 4 4

Calling and Writing Functions

# random variable functions
?rnorm # defaults: mean = 0, sd = 1
x <- rnorm(n = 500)
head(x)
## [1] -0.8501168  2.0185474 -0.7441127  0.7997203 -1.1035795  0.2796225
hist(x) # simple base R graphic histogram

# y is x plus a second normal draw centred at 50 with a small spread
y <- x + rnorm(n = 500, mean = 50, sd = 0.1)
hist(y)

plot(x, y) # simple base R scatterplot

# correlation
cor(x, y)
## [1] 0.9947211
# setting a seed makes the draws that follow reproducible
set.seed(1303)
z <- rnorm(n = 50)
head(z)
## [1] -1.14397631  1.34212937  2.18539048  0.53639252  0.06319297  0.50223448

Working with Data Frames

# Read the Auto data set. Missing values are coded as "?" in the file, so
# na.strings converts them to NA.
# stringsAsFactors = TRUE is set explicitly because the default became FALSE
# in R 4.0.0; it keeps `name` a factor, which matches the str() output below
# and lets functions that need numeric-coercible columns (e.g. pairs()) work.
# NOTE(review): confirm this URL is still reachable.
Auto <- read.table(
  "http://faculty.marshall.usc.edu/gareth-james/ISL/Auto.data",
  header = TRUE,
  na.strings = "?",
  stringsAsFactors = TRUE
)
head(Auto)
##   mpg cylinders displacement horsepower weight acceleration year origin
## 1  18         8          307        130   3504         12.0   70      1
## 2  15         8          350        165   3693         11.5   70      1
## 3  18         8          318        150   3436         11.0   70      1
## 4  16         8          304        150   3433         12.0   70      1
## 5  17         8          302        140   3449         10.5   70      1
## 6  15         8          429        198   4341         10.0   70      1
##                        name
## 1 chevrolet chevelle malibu
## 2         buick skylark 320
## 3        plymouth satellite
## 4             amc rebel sst
## 5               ford torino
## 6          ford galaxie 500
dim(Auto)
## [1] 397   9
# alternative: read a local copy of the data instead
#Auto<-read.csv("/Users/heatherkitada/Downloads/Auto.csv", 
#               header=TRUE, 
#               na.strings = "?")


# first four rows, all columns
Auto[1:4, ]
##   mpg cylinders displacement horsepower weight acceleration year origin
## 1  18         8          307        130   3504         12.0   70      1
## 2  15         8          350        165   3693         11.5   70      1
## 3  18         8          318        150   3436         11.0   70      1
## 4  16         8          304        150   3433         12.0   70      1
##                        name
## 1 chevrolet chevelle malibu
## 2         buick skylark 320
## 3        plymouth satellite
## 4             amc rebel sst
# drop every row that contains at least one NA
Auto <- na.omit(Auto)
dim(Auto)
## [1] 392   9
names(Auto)
## [1] "mpg"          "cylinders"    "displacement" "horsepower"  
## [5] "weight"       "acceleration" "year"         "origin"      
## [9] "name"
str(Auto)
## 'data.frame':    392 obs. of  9 variables:
##  $ mpg         : num  18 15 18 16 17 15 14 14 14 15 ...
##  $ cylinders   : int  8 8 8 8 8 8 8 8 8 8 ...
##  $ displacement: num  307 350 318 304 302 429 454 440 455 390 ...
##  $ horsepower  : num  130 165 150 150 140 198 220 215 225 190 ...
##  $ weight      : num  3504 3693 3436 3433 3449 ...
##  $ acceleration: num  12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
##  $ year        : int  70 70 70 70 70 70 70 70 70 70 ...
##  $ origin      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ name        : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
##  - attr(*, "na.action")=Class 'omit'  Named int [1:5] 33 127 331 337 355
##   .. ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...
# More Graphical and numerical Summaries
# cylinders is still an integer column here, so plot() draws a scatterplot
plot(Auto$cylinders, Auto$mpg)

# treat cylinders as a categorical variable from here on
Auto$cylinders <- as.factor(Auto$cylinders)

plot(Auto$cylinders, Auto$mpg) # default to boxplot because factors

plot(Auto$cylinders, Auto$mpg, col = "red")

# spell out TRUE: T is an ordinary variable that can be reassigned
plot(Auto$cylinders, Auto$mpg, col = "red", varwidth = TRUE)

plot(Auto$cylinders, Auto$mpg, col = "red", varwidth = TRUE, horizontal = TRUE)

plot(Auto$cylinders, Auto$mpg, col = "red", xlab = "cylinders", ylab = "MPG")

# boxplots of mpg by cylinders with ggplot2
ggplot(Auto, aes(y = mpg, fill = cylinders)) +
  geom_boxplot() +
  theme_bw()

# mean mpg within each cylinders group
Auto %>%
  group_by(cylinders) %>%
  summarise(mean = mean(mpg))
## # A tibble: 5 x 2
##   cylinders  mean
##   <fct>     <dbl>
## 1 3          20.6
## 2 4          29.3
## 3 5          27.4
## 4 6          20.0
## 5 8          15.0
# keep only the rows with mpg above 30
Auto_f <- Auto %>%
  filter(mpg > 30)
dim(Auto_f)
## [1] 83  9
# add a weight-to-horsepower ratio as a new column
Auto_m <- Auto %>%
  mutate(ratio = weight / horsepower)
head(Auto_m)
##   mpg cylinders displacement horsepower weight acceleration year origin
## 1  18         8          307        130   3504         12.0   70      1
## 2  15         8          350        165   3693         11.5   70      1
## 3  18         8          318        150   3436         11.0   70      1
## 4  16         8          304        150   3433         12.0   70      1
## 5  17         8          302        140   3449         10.5   70      1
## 6  15         8          429        198   4341         10.0   70      1
##                        name    ratio
## 1 chevrolet chevelle malibu 26.95385
## 2         buick skylark 320 22.38182
## 3        plymouth satellite 22.90667
## 4             amc rebel sst 22.88667
## 5               ford torino 24.63571
## 6          ford galaxie 500 21.92424
# base R histograms of mpg
hist(Auto$mpg)

hist(Auto$mpg, col = 2)

hist(Auto$mpg, col = 2, breaks = 15) # define the number of bins

# histogram of mpg with ggplot2
ggplot(Auto, aes(x = mpg)) +
  geom_histogram() +
  theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# all pairwise plots
pairs(Auto)

# select comparisons: the formula lists the columns to plot
pairs(~ mpg + displacement + horsepower + weight + acceleration, Auto)

# scatterplot of mpg against horsepower
plot(Auto$horsepower, Auto$mpg)