Welcome to R! R is the most popular statistical programming language. We are going to use it in this class to model data and learn about different statistical learning algorithms.
First, we need to install the tidyverse package. Once you’ve installed a package you shouldn’t need to do it again, but you will need to load the library into the environment in each R session.
#install.packages("tidyverse")
library(tidyverse)
The first thing that I like to know when I’m learning a new programming language is how to comment. Commenting your code is useful because it allows you to leave comments to your future self! This will help especially when code can get messy and long. In R, we use the ‘#’ sign and everything that follows it will be commented out (this means it is not executable).
# Basic data types: class() reports how R stores each value.
# numerics
my_numeric <- 42.5
class(my_numeric)
## [1] "numeric"
# integers
# A bare whole number such as 2 is still stored as a double, so its class is
# "numeric"; append the L suffix to get a true integer.
my_numeric <- 2
class(my_numeric)
## [1] "numeric"
my_integer <- 2L
class(my_integer)
## [1] "integer"
# character strings
my_character <- "hello world"
class(my_character)
## [1] "character"
# logic/booleans
my_logical <- TRUE
class(my_logical)
## [1] "logical"
# R as a calculator: each expression prints its result.
# Addition
3 + 3
## [1] 6
# Subtraction
4 - 3
## [1] 1
# Multiplication
4 * 3
## [1] 12
# Division (the parentheses make the order of evaluation explicit)
(6 * 4) / 3
## [1] 8
# concatenation: c() combines values into a vector
x <- c(1, 3, 2, 5)
x
## [1] 1 3 2 5
# or: `=` also assigns at the top level (this overwrites the x above)
x = c(1, 6, 2)
x
## [1] 1 6 2
y <- c(1, 4, 3)
# length function of a vector (not to be used with matrices)
length(x)
## [1] 3
length(y)
## [1] 3
# logical comparison: one TRUE/FALSE per element
x > 2
## [1] FALSE TRUE FALSE
# selection: keep the elements where the comparison is TRUE
x[x > 2]
## [1] 6
# element-wise addition
x + y
## [1] 2 10 5
# matrices can also be created
# use "?" before a function to learn about its inputs
?matrix
# Values fill the matrix column by column unless byrow = TRUE.
x1 <- matrix(c(1, 2, 3, 4), nrow = 2, ncol = 2, byrow = FALSE)
x1
## [,1] [,2]
## [1,] 1 3
## [2,] 2 4
# sequences of integers
x2 <- matrix(1:4, nrow = 2, ncol = 2, byrow = FALSE)
x2
## [,1] [,2]
## [1,] 1 3
## [2,] 2 4
# byrow=TRUE fills row by row instead
x3 <- matrix(1:4, nrow = 2, ncol = 2, byrow = TRUE)
x3
## [,1] [,2]
## [1,] 1 2
## [2,] 3 4
# basic operations done elementwise
# sqrt() is applied to every entry of the matrix x1.
sqrt(x1)
## [,1] [,2]
## [1,] 1.000000 1.732051
## [2,] 1.414214 2.000000
# ^ squares every entry of the vector x.
x^2
## [1] 1 36 4
# solve() and t() work on the matrix as a whole rather than entry by entry.
solve(x1) # inverse
## [,1] [,2]
## [1,] -2 1.5
## [2,] 1 -0.5
t(x1) # transpose
## [,1] [,2]
## [1,] 1 2
## [2,] 3 4
# Indexing Data
# A is a 4 x 4 matrix holding 1..16, filled column by column.
A <- matrix(1:16, nrow = 4, ncol = 4)
A
## [,1] [,2] [,3] [,4]
## [1,] 1 5 9 13
## [2,] 2 6 10 14
## [3,] 3 7 11 15
## [4,] 4 8 12 16
# call single elements: A[row, column]
A[2, 3]
## [1] 10
# call subsets
A[c(1, 3), c(2, 4)]
## [,1] [,2]
## [1,] 5 13
## [2,] 7 15
A[1:3, 2:4]
## [,1] [,2] [,3]
## [1,] 5 9 13
## [2,] 6 10 14
## [3,] 7 11 15
# Leaving an index empty selects every row (or column).
A[1:2, ]
## [,1] [,2] [,3] [,4]
## [1,] 1 5 9 13
## [2,] 2 6 10 14
A[, 1:2]
## [,1] [,2]
## [1,] 1 5
## [2,] 2 6
## [3,] 3 7
## [4,] 4 8
A[1, ]
## [1] 1 5 9 13
A[-c(1, 3), ] # use negative sign for the opposite
## [,1] [,2] [,3] [,4]
## [1,] 2 6 10 14
## [2,] 4 8 12 16
# Only one column is left here, so the result prints as a plain vector.
A[-c(1, 3), -c(1, 3, 4)]
## [1] 6 8
# dimension
dim(A)
## [1] 4 4
# random variable functions
?rnorm # default, mean=0, sd=1
# x and y are drawn before any seed is set in this script, so the values
# printed below will differ from run to run.
x <- rnorm(n = 500)
head(x)
## [1] -0.8501168 2.0185474 -0.7441127 0.7997203 -1.1035795 0.2796225
hist(x) # simple base R graphic histogram
y <- x + rnorm(n = 500, mean = 50, sd = 0.1)
hist(y)
plot(x, y) # simple base R scatterplot
# correlation
cor(x, y)
## [1] 0.9947211
# setting a seed makes the draws that follow repeatable
set.seed(1303)
z <- rnorm(n = 50)
head(z)
## [1] -1.14397631 1.34212937 2.18539048 0.53639252 0.06319297 0.50223448
# Read the Auto data from the URL below. The first row holds the column names
# and "?" marks a missing value.
# Since R 4.0.0 read.table() keeps strings as character by default, so ask for
# factors explicitly: the `name` column has to be a factor for pairs(Auto)
# further down to run (pairs() stops with "non-numeric argument to 'pairs'"
# on a character column).
# NOTE(review): confirm this URL is still reachable; otherwise read a local
# copy of Auto.data instead.
Auto <- read.table(
  "http://faculty.marshall.usc.edu/gareth-james/ISL/Auto.data",
  header = TRUE,
  na.strings = "?",
  stringsAsFactors = TRUE
)
# First look at the data: top rows and overall size (rows, columns).
head(Auto, n = 6)
## mpg cylinders displacement horsepower weight acceleration year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
## name
## 1 chevrolet chevelle malibu
## 2 buick skylark 320
## 3 plymouth satellite
## 4 amc rebel sst
## 5 ford torino
## 6 ford galaxie 500
dim(Auto)
## [1] 397 9
# Alternative (kept commented out): read the same data from a local CSV file.
#Auto<-read.csv("/Users/heatherkitada/Downloads/Auto.csv",
# header=TRUE,
# na.strings = "?")
# Row indexing works on data frames too: rows 1 to 4, all columns.
Auto[1:4, ]
## mpg cylinders displacement horsepower weight acceleration year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## name
## 1 chevrolet chevelle malibu
## 2 buick skylark 320
## 3 plymouth satellite
## 4 amc rebel sst
# Drop every row that contains a missing value, then re-check the size.
Auto <- na.omit(Auto)
dim(Auto)
## [1] 392 9
# Column names and a compact summary of each column's type.
names(Auto)
## [1] "mpg" "cylinders" "displacement" "horsepower"
## [5] "weight" "acceleration" "year" "origin"
## [9] "name"
str(Auto)
## 'data.frame': 392 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : num 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : num 3504 3693 3436 3433 3449 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:5] 33 127 331 337 355
## .. ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...
# More Graphical and numerical Summaries
# cylinders is still an integer column here, so plot() draws a scatterplot.
plot(Auto$cylinders, Auto$mpg)
# Treat cylinders as a categorical variable from here on.
Auto$cylinders <- as.factor(Auto$cylinders)
plot(Auto$cylinders, Auto$mpg) # default to boxplot because factors
plot(Auto$cylinders, Auto$mpg, col = "red")
# Spell out TRUE: T is an ordinary variable that can be reassigned.
plot(Auto$cylinders, Auto$mpg, col = "red", varwidth = TRUE)
plot(Auto$cylinders, Auto$mpg, col = "red", varwidth = TRUE, horizontal = TRUE)
plot(Auto$cylinders, Auto$mpg, col = "red", xlab = "cylinders", ylab = "MPG")
# ggplot2 version of the boxplot: mpg on the y axis, fill colour by cylinders.
ggplot(data = Auto, mapping = aes(y = mpg, fill = cylinders)) +
  geom_boxplot() +
  theme_bw()
# Mean mpg within each cylinders group.
Auto %>%
  group_by(cylinders) %>%
  summarise(mean = mean(mpg))
## # A tibble: 5 x 2
## cylinders mean
## <fct> <dbl>
## 1 3 20.6
## 2 4 29.3
## 3 5 27.4
## 4 6 20.0
## 5 8 15.0
# filter(): keep only the rows with mpg above 30.
Auto_f <- Auto %>%
  filter(mpg > 30)
dim(Auto_f)
## [1] 83 9
# mutate(): add a new column, the weight-to-horsepower ratio.
Auto_m <- Auto %>%
  mutate(ratio = weight / horsepower)
head(Auto_m)
## mpg cylinders displacement horsepower weight acceleration year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
## name ratio
## 1 chevrolet chevelle malibu 26.95385
## 2 buick skylark 320 22.38182
## 3 plymouth satellite 22.90667
## 4 amc rebel sst 22.88667
## 5 ford torino 24.63571
## 6 ford galaxie 500 21.92424
# Base R histograms of mpg, adding one option at a time.
hist(Auto$mpg)
hist(Auto$mpg, col = 2)
hist(Auto$mpg, col = 2, breaks = 15) # define the number of bins
# ggplot2 version; with no bins argument it prints the message shown below.
ggplot(data = Auto, mapping = aes(x = mpg)) +
  geom_histogram() +
  theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# all pairwise plots: one scatterplot for every pair of columns
pairs(Auto)
# select comparisons: the formula lists the columns to include
pairs(~ mpg + displacement + horsepower + weight + acceleration, data = Auto)
# a single scatterplot of mpg against horsepower
plot(Auto$horsepower, Auto$mpg)