# An Introduction to Statistical Learning with Applications in R by Gareth James
library(dplyr)
# Build two numeric vectors with c() and print the first one
x <- c(1, 3, 2, 5)
y <- c(3, 6, 9, 12)
x
## [1] 1 3 2 5
# Assignment also works with `=` in place of `<-`; this overwrites x
x = c(1, 2, 3, 4)
# length() reports how many elements a vector holds
length(x)
## [1] 4
length(y)
## [1] 4
# ls() lists the objects currently defined in the workspace
ls()
## [1] "x" "y"
# rm() deletes the objects whose names it is given
rm(list = c("x", "y"))
# Handing rm() every name returned by ls() clears the whole workspace at once
rm(list = ls())
# matrix() builds a matrix from a vector of values, filling column by column
x <- matrix(data = c(1, 2, 3, 4), nrow = 2, ncol = 2)
x
## [,1] [,2]
## [1,] 1 3
## [2,] 2 4
# Six values fill a 3 x 2 matrix exactly. Asking for 3 x 3 would recycle the
# data into a third column that merely repeats the first one.
even_numbers <- matrix(data = c(2, 4, 6, 8, 10, 12), nrow = 3, ncol = 2)
even_numbers
## [,1] [,2]
## [1,] 2 8
## [2,] 4 10
## [3,] 6 12
odd_numbers <- matrix(data = c(1, 3, 5, 7, 9, 11), nrow = 3, ncol = 2)
odd_numbers
## [,1] [,2]
## [1,] 1 7
## [2,] 3 9
## [3,] 5 11
# Arguments can also be given by position: data, nrow, ncol.
# By default the values are filled in down the columns.
even_num_col <- matrix(c(2, 4, 6, 8), 2, 2)
even_num_col
## [,1] [,2]
## [1,] 2 6
## [2,] 4 8
# byrow = TRUE fills the values in across the rows instead
even_num_row <- matrix(c(2, 4, 6, 8), 2, 2, byrow = TRUE)
even_num_row
## [,1] [,2]
## [1,] 2 4
## [2,] 6 8
odd_num_col <- matrix(c(1, 3, 5, 7), 2, 2)
odd_num_col
## [,1] [,2]
## [1,] 1 5
## [2,] 3 7
odd_num_row <- matrix(c(1, 3, 5, 7), 2, 2, byrow = TRUE)
odd_num_row
## [,1] [,2]
## [1,] 1 3
## [2,] 5 7
# Adding matrices element by element
xy <- even_num_row + odd_num_row
xy
## [,1] [,2]
## [1,] 3 7
## [2,] 11 15
# Taking the element-wise square root of a matrix
sqrt(x)
## [,1] [,2]
## [1,] 1.000000 1.732051
## [2,] 1.414214 2.000000
# Draw 50 values from the standard normal distribution. No seed is set before
# this call, so the numbers printed below change from run to run.
g <- rnorm(n = 50)
g
## [1] -1.70182588 -0.46807528 1.66960485 -1.42675566 -1.68851565 -0.50195589
## [7] 1.02469414 -1.37718561 -0.13340481 1.19604500 -0.73358578 -1.07659771
## [13] 1.03497484 0.10257096 -0.11663881 -0.11938221 0.22473252 1.10688528
## [19] -1.45833317 0.74028186 -0.14612654 0.77552054 -0.66924461 0.96646941
## [25] 1.58080836 0.69527720 -0.68358213 -0.30081549 -1.02651401 -1.40632599
## [31] -2.05144748 0.88205383 0.18124067 1.53693190 -0.60122182 -1.36278109
## [37] -2.38858482 -0.07345429 -0.81375158 -0.59976911 1.20447839 -0.19625599
## [43] 0.40119519 -0.20162722 0.58276837 -2.05952529 0.04517033 0.57378775
## [49] 1.50252850 -1.89006688
# rnorm() defaults to mean 0 and standard deviation 1; here the noise added to
# g is drawn with mean 50 and standard deviation 0.1 instead
a <- g + rnorm(n = 50, mean = 50, sd = 0.1)
a
## [1] 48.46225 49.48930 51.63280 48.47734 48.26294 49.62678 50.89145 48.59235
## [9] 49.78914 51.16164 49.34270 48.79904 51.02977 50.10725 49.85503 49.92017
## [17] 50.16951 50.98986 48.49601 50.52136 49.84575 50.94179 49.30866 51.00701
## [25] 51.47158 50.63695 49.30131 49.50744 49.09042 48.46629 47.98530 50.86075
## [33] 50.09444 51.42789 49.43481 48.79913 47.74453 49.71255 49.03147 49.41765
## [41] 51.16549 49.79556 50.40500 49.91571 50.62489 47.86671 50.21538 50.68400
## [49] 51.56315 47.98324
# cor() computes the correlation between g and a; it is close to 1 because a
# is g shifted by about 50 plus a small amount of noise
cor(x = g, y = a)
## [1] 0.9957382
# set.seed() fixes the state of the random number generator, so the same seed
# always reproduces exactly the same set of random numbers
set.seed(seed = 1303)
rnorm(n = 50)
## [1] -1.1439763145 1.3421293656 2.1853904757 0.5363925179 0.0631929665
## [6] 0.5022344825 -0.0004167247 0.5658198405 -0.5725226890 -1.1102250073
## [11] -0.0486871234 -0.6956562176 0.8289174803 0.2066528551 -0.2356745091
## [16] -0.5563104914 -0.3647543571 0.8623550343 -0.6307715354 0.3136021252
## [21] -0.9314953177 0.8238676185 0.5233707021 0.7069214120 0.4202043256
## [26] -0.2690521547 -1.5103172999 -0.6902124766 -0.1434719524 -1.0135274099
## [31] 1.5732737361 0.0127465055 0.8726470499 0.4220661905 -0.0188157917
## [36] 2.6157489689 -0.6931401748 -0.2663217810 -0.7206364412 1.3677342065
## [41] 0.2640073322 0.6321868074 -1.3306509858 0.0268888182 1.0406363208
## [46] 1.3120237985 -0.0300020767 -0.2500257125 0.0234144857 1.6598706557
# Fix the seed so that the mean, variance and standard deviation computed
# from y further down are reproducible
set.seed(seed = 3)
# Draw 100 standard normal values; with the seed fixed they are identical on
# every run
y <- rnorm(n = 100)
y
## [1] -0.961933416 -0.292525723 0.258788216 -1.152131886 0.195782826
## [6] 0.030123945 0.085417732 1.116610213 -1.218857416 1.267368722
## [11] -0.744781596 -1.131218571 -0.716358490 0.252652370 0.152045707
## [16] -0.307656430 -0.953017331 -0.648242811 1.224313624 0.199811608
## [21] -0.578483722 -0.942300733 -0.203728180 -1.666474840 -0.484455109
## [26] -0.741072661 1.160615779 1.012067125 -0.072078474 -1.136782298
## [31] 0.900624729 0.851770447 0.727715174 0.736502146 -0.352129617
## [36] 0.705515513 1.300357989 0.038252014 -0.979283770 0.793761231
## [41] 0.786506872 -0.310463131 1.698884846 -0.794593709 0.348437716
## [46] -2.265401074 -0.162205279 1.130864991 -0.455545976 -0.899166316
## [51] 0.726838902 -0.809440902 0.267085116 -1.737263711 -1.411425136
## [56] -0.453551227 -1.035491275 1.362142893 0.917456737 -0.785142161
## [61] 0.573518173 0.918196208 0.256287273 0.351966556 1.174337357
## [66] -0.480846375 -0.418829722 0.955112803 -1.289006611 0.186197433
## [71] -0.031325502 0.467097310 1.024197674 0.267358452 0.231826103
## [76] 0.747592465 1.217068511 0.383358345 -0.988052822 -0.156852910
## [81] 1.735535216 -0.352298306 0.688640044 1.224406096 0.794296303
## [86] -0.006402398 0.219150635 -0.886463751 0.439760291 -0.886389751
## [91] -0.853818454 -0.989994331 -0.650877737 1.053946660 -0.390878033
## [96] -0.070586394 -0.462050809 0.540908267 0.931634971 -0.209274345
# mean() returns the average of y
mean(x = y)
## [1] 0.01103557
# var() returns the variance of a vector of numbers
var(x = y)
## [1] 0.7328675
# The square root of the variance is the standard deviation
sqrt(var(x = y))
## [1] 0.8560768
# sd() computes the standard deviation directly and gives the same value
sd(x = y)
## [1] 0.8560768
# Simulate 100 standard normal values to use as x coordinates when
# demonstrating plot()
x <- rnorm(n = 100)
x
## [1] 0.61735005 -0.40507751 1.05310376 0.60228425 1.01746118 0.60816732
## [7] 0.20673600 -1.89772729 -0.68258283 0.48133842 -0.46303104 -0.27974170
## [13] -0.41369014 1.61876652 -0.72105571 -0.45309316 0.01425716 0.21576462
## [19] 0.18887020 -0.05014849 -1.49541963 0.36783775 0.51714402 -0.48433547
## [25] 0.67485562 -0.76244861 0.38607379 -0.66400334 -1.72434420 1.15631908
## [31] 0.69350659 0.14315636 1.49281356 -1.63215348 0.12784602 -2.40366373
## [37] 1.44392826 -0.87889305 -1.30643833 -0.87719899 -1.16438046 -1.98234768
## [43] -0.98994423 -0.15168460 0.91250679 0.40766981 -1.24218438 -0.64269441
## [49] 1.93024369 0.41019936 -1.29134932 2.63504537 0.48707229 0.85389232
## [55] 1.08844271 0.22601396 0.06819884 -0.98481553 -1.31085442 2.46405534
## [61] -0.66542807 0.91286265 0.96466423 1.60800294 1.83539952 0.70246273
## [67] 1.21785408 -1.12365383 0.66833012 1.21641149 0.23457535 -0.41869659
## [73] 0.23822009 -0.55058819 -0.50060277 1.16389749 2.15553695 -1.70915701
## [79] -1.60082262 -1.03855341 0.32309422 -0.88884715 0.39367898 0.23654147
## [85] -0.43049681 -0.54793313 -1.32225166 0.68212675 2.16278948 -0.41666965
## [91] -1.35731790 -0.67122646 0.64991824 0.77129117 2.67663193 -1.37087142
## [97] 0.05775915 -0.19706751 -1.26151763 -0.66244256
# Simulate 100 more standard normal values to use as y coordinates
y <- rnorm(n = 100)
y
## [1] -1.332352324 0.277323621 1.085533849 -1.642718489 -0.457180258
## [6] 1.461171845 -1.672532602 1.561095996 -1.493370648 -1.182449141
## [11] -0.356586881 -0.915640125 0.849485485 -0.489607238 0.727983210
## [16] -0.619313219 0.343735543 1.813625401 1.453226830 0.412429443
## [21] 0.201975959 1.684346898 -0.964824478 0.662155194 -0.553343994
## [26] 2.444205231 -0.304629836 -0.117369869 0.969428787 0.590939630
## [31] -1.208958402 -0.301928423 0.401260475 -1.688368241 -0.712992471
## [36] -0.254300337 -1.001566828 -0.924100354 0.440245569 -0.326816611
## [41] -0.274475875 1.284142059 0.304211777 1.273682656 1.033447932
## [46] 0.665384156 -0.313328109 0.464076685 0.478605860 2.536235795
## [51] -0.535063085 1.368106222 0.141844290 -0.782814969 1.881518733
## [56] 0.552922865 -1.844856583 -0.550307914 -0.026014759 -2.159250362
## [61] 1.770600080 0.760473482 -0.484542846 -0.472763899 0.331672075
## [66] -0.972300798 -0.531244209 0.062036740 0.103529988 1.169799329
## [71] 0.394189201 -0.134102361 -0.379875566 1.278666582 -0.699332309
## [76] 0.564250526 0.302941956 0.215441157 0.921002999 -0.887551892
## [81] -0.002893014 -0.195173624 -2.234678901 0.478322985 1.622692015
## [86] -1.196074086 1.659509162 0.204545434 1.385975853 -0.857811795
## [91] 0.685862076 -0.581662540 -0.684552910 0.378472922 1.312957410
## [96] 0.314021521 -0.502185174 1.252114604 -0.303155378 0.811231864
# Basic scatter plot of y against x
plot(x, y)
# Same plot with axis labels and a title
plot(x, y, xlab = "This is the x-axis", ylab = "This is the y-axis", main = "Plot of X vs Y")
# Scatter plot of the entries of odd_numbers against those of even_numbers
plot(even_numbers, odd_numbers, xlab = "Even numbers", ylab = "Odd Numbers", main = "Plotting Even Numbers vs Odd Numbers")
# Open a PDF file device: the next plot is written to Figure.pdf
# (jpeg() can be used in the same way to create a JPEG instead)
pdf(file = "Figure.pdf")
plot(x, y, col = "green")
# dev.off() tells R that you're done with the plot and closes the file
dev.off()
## png
## 2
# seq() creates a sequence of numbers; here the integers from 1 to 10
x <- seq(from = 1, to = 10)
x
## [1] 1 2 3 4 5 6 7 8 9 10
# Prepare the inputs for contour(): y is a copy of x, and f holds the value of
# cos(y) / (1 + x^2) for every (x, y) pair on the grid
y <- x
y
## [1] 1 2 3 4 5 6 7 8 9 10
f <- outer(x, y, function(row_val, col_val) cos(col_val) / (1 + row_val^2))
f
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0.270151153 -0.208073418 -0.494996248 -0.326821810 0.141831093
## [2,] 0.108060461 -0.083229367 -0.197998499 -0.130728724 0.056732437
## [3,] 0.054030231 -0.041614684 -0.098999250 -0.065364362 0.028366219
## [4,] 0.031782489 -0.024479226 -0.058234853 -0.038449625 0.016686011
## [5,] 0.020780858 -0.016005648 -0.038076634 -0.025140139 0.010910084
## [6,] 0.014602765 -0.011247212 -0.026756554 -0.017666044 0.007666546
## [7,] 0.010806046 -0.008322937 -0.019799850 -0.013072872 0.005673244
## [8,] 0.008312343 -0.006402259 -0.015230654 -0.010056056 0.004364034
## [9,] 0.006589053 -0.005074961 -0.012073079 -0.007971264 0.003459295
## [10,] 0.005349528 -0.004120266 -0.009801906 -0.006471719 0.002808536
## [,6] [,7] [,8] [,9] [,10]
## [1,] 0.480085143 0.376951127 -0.072750017 -0.455565131 -0.419535765
## [2,] 0.192034057 0.150780451 -0.029100007 -0.182226052 -0.167814306
## [3,] 0.096017029 0.075390225 -0.014550003 -0.091113026 -0.083907153
## [4,] 0.056480605 0.044347191 -0.008558826 -0.053595898 -0.049357149
## [5,] 0.036929626 0.028996241 -0.005596155 -0.035043472 -0.032271982
## [6,] 0.025950548 0.020375737 -0.003932433 -0.024625142 -0.022677609
## [7,] 0.019203406 0.015078045 -0.002910001 -0.018222605 -0.016781431
## [8,] 0.014771851 0.011598496 -0.002238462 -0.014017389 -0.012908793
## [9,] 0.011709394 0.009193930 -0.001774391 -0.011111345 -0.010232580
## [10,] 0.009506637 0.007464379 -0.001440594 -0.009021092 -0.008307639
# contour() draws the contour lines of f over the x-y grid
contour(x, y, f)
# Disabled: would overlay 45 finer contour levels on the existing plot
#contour(x,y,f,nlevels=45,add=T)
# fa is half the difference between f and its transpose
fa <- (f - t(f)) / 2
fa
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0.00000000 -0.15806694 -0.2745132394 -0.1793021495 0.060525117
## [2,] 0.15806694 0.00000000 -0.0781919078 -0.0531247492 0.036369042
## [3,] 0.27451324 0.07819191 0.0000000000 -0.0035647547 0.033221427
## [4,] 0.17930215 0.05312475 0.0035647547 0.0000000000 0.020913075
## [5,] -0.06052512 -0.03636904 -0.0332214265 -0.0209130751 0.000000000
## [6,] -0.23274119 -0.10164063 -0.0613867913 -0.0370733245 -0.014631540
## [7,] -0.18307254 -0.07955169 -0.0475950377 -0.0287100319 -0.011661498
## [8,] 0.04053118 0.01134887 -0.0003403252 -0.0007486151 0.004980094
## [9,] 0.23107709 0.08857555 0.0395199735 0.0228123170 0.019251383
## [10,] 0.21244265 0.08184702 0.0370526235 0.0214427149 0.017540259
## [,6] [,7] [,8] [,9] [,10]
## [1,] 0.2327411892 0.1830725405 -0.0405311800 -0.231077092 -0.212442646
## [2,] 0.1016406346 0.0795516938 -0.0113488739 -0.088575545 -0.081847020
## [3,] 0.0613867913 0.0475950377 0.0003403252 -0.039519973 -0.037052624
## [4,] 0.0370733245 0.0287100319 0.0007486151 -0.022812317 -0.021442715
## [5,] 0.0146315404 0.0116614984 -0.0049800944 -0.019251383 -0.017540259
## [6,] 0.0000000000 0.0005861654 -0.0093521420 -0.018167268 -0.016092123
## [7,] -0.0005861654 0.0000000000 -0.0072542484 -0.013708268 -0.012122905
## [8,] 0.0093521420 0.0072542484 0.0000000000 -0.006121499 -0.005734099
## [9,] 0.0181672680 0.0137082676 0.0061214990 0.000000000 -0.000605744
## [10,] 0.0160921227 0.0121229047 0.0057340992 0.000605744 0.000000000
# Contour plot of fa using 15 levels
contour(x, y, fa, nlevels = 15)
# image() works like contour(), except that it produces a colour-coded plot
# whose colours depend on the z value, known as a heatmap
image(x, y, fa)
# persp() produces a three-dimensional plot; theta and phi control the angles
# from which the plot is viewed. First, the default viewing angles.
persp(x, y, fa)
# Controlling the angle: theta = 30
persp(x, y, fa, theta = 30)
# theta = 30 and phi = 20
persp(x, y, fa, theta = 30, phi = 20)
# theta = 30 and phi = 40
persp(x, y, fa, theta = 30, phi = 40)
# theta = 30 and phi = 70
persp(x, y, fa, theta = 30, phi = 70)
# Indexing: pull elements out of a 4 x 4 matrix holding the numbers 1 to 16
a <- matrix(1:16, nrow = 4, ncol = 4)
a
## [,1] [,2] [,3] [,4]
## [1,] 1 5 9 13
## [2,] 2 6 10 14
## [3,] 3 7 11 15
## [4,] 4 8 12 16
# A single index addresses the matrix as one long vector; this is its first element
a[1]
## [1] 1
# Row 1, column 3
a[1, 3]
## [1] 9
# Rows 1 and 3, columns 2 and 4
a[c(1, 3), c(2, 4)]
## [,1] [,2]
## [1,] 5 13
## [2,] 7 15
# Rows 1 to 3, columns 2 to 4
a[1:3, 2:4]
## [,1] [,2] [,3]
## [1,] 5 9 13
## [2,] 6 10 14
## [3,] 7 11 15
# An empty column index keeps every column of rows 1 and 2
a[1:2, ]
## [,1] [,2] [,3] [,4]
## [1,] 1 5 9 13
## [2,] 2 6 10 14
# An empty row index keeps every row of columns 1 and 2
a[, 1:2]
## [,1] [,2]
## [1,] 1 5
## [2,] 2 6
## [3,] 3 7
## [4,] 4 8
# dim() outputs the number of rows followed by the number of columns of a
dim(a)
## [1] 4 4
# names() looks up the names of an object; this matrix has none, hence NULL
names(a)
## NULL
# Packages and data for the Auto examples
library(ISLR)
# library() stops with an error when the package is missing, whereas require()
# only warns and returns FALSE, letting the script run on without it
library(ggplot2)
# Keep the Auto data set shipped with ISLR under a local name
auto_df <- ISLR::Auto
## Adding Graphical and Numerical Summaries
# Plotting cylinders vs mpg, taking both columns straight from the data frame
plot(x = auto_df$cylinders, y = auto_df$mpg)
# attach() tells R to make the variables in this data frame available by name,
# so the same plot can be written without the auto_df$ prefix
attach(what = auto_df)
plot(x = cylinders, y = mpg)
# Print the raw cylinders column
auto_df$cylinders
## [1] 8 8 8 8 8 8 8 8 8 8 8 8 8 8 4 6 6 6 4 4 4 4 4 4 6 8 8 8 8 4 4 4 6 6 6 6 6
## [38] 8 8 8 8 8 8 8 6 4 6 6 4 4 4 4 4 4 4 4 4 4 4 4 4 8 8 8 8 8 8 8 8 8 3 8 8 8
## [75] 8 4 4 4 4 4 4 4 4 4 8 8 8 8 8 8 8 8 8 8 8 8 6 6 6 6 6 4 8 8 8 8 6 4 4 4 3
## [112] 4 6 4 8 8 4 4 4 4 8 4 6 8 6 6 6 4 4 4 4 6 6 6 8 8 8 8 8 4 4 4 4 4 4 4 4 4
## [149] 4 4 6 6 6 6 8 8 8 8 6 6 6 6 6 8 8 4 4 6 4 4 4 4 6 4 6 4 4 4 4 4 4 4 4 4 4
## [186] 8 8 8 8 6 6 6 6 4 4 4 4 6 6 6 6 4 4 4 4 4 8 4 6 6 8 8 8 8 4 4 4 4 4 8 8 8
## [223] 8 6 6 6 6 8 8 8 8 4 4 4 4 4 4 4 4 6 4 3 4 4 4 4 4 8 8 8 6 6 6 4 6 6 6 6 6
## [260] 6 8 6 8 8 4 4 4 4 4 4 4 4 5 6 4 6 4 4 6 6 4 6 6 8 8 8 8 8 8 8 8 4 4 4 4 5
## [297] 8 4 8 4 4 4 4 4 6 6 4 4 4 4 4 4 4 4 6 4 4 4 4 4 4 4 4 4 4 5 4 4 4 4 6 3 4
## [334] 4 4 4 4 6 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 6 6 6 6 8 6 6 4 4 4 4 4 4 4 4
## [371] 4 4 4 4 4 4 4 4 4 4 6 6 4 6 4 4 4 4 4 4 4 4
# as.factor() converts the quantitative cylinders variable into a qualitative
# one. The result is stored in a new workspace variable named cylinders;
# the column inside auto_df is not modified.
cylinders <- as.factor(cylinders)
# Simple plot of cylinders vs mpg
plot(cylinders, mpg)
# Plotting cylinders vs mpg in red
plot(cylinders, mpg, col = "red")
# In red with varwidth turned on. TRUE is spelled out because T is an ordinary
# variable that can be reassigned.
plot(cylinders, mpg, col = "red", varwidth = TRUE)
# In red, with varwidth and horizontal both turned on
plot(cylinders, mpg, col = "red", varwidth = TRUE, horizontal = TRUE)
# In red, with varwidth turned on and axis labels "Cylinders" and "MPG"
plot(cylinders, mpg, col = "red", varwidth = TRUE, xlab = "Cylinders", ylab = "MPG")
# hist() plots a histogram
hist(mpg)
# Histogram in red (col = 2 is the same as col = "red")
hist(mpg, col = 2)
# Histogram with col = 2 and breaks = 15
hist(mpg, col = 2, breaks = 15)
# pairs() creates a scatterplot matrix for every pair of variables in a data set
pairs(auto_df, col = 2)
# Scatterplot matrix for just a subset of the variables
pairs(~ mpg + displacement + horsepower + weight + acceleration, data = auto_df, col = "blue")
# Scatterplot matrix for just mpg, weight and acceleration
pairs(~ mpg + weight + acceleration, data = auto_df, col = "purple")
# Plotting horsepower vs mpg
plot(horsepower, mpg, col = 2)
# summary() produces a numerical summary of each variable in a particular data set
summary(auto_df)
## mpg cylinders displacement horsepower weight
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1613
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0 1st Qu.:2225
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5 Median :2804
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5 Mean :2978
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0 3rd Qu.:3615
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140
##
## acceleration year origin name
## Min. : 8.00 Min. :70.00 Min. :1.000 amc matador : 5
## 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000 ford pinto : 5
## Median :15.50 Median :76.00 Median :1.000 toyota corolla : 5
## Mean :15.54 Mean :75.98 Mean :1.577 amc gremlin : 4
## 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000 amc hornet : 4
## Max. :24.80 Max. :82.00 Max. :3.000 chevrolet chevette: 4
## (Other) :365
# summary() applied to the single variable mpg gives its numerical summary
summary(object = mpg)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 9.00 17.00 22.75 23.45 29.00 46.60
# str() compactly displays the internal structure of an R object; it is a
# diagnostic function and an alternative to summary()
str(object = mpg)
## num [1:392] 18 15 18 16 17 15 14 14 14 15 ...