An Introduction to Statistical Learning with Applications in R by Gareth James, Daniela Witten, Trevor Hastie and Robert Tibshirani

Library

library(dplyr)

Basic Commands

# Creating a vector of numbers
x <- c(1, 3, 2, 5)
y <- c(3, 6, 9, 12)
x
## [1] 1 3 2 5
# Another way of creating a vector of numbers 
x = c(1, 2, 3, 4)

# Checking the length
length(x)
## [1] 4
length(y)
## [1] 4
# Looking at the list of the objects we have
ls()
## [1] "x" "y"
# Removing the objects
rm(x, y)

# Removing all the objects at once
rm(list=ls())

# The matrix() function can be used to create a matrix of numbers.
# Values fill the matrix column by column unless byrow = TRUE is given.
x <- matrix(data = c(1, 2, 3, 4), nrow = 2, ncol = 2)
x
##      [,1] [,2]
## [1,]    1    3
## [2,]    2    4
# Six values fill a 3 x 2 matrix exactly. Asking for ncol = 3 would recycle
# the data and repeat the first column as a third column.
even_numbers <- matrix(data = c(2, 4, 6, 8, 10, 12), nrow = 3, ncol = 2)
even_numbers
##      [,1] [,2]
## [1,]    2    8
## [2,]    4   10
## [3,]    6   12
odd_numbers <- matrix(data = c(1, 3, 5, 7, 9, 11), nrow = 3, ncol = 2)
odd_numbers
##      [,1] [,2]
## [1,]    1    7
## [2,]    3    9
## [3,]    5   11
even_num_col <- matrix(c(2, 4, 6, 8), 2, 2)
even_num_col
##      [,1] [,2]
## [1,]    2    6
## [2,]    4    8
even_num_row <- matrix(c(2, 4, 6, 8), 2, 2, byrow = TRUE)
even_num_row
##      [,1] [,2]
## [1,]    2    4
## [2,]    6    8
odd_num_col <- matrix(c(1, 3, 5, 7), 2, 2)
odd_num_col
##      [,1] [,2]
## [1,]    1    5
## [2,]    3    7
odd_num_row <- matrix(c(1, 3, 5, 7), 2, 2, byrow=TRUE)
odd_num_row
##      [,1] [,2]
## [1,]    1    3
## [2,]    5    7
# Adding matrices (element-wise)
xy <- even_num_row + odd_num_row
xy
##      [,1] [,2]
## [1,]    3    7
## [2,]   11   15
# Taking the element-wise square root of a matrix (use x^2 to square it instead)
sqrt(x)
##          [,1]     [,2]
## [1,] 1.000000 1.732051
## [2,] 1.414214 2.000000
# Generating a vector w/ 50 random normal variables 
g = rnorm(50)
g
##  [1] -1.70182588 -0.46807528  1.66960485 -1.42675566 -1.68851565 -0.50195589
##  [7]  1.02469414 -1.37718561 -0.13340481  1.19604500 -0.73358578 -1.07659771
## [13]  1.03497484  0.10257096 -0.11663881 -0.11938221  0.22473252  1.10688528
## [19] -1.45833317  0.74028186 -0.14612654  0.77552054 -0.66924461  0.96646941
## [25]  1.58080836  0.69527720 -0.68358213 -0.30081549 -1.02651401 -1.40632599
## [31] -2.05144748  0.88205383  0.18124067  1.53693190 -0.60122182 -1.36278109
## [37] -2.38858482 -0.07345429 -0.81375158 -0.59976911  1.20447839 -0.19625599
## [43]  0.40119519 -0.20162722  0.58276837 -2.05952529  0.04517033  0.57378775
## [49]  1.50252850 -1.89006688
# By default rnorm() draws standard normal values (mean 0, standard deviation 1); the mean and sd arguments override this
a = g+rnorm(50, mean=50, sd=.1)
a
##  [1] 48.46225 49.48930 51.63280 48.47734 48.26294 49.62678 50.89145 48.59235
##  [9] 49.78914 51.16164 49.34270 48.79904 51.02977 50.10725 49.85503 49.92017
## [17] 50.16951 50.98986 48.49601 50.52136 49.84575 50.94179 49.30866 51.00701
## [25] 51.47158 50.63695 49.30131 49.50744 49.09042 48.46629 47.98530 50.86075
## [33] 50.09444 51.42789 49.43481 48.79913 47.74453 49.71255 49.03147 49.41765
## [41] 51.16549 49.79556 50.40500 49.91571 50.62489 47.86671 50.21538 50.68400
## [49] 51.56315 47.98324
# cor() computes the correlation between g and a; the exact value changes from run to run because no seed was set
cor(g, a)
## [1] 0.9957382
# set.seed() lets you reproduce exactly the same set of random numbers
set.seed(1303)
rnorm(50)
##  [1] -1.1439763145  1.3421293656  2.1853904757  0.5363925179  0.0631929665
##  [6]  0.5022344825 -0.0004167247  0.5658198405 -0.5725226890 -1.1102250073
## [11] -0.0486871234 -0.6956562176  0.8289174803  0.2066528551 -0.2356745091
## [16] -0.5563104914 -0.3647543571  0.8623550343 -0.6307715354  0.3136021252
## [21] -0.9314953177  0.8238676185  0.5233707021  0.7069214120  0.4202043256
## [26] -0.2690521547 -1.5103172999 -0.6902124766 -0.1434719524 -1.0135274099
## [31]  1.5732737361  0.0127465055  0.8726470499  0.4220661905 -0.0188157917
## [36]  2.6157489689 -0.6931401748 -0.2663217810 -0.7206364412  1.3677342065
## [41]  0.2640073322  0.6321868074 -1.3306509858  0.0268888182  1.0406363208
## [46]  1.3120237985 -0.0300020767 -0.2500257125  0.0234144857  1.6598706557
# Setting a new seed so that the vector generated below is reproducible
set.seed(3)

# Generating a reproducible set of 100 random normal numbers
y=rnorm(100)
y
##   [1] -0.961933416 -0.292525723  0.258788216 -1.152131886  0.195782826
##   [6]  0.030123945  0.085417732  1.116610213 -1.218857416  1.267368722
##  [11] -0.744781596 -1.131218571 -0.716358490  0.252652370  0.152045707
##  [16] -0.307656430 -0.953017331 -0.648242811  1.224313624  0.199811608
##  [21] -0.578483722 -0.942300733 -0.203728180 -1.666474840 -0.484455109
##  [26] -0.741072661  1.160615779  1.012067125 -0.072078474 -1.136782298
##  [31]  0.900624729  0.851770447  0.727715174  0.736502146 -0.352129617
##  [36]  0.705515513  1.300357989  0.038252014 -0.979283770  0.793761231
##  [41]  0.786506872 -0.310463131  1.698884846 -0.794593709  0.348437716
##  [46] -2.265401074 -0.162205279  1.130864991 -0.455545976 -0.899166316
##  [51]  0.726838902 -0.809440902  0.267085116 -1.737263711 -1.411425136
##  [56] -0.453551227 -1.035491275  1.362142893  0.917456737 -0.785142161
##  [61]  0.573518173  0.918196208  0.256287273  0.351966556  1.174337357
##  [66] -0.480846375 -0.418829722  0.955112803 -1.289006611  0.186197433
##  [71] -0.031325502  0.467097310  1.024197674  0.267358452  0.231826103
##  [76]  0.747592465  1.217068511  0.383358345 -0.988052822 -0.156852910
##  [81]  1.735535216 -0.352298306  0.688640044  1.224406096  0.794296303
##  [86] -0.006402398  0.219150635 -0.886463751  0.439760291 -0.886389751
##  [91] -0.853818454 -0.989994331 -0.650877737  1.053946660 -0.390878033
##  [96] -0.070586394 -0.462050809  0.540908267  0.931634971 -0.209274345
# Getting the mean from y
mean(y)
## [1] 0.01103557
# Calculating the variance of a vector of numbers
var(y)
## [1] 0.7328675
# Applying the sqrt() to the output of var() will give the standard deviation. 
sqrt(var(y))
## [1] 0.8560768
# Applying sd() will also give us the standard deviation
sd(y)
## [1] 0.8560768
# Using the plot() function to plot data in R
x=rnorm(100)
x
##   [1]  0.61735005 -0.40507751  1.05310376  0.60228425  1.01746118  0.60816732
##   [7]  0.20673600 -1.89772729 -0.68258283  0.48133842 -0.46303104 -0.27974170
##  [13] -0.41369014  1.61876652 -0.72105571 -0.45309316  0.01425716  0.21576462
##  [19]  0.18887020 -0.05014849 -1.49541963  0.36783775  0.51714402 -0.48433547
##  [25]  0.67485562 -0.76244861  0.38607379 -0.66400334 -1.72434420  1.15631908
##  [31]  0.69350659  0.14315636  1.49281356 -1.63215348  0.12784602 -2.40366373
##  [37]  1.44392826 -0.87889305 -1.30643833 -0.87719899 -1.16438046 -1.98234768
##  [43] -0.98994423 -0.15168460  0.91250679  0.40766981 -1.24218438 -0.64269441
##  [49]  1.93024369  0.41019936 -1.29134932  2.63504537  0.48707229  0.85389232
##  [55]  1.08844271  0.22601396  0.06819884 -0.98481553 -1.31085442  2.46405534
##  [61] -0.66542807  0.91286265  0.96466423  1.60800294  1.83539952  0.70246273
##  [67]  1.21785408 -1.12365383  0.66833012  1.21641149  0.23457535 -0.41869659
##  [73]  0.23822009 -0.55058819 -0.50060277  1.16389749  2.15553695 -1.70915701
##  [79] -1.60082262 -1.03855341  0.32309422 -0.88884715  0.39367898  0.23654147
##  [85] -0.43049681 -0.54793313 -1.32225166  0.68212675  2.16278948 -0.41666965
##  [91] -1.35731790 -0.67122646  0.64991824  0.77129117  2.67663193 -1.37087142
##  [97]  0.05775915 -0.19706751 -1.26151763 -0.66244256
y=rnorm(100)
y
##   [1] -1.332352324  0.277323621  1.085533849 -1.642718489 -0.457180258
##   [6]  1.461171845 -1.672532602  1.561095996 -1.493370648 -1.182449141
##  [11] -0.356586881 -0.915640125  0.849485485 -0.489607238  0.727983210
##  [16] -0.619313219  0.343735543  1.813625401  1.453226830  0.412429443
##  [21]  0.201975959  1.684346898 -0.964824478  0.662155194 -0.553343994
##  [26]  2.444205231 -0.304629836 -0.117369869  0.969428787  0.590939630
##  [31] -1.208958402 -0.301928423  0.401260475 -1.688368241 -0.712992471
##  [36] -0.254300337 -1.001566828 -0.924100354  0.440245569 -0.326816611
##  [41] -0.274475875  1.284142059  0.304211777  1.273682656  1.033447932
##  [46]  0.665384156 -0.313328109  0.464076685  0.478605860  2.536235795
##  [51] -0.535063085  1.368106222  0.141844290 -0.782814969  1.881518733
##  [56]  0.552922865 -1.844856583 -0.550307914 -0.026014759 -2.159250362
##  [61]  1.770600080  0.760473482 -0.484542846 -0.472763899  0.331672075
##  [66] -0.972300798 -0.531244209  0.062036740  0.103529988  1.169799329
##  [71]  0.394189201 -0.134102361 -0.379875566  1.278666582 -0.699332309
##  [76]  0.564250526  0.302941956  0.215441157  0.921002999 -0.887551892
##  [81] -0.002893014 -0.195173624 -2.234678901  0.478322985  1.622692015
##  [86] -1.196074086  1.659509162  0.204545434  1.385975853 -0.857811795
##  [91]  0.685862076 -0.581662540 -0.684552910  0.378472922  1.312957410
##  [96]  0.314021521 -0.502185174  1.252114604 -0.303155378  0.811231864
plot(x,y)

plot(x, y, xlab="This is the x-axis", ylab="This is the y-axis", main="Plot of X vs Y")

plot(even_numbers, odd_numbers, xlab="Even numbers", ylab="Odd Numbers", main="Plotting Even Numbers vs Odd Numbers")

# Open a PDF graphics device so that the next plot is written to "Figure.pdf"
# (jpeg() would be used instead to create a JPEG file)
pdf("Figure.pdf")
plot(x,y,col="green")

# Call dev.off() to tell R that you're done with your plot; this closes the device
dev.off()
## png 
##   2
# seq() builds a sequence of numbers; here the integers from 1 to 10.
x <- seq(1, 10)
x
##  [1]  1  2  3  4  5  6  7  8  9 10
# Preparing the inputs for contour(), which draws a contour plot or adds
# contour lines to an existing plot: a second axis y and a grid of values f.
y <- x
y
##  [1]  1  2  3  4  5  6  7  8  9 10
# outer() evaluates cos(v) / (1 + u^2) for every pair (u from x, v from y)
f <- outer(X = x, Y = y, FUN = function(u, v) cos(v) / (1 + u^2))
f
##              [,1]         [,2]         [,3]         [,4]        [,5]
##  [1,] 0.270151153 -0.208073418 -0.494996248 -0.326821810 0.141831093
##  [2,] 0.108060461 -0.083229367 -0.197998499 -0.130728724 0.056732437
##  [3,] 0.054030231 -0.041614684 -0.098999250 -0.065364362 0.028366219
##  [4,] 0.031782489 -0.024479226 -0.058234853 -0.038449625 0.016686011
##  [5,] 0.020780858 -0.016005648 -0.038076634 -0.025140139 0.010910084
##  [6,] 0.014602765 -0.011247212 -0.026756554 -0.017666044 0.007666546
##  [7,] 0.010806046 -0.008322937 -0.019799850 -0.013072872 0.005673244
##  [8,] 0.008312343 -0.006402259 -0.015230654 -0.010056056 0.004364034
##  [9,] 0.006589053 -0.005074961 -0.012073079 -0.007971264 0.003459295
## [10,] 0.005349528 -0.004120266 -0.009801906 -0.006471719 0.002808536
##              [,6]        [,7]         [,8]         [,9]        [,10]
##  [1,] 0.480085143 0.376951127 -0.072750017 -0.455565131 -0.419535765
##  [2,] 0.192034057 0.150780451 -0.029100007 -0.182226052 -0.167814306
##  [3,] 0.096017029 0.075390225 -0.014550003 -0.091113026 -0.083907153
##  [4,] 0.056480605 0.044347191 -0.008558826 -0.053595898 -0.049357149
##  [5,] 0.036929626 0.028996241 -0.005596155 -0.035043472 -0.032271982
##  [6,] 0.025950548 0.020375737 -0.003932433 -0.024625142 -0.022677609
##  [7,] 0.019203406 0.015078045 -0.002910001 -0.018222605 -0.016781431
##  [8,] 0.014771851 0.011598496 -0.002238462 -0.014017389 -0.012908793
##  [9,] 0.011709394 0.009193930 -0.001774391 -0.011111345 -0.010232580
## [10,] 0.009506637 0.007464379 -0.001440594 -0.009021092 -0.008307639
contour(x, y, f)

#contour(x,y,f,nlevels=45,add=T)
fa=(f-t(f))/2
fa
##              [,1]        [,2]          [,3]          [,4]         [,5]
##  [1,]  0.00000000 -0.15806694 -0.2745132394 -0.1793021495  0.060525117
##  [2,]  0.15806694  0.00000000 -0.0781919078 -0.0531247492  0.036369042
##  [3,]  0.27451324  0.07819191  0.0000000000 -0.0035647547  0.033221427
##  [4,]  0.17930215  0.05312475  0.0035647547  0.0000000000  0.020913075
##  [5,] -0.06052512 -0.03636904 -0.0332214265 -0.0209130751  0.000000000
##  [6,] -0.23274119 -0.10164063 -0.0613867913 -0.0370733245 -0.014631540
##  [7,] -0.18307254 -0.07955169 -0.0475950377 -0.0287100319 -0.011661498
##  [8,]  0.04053118  0.01134887 -0.0003403252 -0.0007486151  0.004980094
##  [9,]  0.23107709  0.08857555  0.0395199735  0.0228123170  0.019251383
## [10,]  0.21244265  0.08184702  0.0370526235  0.0214427149  0.017540259
##                [,6]         [,7]          [,8]         [,9]        [,10]
##  [1,]  0.2327411892 0.1830725405 -0.0405311800 -0.231077092 -0.212442646
##  [2,]  0.1016406346 0.0795516938 -0.0113488739 -0.088575545 -0.081847020
##  [3,]  0.0613867913 0.0475950377  0.0003403252 -0.039519973 -0.037052624
##  [4,]  0.0370733245 0.0287100319  0.0007486151 -0.022812317 -0.021442715
##  [5,]  0.0146315404 0.0116614984 -0.0049800944 -0.019251383 -0.017540259
##  [6,]  0.0000000000 0.0005861654 -0.0093521420 -0.018167268 -0.016092123
##  [7,] -0.0005861654 0.0000000000 -0.0072542484 -0.013708268 -0.012122905
##  [8,]  0.0093521420 0.0072542484  0.0000000000 -0.006121499 -0.005734099
##  [9,]  0.0181672680 0.0137082676  0.0061214990  0.000000000 -0.000605744
## [10,]  0.0160921227 0.0121229047  0.0057340992  0.000605744  0.000000000
contour(x,y,fa,nlevels=15)

# Using the image() function to do the same thing as contour(), except that it produces a color-coded plot whose colors depend on the z value, known as a heatmap.
image(x,y,fa)

# Creating a three-dimensional plot. The theta and phi control the angles at which the plot is viewed.
persp(x,y,fa)

# Using persp() to produce a three-dimensional plot while controlling the viewing angle. Theta = 30
persp(x,y,fa,theta=30)

# Theta = 30, and phi = 20
persp(x, y, fa, theta=30, phi=20)

# Theta = 30, and phi = 40
persp(x, y, fa, theta=30, phi=40)

# Theta = 30, and phi = 70
persp(x, y, fa, theta=30, phi=70)

# Indexing data
a = matrix(1:16, 4, 4)
a
##      [,1] [,2] [,3] [,4]
## [1,]    1    5    9   13
## [2,]    2    6   10   14
## [3,]    3    7   11   15
## [4,]    4    8   12   16
a[1]
## [1] 1
a[1,3]
## [1] 9
a[c(1,3), c(2,4)]
##      [,1] [,2]
## [1,]    5   13
## [2,]    7   15
a[1:3, 2:4]
##      [,1] [,2] [,3]
## [1,]    5    9   13
## [2,]    6   10   14
## [3,]    7   11   15
a[1:2,]
##      [,1] [,2] [,3] [,4]
## [1,]    1    5    9   13
## [2,]    2    6   10   14
a[,1:2]
##      [,1] [,2]
## [1,]    1    5
## [2,]    2    6
## [3,]    3    7
## [4,]    4    8
# Using dim() to output the number of rows and columns in a matrix
dim(a)
## [1] 4 4
# Using names() to check for element names; it returns NULL here because this matrix has none
names(a)
## NULL
# Libraries
# library() stops with an error if a package is missing, whereas require()
# only returns FALSE and lets the script carry on.
library(ISLR)
library(ggplot2)

# Working copy of the Auto data set that ships with ISLR
auto_df <- ISLR::Auto


## Adding Graphical and Numerical Summaries

# Plotting cylinders vs mpg
plot(auto_df$cylinders, auto_df$mpg)

# Using the attach() in order to tell R to make the variables in this data frame available by name
attach(auto_df)

plot(cylinders, mpg)

auto_df$cylinders
##   [1] 8 8 8 8 8 8 8 8 8 8 8 8 8 8 4 6 6 6 4 4 4 4 4 4 6 8 8 8 8 4 4 4 6 6 6 6 6
##  [38] 8 8 8 8 8 8 8 6 4 6 6 4 4 4 4 4 4 4 4 4 4 4 4 4 8 8 8 8 8 8 8 8 8 3 8 8 8
##  [75] 8 4 4 4 4 4 4 4 4 4 8 8 8 8 8 8 8 8 8 8 8 8 6 6 6 6 6 4 8 8 8 8 6 4 4 4 3
## [112] 4 6 4 8 8 4 4 4 4 8 4 6 8 6 6 6 4 4 4 4 6 6 6 8 8 8 8 8 4 4 4 4 4 4 4 4 4
## [149] 4 4 6 6 6 6 8 8 8 8 6 6 6 6 6 8 8 4 4 6 4 4 4 4 6 4 6 4 4 4 4 4 4 4 4 4 4
## [186] 8 8 8 8 6 6 6 6 4 4 4 4 6 6 6 6 4 4 4 4 4 8 4 6 6 8 8 8 8 4 4 4 4 4 8 8 8
## [223] 8 6 6 6 6 8 8 8 8 4 4 4 4 4 4 4 4 6 4 3 4 4 4 4 4 8 8 8 6 6 6 4 6 6 6 6 6
## [260] 6 8 6 8 8 4 4 4 4 4 4 4 4 5 6 4 6 4 4 6 6 4 6 6 8 8 8 8 8 8 8 8 4 4 4 4 5
## [297] 8 4 8 4 4 4 4 4 6 6 4 4 4 4 4 4 4 4 6 4 4 4 4 4 4 4 4 4 4 5 4 4 4 4 6 3 4
## [334] 4 4 4 4 6 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 6 6 6 6 8 6 6 4 4 4 4 4 4 4 4
## [371] 4 4 4 4 4 4 4 4 4 4 6 6 4 6 4 4 4 4 4 4 4 4
# Using as.factor() to convert the quantitative variable into a qualitative one.
# The column is read from auto_df explicitly so this step can be re-run safely.
# The result is stored in a separate `cylinders` variable; auto_df itself is not modified.
cylinders <- as.factor(auto_df$cylinders)

# Simple plot of cylinders vs mpg (a factor on the x-axis produces boxplots)
plot(cylinders, mpg)

# Plotting cylinders vs mpg in red
plot(cylinders, mpg, col = "red")

# Plotting cylinders vs mpg in red with varwidth = TRUE
# (TRUE/FALSE are used instead of T/F, which are ordinary reassignable variables)
plot(cylinders, mpg, col = "red", varwidth = TRUE)

# Plotting cylinders vs mpg in red, varwidth = TRUE, and horizontal = TRUE
plot(cylinders, mpg, col = "red", varwidth = TRUE, horizontal = TRUE)

# Plotting cylinders vs mpg in red, varwidth = TRUE, xlab = "Cylinders", ylab = "MPG"
plot(cylinders, mpg, col = "red", varwidth = TRUE, xlab = "Cylinders", ylab = "MPG")

# Using the hist() function to plot a histogram
hist(mpg)

# Histogram using colour number 2 of the current palette
hist(mpg, col = 2)

# Histogram with col = 2, breaks = 15
hist(mpg, col = 2, breaks = 15)

# Creating a scatterplot matrix for every pair of variables in a data set using pairs()
pairs(auto_df, col = 2)

# Creating a scatterplot matrix for just a subset of the variables
pairs(~ mpg + displacement + horsepower + weight + acceleration, data = auto_df, col = "blue")

# Creating a scatterplot matrix for just mpg, weight, acceleration
pairs(~ mpg + weight + acceleration, data = auto_df, col = "purple")

# Plotting horsepower vs mpg
plot(horsepower, mpg, col = 2)

# Using summary() to produce a numerical summary of each variable in a particular data set
summary(auto_df)
##       mpg          cylinders      displacement     horsepower        weight    
##  Min.   : 9.00   Min.   :3.000   Min.   : 68.0   Min.   : 46.0   Min.   :1613  
##  1st Qu.:17.00   1st Qu.:4.000   1st Qu.:105.0   1st Qu.: 75.0   1st Qu.:2225  
##  Median :22.75   Median :4.000   Median :151.0   Median : 93.5   Median :2804  
##  Mean   :23.45   Mean   :5.472   Mean   :194.4   Mean   :104.5   Mean   :2978  
##  3rd Qu.:29.00   3rd Qu.:8.000   3rd Qu.:275.8   3rd Qu.:126.0   3rd Qu.:3615  
##  Max.   :46.60   Max.   :8.000   Max.   :455.0   Max.   :230.0   Max.   :5140  
##                                                                                
##   acceleration        year           origin                      name    
##  Min.   : 8.00   Min.   :70.00   Min.   :1.000   amc matador       :  5  
##  1st Qu.:13.78   1st Qu.:73.00   1st Qu.:1.000   ford pinto        :  5  
##  Median :15.50   Median :76.00   Median :1.000   toyota corolla    :  5  
##  Mean   :15.54   Mean   :75.98   Mean   :1.577   amc gremlin       :  4  
##  3rd Qu.:17.02   3rd Qu.:79.00   3rd Qu.:2.000   amc hornet        :  4  
##  Max.   :24.80   Max.   :82.00   Max.   :3.000   chevrolet chevette:  4  
##                                                  (Other)           :365
# Provide a numerical summary for the single variable mpg
summary(mpg)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    9.00   17.00   22.75   23.45   29.00   46.60
# Compactly display the internal structure of an R object, a diagnostic function and an alternative to summary 
str(mpg)
##  num [1:392] 18 15 18 16 17 15 14 14 14 15 ...