Chapter 2 - Statistical Learning

Let's get the basics going. For example:
c() - creates a vector by concatenating all of its inputs into a single variable.

# Build a numeric vector from four values and print it.
x <- c(1, 3, 2, 5)
x
## [1] 1 3 2 5
# Rebind x to a shorter vector and create y with the same length.
x <- c(1, 6, 2)
y <- c(1, 4, 3)

# How to open a help page for a function
#?c()

length() - checks the length of a vector

# Both vectors hold three elements.
length(x)
## [1] 3
length(y)
## [1] 3
# Vectors of equal length are added element by element.
x + y
## [1]  2 10  5
# List the objects in the workspace, remove x and y, then list again.
ls()
## [1] "x" "y"
rm(x, y)
ls()
## character(0)
# Removing all at once
#rm(list=ls())

Creating matrices: matrix() - creates a matrix of numbers. By default, R creates matrices by successively filling in columns. The byrow=TRUE option can be used to populate the matrix row by row instead.

# Fill a 2 x 2 matrix column by column (the default order).
x <- matrix(data = c(1, 2, 3, 4), nrow = 2, ncol = 2)
x
##      [,1] [,2]
## [1,]    1    3
## [2,]    2    4
# Unnamed arguments give the same matrix, provided they are supplied in the
# order listed in the help file.
x <- matrix(c(1, 2, 3, 4), 2, 2)

# The calls above filled the matrix by column; byrow = TRUE fills it by row.
matrix(c(1, 2, 3, 4), 2, 2, byrow = TRUE)
##      [,1] [,2]
## [1,]    1    2
## [2,]    3    4

Some more examples of functions:
* The rnorm() function generates a vector of random normal variables; its first argument, n, is the sample size.

# Square root of every entry of x (applied element by element).
sqrt(x)
##          [,1]     [,2]
## [1,] 1.000000 1.732051
## [2,] 1.414214 2.000000
# Square of every entry of x (element by element, not a matrix product).
x^2
##      [,1] [,2]
## [1,]    1    9
## [2,]    4   16
# y is x shifted by roughly 50 plus a small amount of noise, so the two
# vectors are almost perfectly correlated.
x <- rnorm(50)
y <- x + rnorm(50, mean = 50, sd = 0.1)
cor(x, y)
## [1] 0.991476
# Fixing the seed makes the draws that follow reproducible.
set.seed(1303)
rnorm(50)
##  [1] -1.1439763145  1.3421293656  2.1853904757  0.5363925179  0.0631929665
##  [6]  0.5022344825 -0.0004167247  0.5658198405 -0.5725226890 -1.1102250073
## [11] -0.0486871234 -0.6956562176  0.8289174803  0.2066528551 -0.2356745091
## [16] -0.5563104914 -0.3647543571  0.8623550343 -0.6307715354  0.3136021252
## [21] -0.9314953177  0.8238676185  0.5233707021  0.7069214120  0.4202043256
## [26] -0.2690521547 -1.5103172999 -0.6902124766 -0.1434719524 -1.0135274099
## [31]  1.5732737361  0.0127465055  0.8726470499  0.4220661905 -0.0188157917
## [36]  2.6157489689 -0.6931401748 -0.2663217810 -0.7206364412  1.3677342065
## [41]  0.2640073322  0.6321868074 -1.3306509858  0.0268888182  1.0406363208
## [46]  1.3120237985 -0.0300020767 -0.2500257125  0.0234144857  1.6598706557
# Draw 100 standard normal values from a fixed seed and summarise them.
set.seed(3)
y <- rnorm(100)
mean(y)
## [1] 0.01103557
var(y)
## [1] 0.7328675
# The square root of the variance ...
sqrt(var(y))
## [1] 0.8560768
# ... matches the standard deviation reported by sd().
sd(y)
## [1] 0.8560768
# Typing 3:11 is a shorthand for seq(3,11) for integer arguments.

# seq(from, to) counts from `from` to `to` in steps of one.
x <- seq(1, 10)
x
##  [1]  1  2  3  4  5  6  7  8  9 10
# The colon operator builds the same integer sequence.
x <- 1:10
x
##  [1]  1  2  3  4  5  6  7  8  9 10
# 50 equally spaced points from -pi to pi. The argument name is written out
# in full as length.out instead of relying on partial matching of `length`.
x <- seq(-pi, pi, length.out = 50)
x
##  [1] -3.14159265 -3.01336438 -2.88513611 -2.75690784 -2.62867957
##  [6] -2.50045130 -2.37222302 -2.24399475 -2.11576648 -1.98753821
## [11] -1.85930994 -1.73108167 -1.60285339 -1.47462512 -1.34639685
## [16] -1.21816858 -1.08994031 -0.96171204 -0.83348377 -0.70525549
## [21] -0.57702722 -0.44879895 -0.32057068 -0.19234241 -0.06411414
## [26]  0.06411414  0.19234241  0.32057068  0.44879895  0.57702722
## [31]  0.70525549  0.83348377  0.96171204  1.08994031  1.21816858
## [36]  1.34639685  1.47462512  1.60285339  1.73108167  1.85930994
## [41]  1.98753821  2.11576648  2.24399475  2.37222302  2.50045130
## [46]  2.62867957  2.75690784  2.88513611  3.01336438  3.14159265

Graphics


# Two independent samples of 100 standard normal draws.
x <- rnorm(100)
y <- rnorm(100)
plot(x, y)

# Same scatterplot with axis labels and a title.
plot(x, y,
  xlab = " this is the x-axis", ylab = " this is the y-axis",
  main = "Plot of X vs Y / Scatterplot "
)

# Save a copy of the scatterplot as a PDF. The file name is written without
# the stray leading, inner and trailing spaces so the file is easy to locate.
pdf("Figure.pdf")
plot(x, y, col = " green ")
# dev.off() closes the PDF device, which finishes writing the file.
dev.off()
## png 
##   2
# Open a JPEG device; every plot up to the matching dev.off() is sent to it.
# NOTE(review): the file name contains no page-number format (e.g. %03d), so
# each new plot presumably replaces the previous one in Figure.jpeg -- confirm
# whether only the final figure is meant to be kept.
jpeg("Figure.jpeg")


# 50 equally spaced grid points from -pi to pi (argument name spelled out as
# length.out instead of relying on partial matching of `length`).
x <- seq(-pi, pi, length.out = 50)
x
##  [1] -3.14159265 -3.01336438 -2.88513611 -2.75690784 -2.62867957
##  [6] -2.50045130 -2.37222302 -2.24399475 -2.11576648 -1.98753821
## [11] -1.85930994 -1.73108167 -1.60285339 -1.47462512 -1.34639685
## [16] -1.21816858 -1.08994031 -0.96171204 -0.83348377 -0.70525549
## [21] -0.57702722 -0.44879895 -0.32057068 -0.19234241 -0.06411414
## [26]  0.06411414  0.19234241  0.32057068  0.44879895  0.57702722
## [31]  0.70525549  0.83348377  0.96171204  1.08994031  1.21816858
## [36]  1.34639685  1.47462512  1.60285339  1.73108167  1.85930994
## [41]  1.98753821  2.11576648  2.24399475  2.37222302  2.50045130
## [46]  2.62867957  2.75690784  2.88513611  3.01336438  3.14159265
y <- x
# f[i, j] = cos(y[j]) / (1 + x[i]^2), evaluated for every pair of grid points.
f <- outer(x, y, function(x, y) cos(y) / (1 + x^2))
contour(x, y, f)
# add = TRUE draws the finer set of contour levels on top of the plot above.
contour(x, y, f, nlevels = 45, add = TRUE)
# Half the difference between f and its transpose (the antisymmetric part).
fa <- (f - t(f)) / 2
contour(x, y, fa, nlevels = 15)

# The same surface as a colour-coded image and as perspective plots; theta
# and phi change the angle from which the surface is viewed.
image(x, y, fa)
persp(x, y, fa)
persp(x, y, fa, theta = 30)
persp(x, y, fa, theta = 30, phi = 20)
persp(x, y, fa, theta = 30, phi = 70)
persp(x, y, fa, theta = 30, phi = 40)
# Close the JPEG device opened at the top of this block.
dev.off()
## png 
##   2

Indexing Data


We often wish to examine part of a set of data. Suppose that our data is stored in the matrix A. The examples below show different ways of selecting single elements, rows, columns and sub-matrices of A.

dim() - outputs the number of rows followed by the number of columns of a given matrix.

# A 4 x 4 matrix holding 1..16, filled column by column.
A <- matrix(1:16, nrow = 4, ncol = 4)
A
##      [,1] [,2] [,3] [,4]
## [1,]    1    5    9   13
## [2,]    2    6   10   14
## [3,]    3    7   11   15
## [4,]    4    8   12   16
# Single element: row 2, column 3.
A[2, 3]
## [1] 10
# Rows 1 and 3 crossed with columns 2 and 4.
A[c(1, 3), c(2, 4)]
##      [,1] [,2]
## [1,]    5   13
## [2,]    7   15
# Ranges of rows and columns.
A[1:3, 2:4]
##      [,1] [,2] [,3]
## [1,]    5    9   13
## [2,]    6   10   14
## [3,]    7   11   15
# An empty index keeps every column ...
A[1:2, ]
##      [,1] [,2] [,3] [,4]
## [1,]    1    5    9   13
## [2,]    2    6   10   14
# ... or every row.
A[, 1:2]
##      [,1] [,2]
## [1,]    1    5
## [2,]    2    6
## [3,]    3    7
## [4,]    4    8
# A single row comes back as a plain vector.
A[1, ]
## [1]  1  5  9 13
# A single index counts down the columns, so the fifth entry is A[1, 2].
A[5]
## [1] 5
# Negative indices drop the listed rows or columns.
A[-c(1, 3), ]
##      [,1] [,2] [,3] [,4]
## [1,]    2    6   10   14
## [2,]    4    8   12   16
A[-c(1, 3), -c(1, 3, 4)]
## [1] 6 8
dim(A)
## [1] 4 4
# Remove any existing Auto object; this only warns when none exists.
rm(Auto)
## Warning in rm(Auto): object 'Auto' not found

Loading Data


There are various ways to deal with the missing data. In this case, only five of the rows contain missing observations, and so we choose to use the na.omit() function to simply remove these rows.

#Auto=read.table ("Auto.data ")
#fix(Auto)

# Directory that holds the data files, kept in one place so that both reads
# below use the same location.
data_dir <- "D:/Boston College/MS AE Courses/Spring 2018 - Big Data Econometrics/DataSets"

# Whitespace-delimited file: the first line holds the column names and "?"
# marks a missing value. stringsAsFactors = TRUE is stated explicitly so that
# `name` is read as a factor, as the str() output in this document shows,
# regardless of the default of the R version in use.
Auto <- read.table(
  file.path(data_dir, "Auto.data"),
  header = TRUE, na.strings = "?", stringsAsFactors = TRUE
)
#fix(Auto)

# Read the CSV file with the same options; this assignment replaces the
# object created by read.table() above.
Auto <- read.csv(
  file.path(data_dir, "Auto.csv"),
  header = TRUE, na.strings = "?", stringsAsFactors = TRUE
)

#fix(Auto)
dim(Auto)
## [1] 397   9
# First four rows, all columns.
Auto[1:4, ]
##   mpg cylinders displacement horsepower weight acceleration year origin
## 1  18         8          307        130   3504         12.0   70      1
## 2  15         8          350        165   3693         11.5   70      1
## 3  18         8          318        150   3436         11.0   70      1
## 4  16         8          304        150   3433         12.0   70      1
##                        name
## 1 chevrolet chevelle malibu
## 2         buick skylark 320
## 3        plymouth satellite
## 4             amc rebel sst
# Drop every row that contains at least one missing value.
Auto <- na.omit(Auto)
dim(Auto)
## [1] 392   9
names(Auto)
## [1] "mpg"          "cylinders"    "displacement" "horsepower"  
## [5] "weight"       "acceleration" "year"         "origin"      
## [9] "name"
str(Auto)
## 'data.frame':    392 obs. of  9 variables:
##  $ mpg         : num  18 15 18 16 17 15 14 14 14 15 ...
##  $ cylinders   : int  8 8 8 8 8 8 8 8 8 8 ...
##  $ displacement: num  307 350 318 304 302 429 454 440 455 390 ...
##  $ horsepower  : int  130 165 150 150 140 198 220 215 225 190 ...
##  $ weight      : int  3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
##  $ acceleration: num  12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
##  $ year        : int  70 70 70 70 70 70 70 70 70 70 ...
##  $ origin      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ name        : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
##  - attr(*, "na.action")=Class 'omit'  Named int [1:5] 33 127 331 337 355
##   .. ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...

Data Descriptives


Before exiting R, we may want to save a record of all of the commands that we typed in the most recent session; this can be accomplished using the savehistory() function. Next time we enter R, we can load that history using the loadhistory() function.

#plot(cylinders , mpg)

# Columns can always be reached through the data frame with $.
plot(Auto$cylinders, Auto$mpg)

# attach() places the columns of Auto on the search path so they can be used
# by bare name. NOTE(review): attach() is generally discouraged because of
# name masking; it is kept here because the calls below rely on bare names.
attach(Auto)
plot(cylinders, mpg)

# This creates a new `cylinders` factor in the workspace that takes
# precedence over the attached column; Auto$cylinders itself is not modified.
cylinders <- as.factor(cylinders)

# With a factor on the x-axis, plot() draws one boxplot per level.
plot(cylinders, mpg)

plot(cylinders, mpg, col = "red ")

plot(cylinders, mpg, col = "red", varwidth = TRUE)

plot(cylinders, mpg, col = "red", varwidth = TRUE, horizontal = TRUE)

plot(cylinders, mpg, col = "red", varwidth = TRUE, xlab = " cylinders ", ylab = "MPG ")

# Histograms of mpg: default, coloured, and with a suggested number of breaks.
hist(mpg)

hist(mpg, col = 2)

hist(mpg, col = 2, breaks = 15)

# Scatterplot matrix of every column, then of a chosen subset.
pairs(Auto)

pairs(~ mpg + displacement + horsepower + weight + acceleration, Auto)

# identify() labels points picked with the mouse; the integer(0) below is its
# result when no point was selected.
plot(horsepower, mpg)
identify(horsepower, mpg, name)

## integer(0)
boxplot(Auto$mpg ~ Auto$cylinders)

summary(Auto)
##       mpg          cylinders      displacement     horsepower   
##  Min.   : 9.00   Min.   :3.000   Min.   : 68.0   Min.   : 46.0  
##  1st Qu.:17.00   1st Qu.:4.000   1st Qu.:105.0   1st Qu.: 75.0  
##  Median :22.75   Median :4.000   Median :151.0   Median : 93.5  
##  Mean   :23.45   Mean   :5.472   Mean   :194.4   Mean   :104.5  
##  3rd Qu.:29.00   3rd Qu.:8.000   3rd Qu.:275.8   3rd Qu.:126.0  
##  Max.   :46.60   Max.   :8.000   Max.   :455.0   Max.   :230.0  
##                                                                 
##      weight      acceleration        year           origin     
##  Min.   :1613   Min.   : 8.00   Min.   :70.00   Min.   :1.000  
##  1st Qu.:2225   1st Qu.:13.78   1st Qu.:73.00   1st Qu.:1.000  
##  Median :2804   Median :15.50   Median :76.00   Median :1.000  
##  Mean   :2978   Mean   :15.54   Mean   :75.98   Mean   :1.577  
##  3rd Qu.:3615   3rd Qu.:17.02   3rd Qu.:79.00   3rd Qu.:2.000  
##  Max.   :5140   Max.   :24.80   Max.   :82.00   Max.   :3.000  
##                                                                
##                  name    
##  amc matador       :  5  
##  ford pinto        :  5  
##  toyota corolla    :  5  
##  amc gremlin       :  4  
##  amc hornet        :  4  
##  chevrolet chevette:  4  
##  (Other)           :365
# Summary of a single variable; `mpg` is found by bare name because Auto was
# attached earlier.
summary(mpg)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    9.00   17.00   22.75   23.45   29.00   46.60