Data Analysis

Author

Nagender Aneja

Reference:

r-statistics.co

1 Introduction

1.1 Download:

R from http://cran.r-project.org/
RStudio from http://rstudio.com

Resources * An Introduction to Statistical Learning

1.2 Basic Commands

Functions are used to perform operations

1.2.1 Concatenate to create a Vector

x = c(1,2,3,4,5)
print(x)

[1] 1 2 3 4 5

x <- c(1,2,3,4,5)
print(x)

[1] 1 2 3 4 5

1.2.2 Adding Two Vectors

x = c(1,2,3,4,5)
y = c(1,2,3,4,5)
z = x+y # 2  4  6  8 10
print(z)

[1]  2  4  6  8 10

z = z+2 # 4  6  8 10 12
print(z)

[1]  4  6  8 10 12

x = c(1,2,3,4,5)
k = c(2,3)
print("x+k is Error")

[1] "x+k is Error"

x = c(1,2,3,4,5)
k = c(2)
z = x+k
print(z)

[1] 3 4 5 6 7

1.2.3 Length of vector

x <- c(1,2,3,4,5)
length(x)

[1] 5

y = c(1,2,3,4,5)
length(y)

[1] 5

k = c(2)
length(k)

[1] 1

1.2.4 List of All Objects and Remove Object(s)

print(ls())

[1] "k" "x" "y" "z"

rm(k, z)
print(ls())

[1] "x" "y"

rm(list=ls())
print(ls())

character(0)

1.2.5 Creating Function

f = function(x, y) x^2 + y^2
print(f(10, 10))

[1] 200

f = function(x, y){ # Shift+Enter
     z = x^2 + y^2
     z
}
print(f(10, 10))

[1] 200

1.2.6 Matrix

?matrix
x = matrix(data=c(1, 2, 3, 4), nrow=2, ncol=2)
print(x)

     [,1] [,2]
[1,]    1    3
[2,]    2    4

x = matrix(c(1, 2, 3, 4), 2, 2)
print(x)

     [,1] [,2]
[1,]    1    3
[2,]    2    4

x = matrix(nrow=2, ncol=2, data=c(1, 2, 3, 4))
print(x)

     [,1] [,2]
[1,]    1    3
[2,]    2    4

x = matrix(data=c(1, 2, 3, 4), nrow=2, ncol=2)
dim(x)

[1] 2 2

x = 1:5
y = x
z = x %o% y # outer product of vectors
print(z)

     [,1] [,2] [,3] [,4] [,5]
[1,]    1    2    3    4    5
[2,]    2    4    6    8   10
[3,]    3    6    9   12   15
[4,]    4    8   12   16   20
[5,]    5   10   15   20   25

x = 1:5
y = x
z = outer(x,y)
print(z)

     [,1] [,2] [,3] [,4] [,5]
[1,]    1    2    3    4    5
[2,]    2    4    6    8   10
[3,]    3    6    9   12   15
[4,]    4    8   12   16   20
[5,]    5   10   15   20   25

x = 1:5
y = x
z = outer(x, y, "+")
print(z)

     [,1] [,2] [,3] [,4] [,5]
[1,]    2    3    4    5    6
[2,]    3    4    5    6    7
[3,]    4    5    6    7    8
[4,]    5    6    7    8    9
[5,]    6    7    8    9   10

x = 1:5
y = x
f = function(x, y) x^2 + y^2
z = outer(x, y, f)
print(z)

     [,1] [,2] [,3] [,4] [,5]
[1,]    2    5   10   17   26
[2,]    5    8   13   20   29
[3,]   10   13   18   25   34
[4,]   17   20   25   32   41
[5,]   26   29   34   41   50

1.3 Other Functions

x <- 4
x = sqrt(x)

x <- 2
x = x^2
print(x)

[1] 4

1.3.1 rnorm() generates vector of normal variables with mean 0 and std 1

x = rnorm(5)
print(x)

[1]  0.1607087 -0.3911897  0.6331766  1.1846232 -0.5664601

y = rnorm(5, mean=50, sd=5)
print(y)

[1] 50.44063 57.69443 50.75348 51.55136 47.94516

1.3.2 Correlation

x = rnorm(5)
y = rnorm(5, mean=50, sd=5)
cor(x, y)

[1] 0.2691936

1.3.3 Seed

x = rnorm(5);
print(x)

[1] -0.07537298 -1.31504100  0.84138315  0.12311147  0.77717263

x = rnorm(5);
print(x)

[1] 0.92607212 1.54560344 1.55898802 0.93918978 0.05363251

set.seed(5); x = rnorm(5); x

[1] -0.84085548  1.38435934 -1.25549186  0.07014277  1.71144087

set.seed(5); x = rnorm(5); x

[1] -0.84085548  1.38435934 -1.25549186  0.07014277  1.71144087

1.3.4 Mean, Variance, Standard Deviation

mean(x); var(x); sd(x)

[1] 0.2139191

[1] 1.726223

[1] 1.313858

1.4 Graphics

# rnorm() normal variables with mean 0 and std 1
x = rnorm(100)
y = rnorm(100)
plot(x,y)

plot(x, y, xlim=c(0,2), ylim=c(0,2))

# "l" for lines and "b" for both points and lines
plot(x, y, type="p")

# "l" for lines and "b" for both points and lines
plot(x, y, type="b")

# xlabel and ylabel
plot(x,y, main="Scatter Plot", xlab="x-axis", ylab="y-axis")

1.4.1 Saving plot in pdf

pdf(file = "xy.pdf")
plot(x,y, main="xy plot", xlab="x-axis", ylab="y-axis")
dev.off() # complete plotting

quartz_off_screen 
                2

1.4.2 Saving plot in jpg

jpeg(file = "xy.jpg")
plot(x,y, main="xy plot", xlab="x-axis", ylab="y-axis")
dev.off() # complete plotting

quartz_off_screen 
                2

1.5 Sequence of Numbers

x = seq(1, 10)
x

 [1]  1  2  3  4  5  6  7  8  9 10

x = 1:10
x

 [1]  1  2  3  4  5  6  7  8  9 10

1.5.1 Sequence for vector of integers equally spaced

x = seq(1, 10, length=5)
x

[1]  1.00  3.25  5.50  7.75 10.00

x <- seq(-pi, pi, length = 5)
x

[1] -3.141593 -1.570796  0.000000  1.570796  3.141593

1.6 3D Plots

First dimension: a vector of the x values
Second dimension: a vector of the y values
Third dimension: a matrix of the z values whose elements correspond to each pair of (x, y) coordinates

x = 1:10
y = x
f = function(x, y) cos(y) / (1 + x^2)
z = outer(x, y, f)

1.6.1 3D Plot

persp(x, y, z)

persp(x, y, z, theta=30)

persp(x, y, z, theta=30, phi=70)

persp(x, y, z, theta=30, phi=40)

1.6.2 Heatmap based on values of z

image(x, y, z)

1.7 Indexing Data

A = matrix(1:16, 4, 4)
A

     [,1] [,2] [,3] [,4]
[1,]    1    5    9   13
[2,]    2    6   10   14
[3,]    3    7   11   15
[4,]    4    8   12   16

A[4,4]

[1] 16

A[c(1,2,3), c(1,2,3)]

     [,1] [,2] [,3]
[1,]    1    5    9
[2,]    2    6   10
[3,]    3    7   11

A[1:3, 1:3]

     [,1] [,2] [,3]
[1,]    1    5    9
[2,]    2    6   10
[3,]    3    7   11

A[4,]

[1]  4  8 12 16

A[,4]

[1] 13 14 15 16

A[1:3,]

     [,1] [,2] [,3] [,4]
[1,]    1    5    9   13
[2,]    2    6   10   14
[3,]    3    7   11   15

A[,1:3]

     [,1] [,2] [,3]
[1,]    1    5    9
[2,]    2    6   10
[3,]    3    7   11
[4,]    4    8   12

A[-1,]

     [,1] [,2] [,3] [,4]
[1,]    2    6   10   14
[2,]    3    7   11   15
[3,]    4    8   12   16

A[-c(1,2),]

     [,1] [,2] [,3] [,4]
[1,]    3    7   11   15
[2,]    4    8   12   16

A[-c(1,4),]

     [,1] [,2] [,3] [,4]
[1,]    2    6   10   14
[2,]    3    7   11   15

2 Auto Dataset from ISLR2

#install.packages("ISLR2")
library(ISLR2)

Auto = ISLR2::Auto
#Auto = read.csv("data/auto.csv")

head(Auto)

  mpg cylinders displacement horsepower weight acceleration year origin
1  18         8          307        130   3504         12.0   70      1
2  15         8          350        165   3693         11.5   70      1
3  18         8          318        150   3436         11.0   70      1
4  16         8          304        150   3433         12.0   70      1
5  17         8          302        140   3449         10.5   70      1
6  15         8          429        198   4341         10.0   70      1
                       name
1 chevrolet chevelle malibu
2         buick skylark 320
3        plymouth satellite
4             amc rebel sst
5               ford torino
6          ford galaxie 500

dim(Auto)

[1] 392   9

Auto[1:4, ]

  mpg cylinders displacement horsepower weight acceleration year origin
1  18         8          307        130   3504         12.0   70      1
2  15         8          350        165   3693         11.5   70      1
3  18         8          318        150   3436         11.0   70      1
4  16         8          304        150   3433         12.0   70      1
                       name
1 chevrolet chevelle malibu
2         buick skylark 320
3        plymouth satellite
4             amc rebel sst

Auto <- na.omit(Auto)
dim(Auto)

[1] 392   9

2.1 Get names of an object

names(Auto)

[1] "mpg"          "cylinders"    "displacement" "horsepower"   "weight"      
[6] "acceleration" "year"         "origin"       "name"

2.2 Set names of an object

x = 1:3
names(x) = c("a","b","c")
x

a b c 
1 2 3

2.3 Change Working Director

RStudio -> Session → Set Working Directory -> source file location
getwd()
setwd(“/Users/data”)
na.strings
- chars replaced with NA (missing element)
stringsAsFactors = T
- any variable containing character strings should be interpreted as a qualitative variable, and that each distinct character string represents a distinct level for that qualitative variable.

Auto = read.csv("data/auto.csv", header = T, na.strings = c("?"), stringsAsFactors = T)

names(Auto)

[1] "mpg"          "cylinders"    "displacement" "horsepower"   "weight"      
[6] "acceleration" "year"         "origin"       "name"

Auto[1:4, ]

  mpg cylinders displacement horsepower weight acceleration year origin
1  18         8          307        130   3504         12.0   70      1
2  15         8          350        165   3693         11.5   70      1
3  18         8          318        150   3436         11.0   70      1
4  16         8          304        150   3433         12.0   70      1
                       name
1 chevrolet chevelle malibu
2         buick skylark 320
3        plymouth satellite
4             amc rebel sst

Auto <- na.omit(Auto)
dim(Auto)

[1] 392   9

3 Additional Graphical and Numerical Summaries

3.1 Scatterplot of quantitative variables

library(ISLR2)
Auto = ISLR2::Auto
names(Auto)

[1] "mpg"          "cylinders"    "displacement" "horsepower"   "weight"      
[6] "acceleration" "year"         "origin"       "name"

plot(Auto$cylinders , Auto$mpg)

attach(Auto)
plot(cylinders , mpg)

detach()

3.2 Converting Quantative to Qualitative

Cylinders variable has small number of possible values so may be converted to qualitative

3.2.1 Box Plot

Auto$cylinders <- as.factor(Auto$cylinders)
plot(Auto$cylinders , Auto$mpg)

plot(Auto$cylinders , Auto$mpg , col="red")

plot(Auto$cylinders, Auto$mpg , col="red", varwidth=T)

plot(Auto$cylinders, Auto$mpg , col="red", varwidth=T, horizontal=T)

plot(Auto$cylinders, Auto$mpg, col="red", varwidth=T, xlab="cylinders", ylab="MPG")

3.3 Histogram

3.4 Numeric Variable

hist(Auto$mpg)

hist(Auto$mpg , col=2) # red

3.4.1 breaks is suggestion only

hist(Auto$mpg , col=2, breaks=15)

3.5 Pairs - Scatterplot Matrix

3.5.1 for every pair of variables

pairs(Auto)

3.5.2 pairs for subset of variables

pairs(~ mpg + displacement + horsepower + weight + acceleration, data=Auto)

4 Identify

Select points to be labeled and Esc to see labels

# Label some points with a variable
names(Auto) # Field names

[1] "mpg"          "cylinders"    "displacement" "horsepower"   "weight"      
[6] "acceleration" "year"         "origin"       "name"

plot(Auto$horsepower , Auto$mpg)
identify(Auto$horsepower, Auto$mpg, Auto$name) #

integer(0)

5 Summary

5.1 Numeric summary of each variable

summary(Auto)

      mpg        cylinders  displacement     horsepower        weight    
 Min.   : 9.00   3:  4     Min.   : 68.0   Min.   : 46.0   Min.   :1613  
 1st Qu.:17.00   4:199     1st Qu.:105.0   1st Qu.: 75.0   1st Qu.:2225  
 Median :22.75   5:  3     Median :151.0   Median : 93.5   Median :2804  
 Mean   :23.45   6: 83     Mean   :194.4   Mean   :104.5   Mean   :2978  
 3rd Qu.:29.00   8:103     3rd Qu.:275.8   3rd Qu.:126.0   3rd Qu.:3615  
 Max.   :46.60             Max.   :455.0   Max.   :230.0   Max.   :5140  
                                                                         
  acceleration        year           origin                      name    
 Min.   : 8.00   Min.   :70.00   Min.   :1.000   amc matador       :  5  
 1st Qu.:13.78   1st Qu.:73.00   1st Qu.:1.000   ford pinto        :  5  
 Median :15.50   Median :76.00   Median :1.000   toyota corolla    :  5  
 Mean   :15.54   Mean   :75.98   Mean   :1.577   amc gremlin       :  4  
 3rd Qu.:17.02   3rd Qu.:79.00   3rd Qu.:2.000   amc hornet        :  4  
 Max.   :24.80   Max.   :82.00   Max.   :3.000   chevrolet chevette:  4  
                                                 (Other)           :365

5.2 Summary of single variable

summary(Auto$mpg)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   9.00   17.00   22.75   23.45   29.00   46.60

6 ggplot2 Setup

make plots by adding layers
- tell ggplot what dataset (dataframe) to use, ggplot(df)
  - ggplot doesn’t take vectors as arguments
- add whatever aesthetics, aes()

library(ggplot2)

dim(diamonds)

[1] 53940    10

head(diamonds)

# A tibble: 6 × 10
  carat cut       color clarity depth table price     x     y     z
  <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75
6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48

#View(diamonds)
names(diamonds)

 [1] "carat"   "cut"     "color"   "clarity" "depth"   "table"   "price"  
 [8] "x"       "y"       "z"

# "carat"   "cut"     "color"   "clarity" "depth"   "table"   "price"  "x"       "y"       "z"

# if only the dataset is known
ggplot(diamonds)

# if only X-axis is known. The Y-axis can be specified in respective geoms
ggplot(diamonds, aes(x=carat))

# if both X and Y axes are fixed for all layers
ggplot(diamonds, 
       aes(x=carat, y=price))

# Each category of the 'cut' variable will now have a distinct  color, once a geom is added
ggplot(diamonds, 
       aes(x=carat, color=cut))

# Fix color and will not vary based on dataframe variable
ggplot(diamonds, 
       aes(x=carat),
       color="steelblue")

7 Layers

# Adding scatterplot geom (layer1) and smoothing geom (layer2)
ggplot(diamonds, 
       aes(x=carat, y=price, 
           color=cut)) + 
  geom_point() + 
  geom_smooth()

`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# Same as above but specifying the aesthetics inside the geoms
ggplot(diamonds) + 
  geom_point(aes(x=carat, y=price, 
                 color=cut)) + 
  geom_smooth(aes(x=carat, y=price,
                  color=cut))

`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# Remove color from geom_smooth for one smoothing line
ggplot(diamonds) + 
  geom_point(aes(x=carat, y=price, 
                 color=cut)) + 
  geom_smooth(aes(x=carat, y=price))

`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# same but simpler
ggplot(diamonds, aes(x=carat, 
                     y=price)) + 
  geom_point(aes(color=cut)) + 
  geom_smooth()

`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# shape of the points vary with color feature?
ggplot(diamonds, 
       aes(x=carat, y=price, 
           color=cut, shape=color)) + 
  geom_point()

Warning: Using shapes for an ordinal variable is not advised

Warning: The shape palette can deal with a maximum of 6 discrete values because
more than 6 becomes difficult to discriminate; you have 7. Consider
specifying shapes manually if you must have them.

Warning: Removed 2808 rows containing missing values (`geom_point()`).

8 Labels

# add axis lables and plot title.
gg <- ggplot(diamonds, aes(x=carat, y=price, color=cut)) + 
  geom_point() + 
  labs(title="Scatterplot", x="Carat", y="Price")

print(gg)

9 Theme

# add title and axis text, change legend title.
gg1 = gg + theme(
  plot.title = element_text(size=30,
                        face="bold"), 
  axis.text.x = element_text(size=15),
  axis.text.y = element_text(size=15),
  axis.title.x = element_text(size=25),
  axis.title.y = element_text(size=25)) + 
  scale_color_discrete(name="Cut of diamonds")
print(gg1)  # print the plot

If the legend shows a shape attribute based on a factor variable
- scale_shape_discrete(name="legend title")
Had it been a continuous variable,
- scale_shape_continuous(name="legend title")

10 Facets

# row ~ column

# columns defined by 'cut'
gg1 + facet_wrap( ~ cut, ncol=3)

# row: color, column: cut
gg1 + facet_wrap(color ~ cut)

# In a grid
gg1 + facet_grid(color ~ cut)

11 Other Charts

# Bar Charts

# Y axis derived from counts of X item
plot1 = ggplot(mtcars, aes(x=cyl)) + 
  geom_bar() + 
  labs(title="Frequency bar chart")
print(plot1)

# Y axis is explicit in the dataframe 'stat=identity'
df <- data.frame(var=c("a", "b", "c"), nums=c(1:3))
#View(df)
plot2 <- ggplot(df, aes(x=var, 
                        y=nums)) +
  geom_bar(stat = "identity")
print(plot2)

# Flipping coordinates
df <- data.frame(var=c("a", "b", "c"), nums=c(1:3))
ggplot(df, aes(x=var, y=nums)) + 
  geom_bar(stat = "identity") + 
  coord_flip() + 
  labs(title="Coordinates are flipped")

# Adjust X and Y axis limits
# coord_cartesian(xlim=c(x1,x2))
# xlim(c(x1,x2)) # delete datapoints
# scale_x_continuous(limits=c(x1,x2)) # delete datapoints

# Coord_cartesian zoomed in
ggplot(diamonds, aes(x=carat, y=price)) + 
  geom_point(aes(color=cut)) + 
  geom_smooth() + 
  coord_cartesian(ylim=c(0, 10000)) + 
  labs(title="Coord_cartesian zoomed in!")

`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# Datapoints deleted: change in smoothing lines
ggplot(diamonds, aes(x=carat, y=price)) + 
  geom_point(aes(color=cut)) + 
  geom_smooth() + 
  ylim(c(0, 10000)) + 
  labs(title="Datapoints deleted: Note the change in smoothing lines!")

`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Warning: Removed 5222 rows containing non-finite values (`stat_smooth()`).

Warning: Removed 5222 rows containing missing values (`geom_point()`).

12 Function

# print squares of numbers
squares = function(n) {
  for(i in 1:n) {
    print(i^2)
  }
}   
squares(5)

[1] 1
[1] 4
[1] 9
[1] 16
[1] 25

# add squares of numbers
squares = function(a, b) {
  return(a^2 + b^2)
}   
result = squares(a=5, b=5)
print(result)

[1] 50

# Lazy Evaluation of Function
lazyfn <- function(a, b) {
  print(a)
  print(b) # error only when it is needed
}
#lazyfn(5)

13 Decision

# Check if Odd or Even
x = 6
if(x %% 2 == 0){
  print("x is even")
}else{
  print("x is odd")
}

[1] "x is even"

x = 0
if (x < 0) {
  print("Negative number")
} else if (x > 0) {
  print("Positive number")
} else{
  print("Zero")
}

[1] "Zero"

14 Loop

for (x in 1:5) {
  print(x)
}

[1] 1
[1] 2
[1] 3
[1] 4
[1] 5

days <- list("monday", "tuesday", "wednesday", "thursday", "friday")
for (x in days) {
  print(x)
}

[1] "monday"
[1] "tuesday"
[1] "wednesday"
[1] "thursday"
[1] "friday"

dice <- c(1, 2, 3, 4, 5, 6)
for (x in dice) {
  print(x)
}

[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
[1] 6

15 Charts

library(ISLR2)

Boston = ISLR2::Boston

dim(Boston)

[1] 506  13

#View(Boston)
attach(Boston)

unique(Boston$rad)

[1]  1  2  3  5  4  8  6  7 24

table_rad = table(Boston$rad) # freqency of unique values
df = as.data.frame(table_rad)
print(df)

  Var1 Freq
1    1   20
2    2   24
3    3   38
4    4  110
5    5  115
6    6   26
7    7   17
8    8   24
9   24  132

pie(x=df$Freq, labels=df$Var1, main="Pie")

barplot(df$Freq, names.arg=df$Var1, main="Barplot")

boxplot(age, data=Boston, main="Box Plot")

hist(Boston$age, main="Histogram")

plot(Boston$medv, type="l", main="Line")

plot(Boston$medv, type="o", main="Line and Points")

plot(x=Boston$lstat, y=Boston$medv, main="Scatter")