### Question 1: Use the built-in dataset ‘airquality’

1a) Explore the general features of the dataset using appropriate R functions.

require(ggplot2)
## Loading required package: ggplot2
require(moments)
## Loading required package: moments
aq <- data.frame(airquality)
str(aq) 
## 'data.frame':    153 obs. of  6 variables:
##  $ Ozone  : int  41 36 12 18 NA 28 23 19 8 NA ...
##  $ Solar.R: int  190 118 149 313 NA NA 299 99 19 194 ...
##  $ Wind   : num  7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
##  $ Temp   : int  67 72 74 62 56 66 65 59 61 69 ...
##  $ Month  : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ Day    : int  1 2 3 4 5 6 7 8 9 10 ...

Day and Month should be factor variables:

aq$Day <- factor(aq$Day, levels=c(1:31), ordered=TRUE)
aq$Month <- factor(aq$Month, levels=5:9, labels=month.abb[5:9], ordered=TRUE)
summary(aq)
##      Ozone           Solar.R           Wind             Temp      
##  Min.   :  1.00   Min.   :  7.0   Min.   : 1.700   Min.   :56.00  
##  1st Qu.: 18.00   1st Qu.:115.8   1st Qu.: 7.400   1st Qu.:72.00  
##  Median : 31.50   Median :205.0   Median : 9.700   Median :79.00  
##  Mean   : 42.13   Mean   :185.9   Mean   : 9.958   Mean   :77.88  
##  3rd Qu.: 63.25   3rd Qu.:258.8   3rd Qu.:11.500   3rd Qu.:85.00  
##  Max.   :168.00   Max.   :334.0   Max.   :20.700   Max.   :97.00  
##  NA's   :37       NA's   :7                                       
##  Month         Day     
##  May:31   1      :  5  
##  Jun:30   2      :  5  
##  Jul:31   3      :  5  
##  Aug:31   4      :  5  
##  Sep:30   5      :  5  
##           6      :  5  
##           (Other):123

1b) Perform data cleansing if required.

colSums(is.na(aq))
##   Ozone Solar.R    Wind    Temp   Month     Day 
##      37       7       0       0       0       0

There are missing values (NA) in the Ozone and Solar.R columns.

Removing rows with NA values (note that na.omit() drops rows, not columns):

aq_new<-na.omit(aq)
colSums(is.na(aq_new))
##   Ozone Solar.R    Wind    Temp   Month     Day 
##       0       0       0       0       0       0

1c) Consider the ‘Temp’ attribute and compute the central and variational measures.

summary(aq_new$Temp)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   57.00   71.00   79.00   77.79   84.50   97.00
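
summary() above gives the central measures (mean, median, quartiles). A minimal sketch of the variational measures, using base R functions, could be:

var(aq_new$Temp)    # variance
sd(aq_new$Temp)     # standard deviation
IQR(aq_new$Temp)    # interquartile range
range(aq_new$Temp)  # minimum and maximum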
ggplot(data=aq_new) + geom_histogram(mapping=aes(Temp))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

skewness(aq_new$Temp)
## [1] -0.2250959

The distribution is slightly negatively skewed (skewness ≈ -0.23), as the histogram also suggests.

kurtosis(aq_new$Temp)
## [1] 2.331955

With a kurtosis of about 2.33, slightly below the value of 3 for a normal distribution, the distribution is close to normal (mildly platykurtic).

1d) Apply the boxplot technique to detect outliers of the ‘Wind’ attribute, if any.

qplot(Month,Wind, data=aq_new, geom="boxplot", color=Month)

Inference: there are outliers in Wind for the months of May and June.

qplot(Day,Wind, data=aq_new, geom="boxplot", color=Day)

There are outliers on days 1, 8, 17, 18, 19, and 20.
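
To list the flagged values programmatically, a short sketch using boxplot.stats() from base R (the same 1.5 × IQR rule the boxplot applies) could be:

# Wind values flagged as outliers by the boxplot rule, per month
by(aq_new$Wind, aq_new$Month, function(w) boxplot.stats(w)$out)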

### Question 2

Use the dataset available at http://users.stat.ufl.edu/~winner/data/nfl2008_fga.csv, then:

2a) Train the model using 80% of this dataset and suggest an appropriate GLM to model homekick on the togo, ydline and kicker variables.

Reading the base data directly from the URL:

base_data <- read.csv("http://users.stat.ufl.edu/~winner/data/nfl2008_fga.csv")

subset_data<-base_data[,c("homekick","togo","ydline","kicker")]

subset_data$homekick<-as.factor(subset_data$homekick)

Dividing the dataset into training and testing sets. Since homekick is binary (0/1), a binomial GLM, i.e. logistic regression, is the appropriate model. Note: although the question asks for an 80/20 split, the code below uses p = 0.75, i.e. a 75/25 split, and all outputs that follow are based on it; set p = 0.80 in createDataPartition for an exact 80/20 split.

library(caret) #this package has the createDataPartition function
## Loading required package: lattice
 set.seed(123) # for reproducible randomization

 #creating indices
 trainIndex <- createDataPartition(subset_data$homekick,p=0.75,list=FALSE)

 #splitting data into training/testing data using the trainIndex object
 TRAIN <- subset_data[trainIndex,] #training data (75% of data)

 TEST <- subset_data[-trainIndex,] #testing data (25% of data)
model1 <- glm(homekick ~ togo + ydline + kicker, data = TRAIN, family = binomial)
summary(model1)
## 
## Call:
## glm(formula = homekick ~ togo + ydline + kicker, family = binomial, 
##     data = TRAIN)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.3732  -1.1663  -0.9302   1.1742   1.5095  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  0.329492   0.219198   1.503   0.1328  
## togo        -0.043578   0.018534  -2.351   0.0187 *
## ydline       0.005982   0.007810   0.766   0.4438  
## kicker      -0.008370   0.006427  -1.302   0.1928  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1078.4  on 777  degrees of freedom
## Residual deviance: 1071.1  on 774  degrees of freedom
##   (2 observations deleted due to missingness)
## AIC: 1079.1
## 
## Number of Fisher Scoring iterations: 4

2b) Specify the significant variables on homekick at the level of 𝛼 = 0.05, and estimate the parameters of your model.

At 𝛼 = 0.05, the only significant variable is togo (p = 0.0187); ydline and kicker are not significant, so the model is refitted with togo alone.

model2 <- glm(homekick ~ togo, data = TRAIN, family = binomial)
summary(model2)
## 
## Call:
## glm(formula = homekick ~ togo, family = binomial, data = TRAIN)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2659  -1.1657  -0.9685   1.1726   1.4453  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  0.24452    0.14107   1.733   0.0830 .
## togo        -0.03888    0.01751  -2.220   0.0264 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1078.4  on 777  degrees of freedom
## Residual deviance: 1073.4  on 776  degrees of freedom
##   (2 observations deleted due to missingness)
## AIC: 1077.4
## 
## Number of Fisher Scoring iterations: 3

2c) Predict the test dataset using the trained model.

predicted_probability <- predict(model2, TEST, type = "response")  # P(homekick = 1)
pred <- ifelse(predicted_probability > 0.5, 1, 0)                  # classify at the 0.5 threshold

2d) Provide the confusion matrix and obtain the probability of correctness of the predictions.

confusionMatrix(as.factor(pred), as.factor(TEST$homekick))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 48 50
##          1 83 78
##                                           
##                Accuracy : 0.4865          
##                  95% CI : (0.4241, 0.5491)
##     No Information Rate : 0.5058          
##     P-Value [Acc > NIR] : 0.752874        
##                                           
##                   Kappa : -0.0241         
##  Mcnemar's Test P-Value : 0.005524        
##                                           
##             Sensitivity : 0.3664          
##             Specificity : 0.6094          
##          Pos Pred Value : 0.4898          
##          Neg Pred Value : 0.4845          
##              Prevalence : 0.5058          
##          Detection Rate : 0.1853          
##    Detection Prevalence : 0.3784          
##       Balanced Accuracy : 0.4879          
##                                           
##        'Positive' Class : 0               
## 
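
The probability of a correct prediction is the accuracy, about 0.4865, which is below the no-information rate of 0.5058, so this single-predictor model performs no better than always guessing the majority class. The accuracy can also be computed directly (a one-line check using pred and TEST from above):

mean(as.character(pred) == as.character(TEST$homekick), na.rm = TRUE)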

### Question 3

Using the Yahoo Finance API, select a specific stock market price and apply time series analysis, considering ‘close price’ as the time series variable:

Fetching the stock data for PBR:

library(quantmod)
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: TTR
## Version 0.4-0 included new data defaults. See ?getSymbols.
pbr <- getSymbols("PBR", src = "yahoo", from = "2013-01-01", to = "2017-06-01", auto.assign = FALSE)
## 'getSymbols' currently uses auto.assign=TRUE by default, but will
## use auto.assign=FALSE in 0.5-0. You will still be able to use
## 'loadSymbols' to automatically load data. getOption("getSymbols.env")
## and getOption("getSymbols.auto.assign") will still be checked for
## alternate defaults.
## 
## This message is shown once per session and may be disabled by setting 
## options("getSymbols.warning4.0"=FALSE). See ?getSymbols for details.
## 
## WARNING: There have been significant changes to Yahoo Finance data.
## Please see the Warning section of '?getSymbols.yahoo' for details.
## 
## This message is shown once per session and may be disabled by setting
## options("getSymbols.yahoo.warning"=FALSE).

The close price (PBR.Close) is the time series variable.

3a) Validate the assumptions using graphical visualization.

chartSeries(pbr,theme="white")

library(forecast)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(tseries)
library(fpp)
## Loading required package: fma
## Loading required package: expsmooth
## Loading required package: lmtest
library(ggplot2)
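
Beyond the candlestick chart, a short sketch for inspecting the series graphically (assuming the quantmod and forecast packages loaded above) could be:

close_price <- as.numeric(Cl(pbr))  # Cl() extracts the close column
ggtsdisplay(close_price)            # series with its ACF and PACF
ggtsdisplay(diff(close_price))      # the first difference is usually much closer to stationary for prices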

3b) Fit the optimized model for ‘close price’ and provide the coefficient estimates for the fitted model.

Date <- index(pbr)  # dates of the observations; the close price is the variable to be modelled



# as.integer() truncates prices to whole dollars; kept here to match the fitted output below
Count <- ts(as.integer(pbr$PBR.Close), start = c(2013), frequency = 365)
plot(Count)

modArima <- auto.arima(Count, D = 0)
modArima
## Series: Count 
## ARIMA(2,1,0) 
## 
## Coefficients:
##           ar1     ar2
##       -0.2478  -0.002
## s.e.   0.0300   0.030
## 
## sigma^2 estimated as 0.2567:  log likelihood=-819.36
## AIC=1644.72   AICc=1644.74   BIC=1659.75

auto.arima selects ARIMA(2,1,0): an AR order of 2 with one difference and no MA terms (AR = 2, MA = 0).
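
As a quick adequacy check, the residuals of the fitted model can be inspected (a sketch, assuming a forecast version that provides checkresiduals()):

checkresiduals(modArima)  # residual plot, ACF and Ljung-Box test for the ARIMA(2,1,0) fit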

3d) Forecast h = 10 steps ahead and show the prediction on the plot of the original time series.

Forecasted_values <- forecast(modArima, h = 10)
Final_forecasted_values <- Forecasted_values$mean  # the 10 point forecasts

autoplot(Count, series = "Data") +
  autolayer(Forecasted_values, series = "Forecast") +
  autolayer(fitted(Forecasted_values), series = "Fitted")
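
Printing the forecast object lists the ten point forecasts together with their 80% and 95% prediction intervals:

Forecasted_values  # point forecasts with prediction intervals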

### Question 4

4a) Use LDA to classify the dataset into a few classes so that at least 90% of the information in the dataset is explained through the new classification. (Hint: model the variable “qtr” on the variables “togo”, “kicker”, and “ydline”.) How many LDs do you choose? Explain the reason.

subset_data2<-base_data[,c("qtr","togo","ydline","kicker")]

subset_data2$qtr<-as.factor(subset_data2$qtr)
library(MASS)
## 
## Attaching package: 'MASS'
## The following objects are masked from 'package:fma':
## 
##     cement, housing, petrol
library(caret)
lda.model <- lda(qtr ~ togo + ydline + kicker, data = subset_data2)
lda.model
## Call:
## lda(qtr ~ togo + ydline + kicker, data = subset_data2)
## 
## Prior probabilities of groups:
##          1          2          3          4          5 
## 0.20636451 0.35969142 0.17550627 0.24590164 0.01253616 
## 
## Group means:
##       togo   ydline   kicker
## 1 6.481308 17.22897 19.64486
## 2 6.973190 19.30027 18.77212
## 3 6.543956 19.03297 19.96703
## 4 6.792157 18.53725 20.20000
## 5 5.923077 19.53846 22.61538
## 
## Coefficients of linear discriminants:
##                LD1         LD2         LD3
## togo    0.06665269  0.12498308  0.20996464
## ydline  0.07726467 -0.07173243 -0.02257770
## kicker -0.04134867 -0.06009657  0.05013225
## 
## Proportion of trace:
##   LD1   LD2   LD3 
## 0.615 0.322 0.063

With 5 classes and 3 predictors, lda() produces the maximum possible min(5 − 1, 3) = 3 discriminants. From the proportion of trace, LD1 and LD2 together explain 0.615 + 0.322 = 93.7% of the between-class variance, so choosing the first two LDs is enough to retain at least 90% of the information.
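
The proportion of trace can also be recovered from the singular values stored in the fitted object (a small sketch using lda.model from above):

prop_trace <- lda.model$svd^2 / sum(lda.model$svd^2)  # between-class variance per LD
cumsum(prop_trace)                                    # LD1 + LD2 already exceed 0.90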

4b) Apply PCA, and identify the important principal components involving at least 90% of the dataset variation. Explain your decision strategy. Plot the principal components versus their variance (Hint: use the scree plot).

library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
subset_data3<-base_data[,c("togo","ydline","kicker","distance","homekick","kickdiff")]
subset_data3<-na.omit(subset_data3)
res.pca <- prcomp(subset_data3, scale = TRUE)
fviz_eig(res.pca)

The cumulative variance explained reaches 90% at the fifth principal component, so PC1 through PC5 are retained.
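
This decision can be verified numerically from the res.pca object (the cumulative proportion row of the importance table):

summary(res.pca)$importance["Cumulative Proportion", ]  # the 90% threshold is crossed at PC5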

4c) Split the dataset into two sets of variables so that X = (togo, kicker, ydline) and Y = (distance, homekick). Apply canonical correlation analysis to find the cross-correlation between X and Y. What is the correlation between ydline and distance?

subset_data4<-base_data[,c("togo","kicker","ydline","distance","homekick")]
X<-base_data[,c("togo","kicker","ydline")]

Y <-base_data[,c("distance","homekick")]
library("CCA")
## Warning: package 'CCA' was built under R version 3.5.3
## Loading required package: fda
## Warning: package 'fda' was built under R version 3.5.3
## Loading required package: splines
## Loading required package: Matrix
## 
## Attaching package: 'fda'
## The following object is masked from 'package:forecast':
## 
##     fourier
## The following object is masked from 'package:graphics':
## 
##     matplot
## Loading required package: fields
## Warning: package 'fields' was built under R version 3.5.3
## Loading required package: spam
## Warning: package 'spam' was built under R version 3.5.3
## Loading required package: dotCall64
## Warning: package 'dotCall64' was built under R version 3.5.3
## Loading required package: grid
## Spam version 2.5-1 (2019-12-12) is loaded.
## Type 'help( Spam)' or 'demo( spam)' for a short introduction 
## and overview of this package.
## Help for individual functions is also obtained by adding the
## suffix '.spam' to the function name, e.g. 'help( chol.spam)'.
## 
## Attaching package: 'spam'
## The following object is masked from 'package:Matrix':
## 
##     det
## The following objects are masked from 'package:base':
## 
##     backsolve, forwardsolve
## Loading required package: maps
## Warning: package 'maps' was built under R version 3.5.1
## 
## Attaching package: 'maps'
## The following object is masked from 'package:fma':
## 
##     ozone
## See https://github.com/NCAR/Fields for
##  an extensive vignette, other supplements and source code
correl <- matcor(X, Y )
img.matcor(correl, type = 2)

correl
## $Xcor
##               togo      kicker      ydline
## togo    1.00000000 -0.01387536  0.31449850
## kicker -0.01387536  1.00000000 -0.01737314
## ydline  0.31449850 -0.01737314  1.00000000
## 
## $Ycor
##            distance   homekick
## distance 1.00000000 0.04921831
## homekick 0.04921831 1.00000000
## 
## $XYcor
##                 togo       kicker      ydline     distance    homekick
## togo      1.00000000 -0.013875355  0.31449850  0.315641454 -0.04838438
## kicker   -0.01387536  1.000000000 -0.01737314 -0.008798867 -0.02573680
## ydline    0.31449850 -0.017373137  1.00000000  0.988479497  0.05373301
## distance  0.31564145 -0.008798867  0.98847950  1.000000000  0.04921831
## homekick -0.04838438 -0.025736803  0.05373301  0.049218308  1.00000000
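
From the XYcor matrix above, the correlation between ydline and distance is 0.988, i.e. they are almost perfectly correlated. matcor() only reports the correlation matrices; the canonical correlations themselves can be obtained with cc() from the same CCA package (a sketch on complete cases, since cc() does not accept missing values):

complete <- complete.cases(X) & complete.cases(Y)
cca_res <- cc(as.matrix(X[complete, ]), as.matrix(Y[complete, ]))
cca_res$cor  # the canonical correlations between the X and Y sets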

4d) Use K-means clustering analysis to identify the most important classes. How many classes do you select? Why?

mydata <- subset_data4
mydata2 <- na.omit(mydata)  # k-means cannot handle missing values

# Elbow method: total within-group sum of squares for k = 1..15
set.seed(123)  # k-means uses random starting centers
wss <- (nrow(mydata2) - 1) * sum(apply(mydata2, 2, var))  # WSS for k = 1
for (i in 2:15) wss[i] <- sum(kmeans(mydata2, centers = i)$withinss)
plot(1:15, wss, type = "b", xlab = "Number of Clusters",
     ylab = "Within groups sum of squares")

Based on the elbow in the within-group sum of squares curve, four clusters is a reasonable choice: beyond k = 4 the additional reduction in WSS is marginal.
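
A final fit with the chosen k (a sketch, using nstart for stable centers) could be:

set.seed(123)
final_km <- kmeans(mydata2, centers = 4, nstart = 25)  # 25 random starts for stability
table(final_km$cluster)                                # size of each of the four clusters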