### Question 1: Using the built-in dataset ‘airquality’,
a) explore the general features of the dataset using appropriate R functions.
require(ggplot2)
## Loading required package: ggplot2
require(moments)
## Loading required package: moments
aq <- data.frame(airquality)
str(aq)
## 'data.frame': 153 obs. of 6 variables:
## $ Ozone : int 41 36 12 18 NA 28 23 19 8 NA ...
## $ Solar.R: int 190 118 149 313 NA NA 299 99 19 194 ...
## $ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
## $ Temp : int 67 72 74 62 56 66 65 59 61 69 ...
## $ Month : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Day : int 1 2 3 4 5 6 7 8 9 10 ...
# treat Day and Month as ordered factors; label Month with month abbreviations
aq$Day <- factor(aq$Day, levels = 1:31, ordered = TRUE)
aq$Month <- factor(aq$Month, levels = 5:9, labels = month.abb[5:9], ordered = TRUE)
summary(aq)
## Ozone Solar.R Wind Temp
## Min. : 1.00 Min. : 7.0 Min. : 1.700 Min. :56.00
## 1st Qu.: 18.00 1st Qu.:115.8 1st Qu.: 7.400 1st Qu.:72.00
## Median : 31.50 Median :205.0 Median : 9.700 Median :79.00
## Mean : 42.13 Mean :185.9 Mean : 9.958 Mean :77.88
## 3rd Qu.: 63.25 3rd Qu.:258.8 3rd Qu.:11.500 3rd Qu.:85.00
## Max. :168.00 Max. :334.0 Max. :20.700 Max. :97.00
## NA's :37 NA's :7
## Month Day
## May:31 1 : 5
## Jun:30 2 : 5
## Jul:31 3 : 5
## Aug:31 4 : 5
## Sep:30 5 : 5
## 6 : 5
## (Other):123
colSums(is.na(aq))
## Ozone Solar.R Wind Temp Month Day
## 37 7 0 0 0 0
The Ozone and Solar.R columns contain missing values (37 and 7 NAs respectively).
aq_new <- na.omit(aq)  # drop every row containing at least one NA
colSums(is.na(aq_new))
## Ozone Solar.R Wind Temp Month Day
## 0 0 0 0 0 0
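Dropping incomplete rows is the simplest fix. As an alternative, a minimal mean-imputation sketch is shown below (an illustration only, not part of the original solution; note that mean imputation shrinks the variance):

# fill the NAs in Ozone and Solar.R with their column means instead of dropping rows
aq_imp <- aq
aq_imp$Ozone[is.na(aq_imp$Ozone)] <- mean(aq$Ozone, na.rm = TRUE)
aq_imp$Solar.R[is.na(aq_imp$Solar.R)] <- mean(aq$Solar.R, na.rm = TRUE)
colSums(is.na(aq_imp))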
summary(aq_new$Temp)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 57.00 71.00 79.00 77.79 84.50 97.00
ggplot(data=aq_new) + geom_histogram(mapping=aes(Temp))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
skewness(aq_new$Temp)
## [1] -0.2250959
The skewness is -0.225, so Temp is slightly left-skewed, consistent with the histogram.
kurtosis(aq_new$Temp)
## [1] 2.331955
The kurtosis is 2.33, slightly below the value of 3 for a normal distribution, so Temp is close to normal but mildly platykurtic.
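As a supplementary check (not part of the original output), a Shapiro-Wilk test can corroborate how close Temp is to normality:

# Shapiro-Wilk normality test; a small p-value indicates departure from normality
shapiro.test(aq_new$Temp)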
qplot(Month,Wind, data=aq_new, geom="boxplot", color=Month)
Inference: the boxplots show outliers in Wind for the months of May and June.
qplot(Day,Wind, data=aq_new, geom="boxplot", color=Day)
There are outliers in Wind on days 1, 8, 17, 18, 19 and 20.
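To read the flagged values off numerically rather than from the plot (a supplementary check), boxplot.stats() applies the same 1.5 * IQR rule:

# Wind values flagged as outliers by the boxplot rule, over the whole cleaned data
boxplot.stats(aq_new$Wind)$out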
### Question 2: Use the dataset available at http://users.stat.ufl.edu/~winner/data/nfl2008_fga.csv.
a) Train the model using 80% of this dataset and suggest an appropriate GLM to model homekick on the togo, ydline and kicker variables.
Reading the base data
base_data<-read.csv('D:/Freelancer_questions/Shivm/nfl2008_fga.csv')
subset_data <- base_data[, c("homekick", "togo", "ydline", "kicker")]
subset_data$homekick <- as.factor(subset_data$homekick)  # binary response for logistic regression
Splitting the dataset into training and testing sets. Note that the code below uses p = 0.75 (a 75/25 split) rather than the 80/20 split the question asks for; all fitted output reflects the 75% split, and setting p = 0.80 would match the question exactly.
library(caret) #this package has the createDataPartition function
## Loading required package: lattice
set.seed(123) # make the random partition reproducible
#creating indices
trainIndex <- createDataPartition(subset_data$homekick,p=0.75,list=FALSE)
#splitting data into training/testing data using the trainIndex object
TRAIN <- subset_data[trainIndex,] #training data (75% of data)
TEST <- subset_data[-trainIndex,] #testing data (25% of data)
# logistic regression: a binomial GLM is appropriate for the binary homekick response
model1 <- glm(homekick ~ togo + ydline + kicker, data = TRAIN, family = binomial)
summary(model1)
##
## Call:
## glm(formula = homekick ~ togo + ydline + kicker, family = binomial,
## data = TRAIN)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3732 -1.1663 -0.9302 1.1742 1.5095
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.329492 0.219198 1.503 0.1328
## togo -0.043578 0.018534 -2.351 0.0187 *
## ydline 0.005982 0.007810 0.766 0.4438
## kicker -0.008370 0.006427 -1.302 0.1928
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1078.4 on 777 degrees of freedom
## Residual deviance: 1071.1 on 774 degrees of freedom
## (2 observations deleted due to missingness)
## AIC: 1079.1
##
## Number of Fisher Scoring iterations: 4
Only togo is statistically significant at the 5% level, so ydline and kicker can be dropped.
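A supplementary check (not in the original output): likelihood-ratio tests for deleting each term from model1 can be obtained with drop1().

# likelihood-ratio test for each single-term deletion from the full model
drop1(model1, test = "LRT")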
model2 <- glm(homekick ~ togo, data = TRAIN, family = binomial)
summary(model2)
##
## Call:
## glm(formula = homekick ~ togo, family = binomial, data = TRAIN)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2659 -1.1657 -0.9685 1.1726 1.4453
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.24452 0.14107 1.733 0.0830 .
## togo -0.03888 0.01751 -2.220 0.0264 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1078.4 on 777 degrees of freedom
## Residual deviance: 1073.4 on 776 degrees of freedom
## (2 observations deleted due to missingness)
## AIC: 1077.4
##
## Number of Fisher Scoring iterations: 3
predicted_probability <- predict(model2, TEST, type = "response")
pred <- ifelse(predicted_probability > 0.5, 1, 0)  # classify at the 0.5 threshold
confusionMatrix(as.factor(pred), as.factor(TEST$homekick))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 48 50
## 1 83 78
##
## Accuracy : 0.4865
## 95% CI : (0.4241, 0.5491)
## No Information Rate : 0.5058
## P-Value [Acc > NIR] : 0.752874
##
## Kappa : -0.0241
## Mcnemar's Test P-Value : 0.005524
##
## Sensitivity : 0.3664
## Specificity : 0.6094
## Pos Pred Value : 0.4898
## Neg Pred Value : 0.4845
## Prevalence : 0.5058
## Detection Rate : 0.1853
## Detection Prevalence : 0.3784
## Balanced Accuracy : 0.4879
##
## 'Positive' Class : 0
##
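The test-set accuracy of 0.487 is below the no-information rate of 0.506, so the model does little better than chance. As a threshold-free supplement (assuming the pROC package is installed; not part of the original solution), the AUC can be computed:

library(pROC)  # assumption: pROC is available
roc_obj <- roc(TEST$homekick, predicted_probability)
auc(roc_obj)   # area under the ROC curve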
### Question 3: Using the Yahoo Finance API, select a specific stock's market price and apply time series analysis, with the close price as the time series variable.
Fetching the historical prices for the PBR ticker (Petrobras):
library(quantmod)
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: TTR
## Version 0.4-0 included new data defaults. See ?getSymbols.
pbr <- getSymbols("PBR", src = "yahoo", from = "2013-01-01", to = "2017-06-01", auto.assign = FALSE)
## 'getSymbols' currently uses auto.assign=TRUE by default, but will
## use auto.assign=FALSE in 0.5-0. You will still be able to use
## 'loadSymbols' to automatically load data. getOption("getSymbols.env")
## and getOption("getSymbols.auto.assign") will still be checked for
## alternate defaults.
##
## This message is shown once per session and may be disabled by setting
## options("getSymbols.warning4.0"=FALSE). See ?getSymbols for details.
##
## WARNING: There have been significant changes to Yahoo Finance data.
## Please see the Warning section of '?getSymbols.yahoo' for details.
##
## This message is shown once per session and may be disabled by setting
## options("getSymbols.yahoo.warning"=FALSE).
The close price (PBR.Close) is the time series variable.
chartSeries(pbr,theme="white")
library(forecast)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(tseries)
library(fpp)
## Loading required package: fma
## Loading required package: expsmooth
## Loading required package: lmtest
library(ggplot2)
Date <- index(pbr)
# note: xts objects are numeric matrices, so assigning a character column
# coerces it to NA (hence the warning below); the column is not used again
pbr$date <- as.character(Date)
## Warning in merge.xts(..., all = all, fill = fill, suffixes = suffixes): NAs
## introduced by coercion
# Variable to be modelled: the daily close price as a ts object
# (note: as.integer() truncates the close prices to whole dollars)
Count <- ts(as.integer(pbr$PBR.Close), start = c(2013), frequency = 365)
plot(Count)
modArima <- auto.arima(Count, D = 0)  # D = 0: no seasonal differencing
modArima
## Series: Count
## ARIMA(2,1,0)
##
## Coefficients:
## ar1 ar2
## -0.2478 -0.002
## s.e. 0.0300 0.030
##
## sigma^2 estimated as 0.2567: log likelihood=-819.36
## AIC=1644.72 AICc=1644.74 BIC=1659.75
The selected model is ARIMA(2,1,0): an AR order of 2, one difference (d = 1), and an MA order of 0.
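As a supplementary diagnostic (not shown in the original output), the forecast package can check whether the residuals behave like white noise:

# residual plots plus a Ljung-Box test for remaining autocorrelation
checkresiduals(modArima)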
Forecasted_values <- forecast(modArima, h = 10)  # forecast 10 steps ahead
Final_forecasted_values <- Forecasted_values$mean
# the original passed pbr$PBR.Close as the horizon argument of forecast(),
# which is not meaningful; reuse the 10-step forecast for plotting instead
fc <- Forecasted_values
autoplot(Count, series="Data") +
autolayer(fc, series="Forecast") +
autolayer(fitted(fc), series="Fitted")
### Question 4 a) Use LDA to classify the dataset into a few classes so that at least 90% of the information in the dataset is explained by the new classification. (Hint: model the variable “qtr” on the variables “togo”, “kicker”, and “ydline”.) How many LDs do you choose? Explain the reason.
subset_data2 <- base_data[, c("qtr", "togo", "ydline", "kicker")]
subset_data2$qtr <- as.factor(subset_data2$qtr)  # qtr is the class label for LDA
library(MASS)
##
## Attaching package: 'MASS'
## The following objects are masked from 'package:fma':
##
## cement, housing, petrol
library(caret)
lda.model <- lda(qtr ~ togo + ydline + kicker, data = subset_data2)
lda.model
## Call:
## lda(qtr ~ togo + ydline + kicker, data = subset_data2)
##
## Prior probabilities of groups:
## 1 2 3 4 5
## 0.20636451 0.35969142 0.17550627 0.24590164 0.01253616
##
## Group means:
## togo ydline kicker
## 1 6.481308 17.22897 19.64486
## 2 6.973190 19.30027 18.77212
## 3 6.543956 19.03297 19.96703
## 4 6.792157 18.53725 20.20000
## 5 5.923077 19.53846 22.61538
##
## Coefficients of linear discriminants:
## LD1 LD2 LD3
## togo 0.06665269 0.12498308 0.20996464
## ydline 0.07726467 -0.07173243 -0.02257770
## kicker -0.04134867 -0.06009657 0.05013225
##
## Proportion of trace:
## LD1 LD2 LD3
## 0.615 0.322 0.063
With five classes and three predictors, lda() returns min(k - 1, p) = 3 discriminants. From the proportion of trace, LD1 and LD2 together explain 0.615 + 0.322 = 93.7% of the between-class variance, so two LDs are sufficient to retain at least 90% of the information.
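The cumulative proportion can also be computed directly from the singular values stored in the lda fit (a supplementary check):

# proportion of trace per discriminant, and its cumulative sum
prop_trace <- lda.model$svd^2 / sum(lda.model$svd^2)
cumsum(prop_trace)  # LD1 + LD2 already exceed 0.90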
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
subset_data3<-base_data[,c("togo","ydline","kicker","distance","homekick","kickdiff")]
subset_data3<-na.omit(subset_data3)
res.pca <- prcomp(subset_data3, scale = TRUE)
fviz_eig(res.pca)
The scree plot shows that the cumulative variance explained reaches about 90% at the fifth principal component, so PC1 through PC5 would be retained.
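The exact cumulative percentages behind the scree plot can be read off directly (a supplementary check):

# cumulative proportion of variance explained by each principal component
cumsum(res.pca$sdev^2 / sum(res.pca$sdev^2))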
### 3. Split the dataset into two sets of variables so that X = (togo, kicker, ydline) and Y = (distance, homekick). Apply canonical correlation analysis to find the cross-correlation between X and Y. What is the correlation between ydline and distance?
subset_data4<-base_data[,c("togo","kicker","ydline","distance","homekick")]
X<-base_data[,c("togo","kicker","ydline")]
Y <-base_data[,c("distance","homekick")]
library("CCA")
## Warning: package 'CCA' was built under R version 3.5.3
## Loading required package: fda
## Warning: package 'fda' was built under R version 3.5.3
## Loading required package: splines
## Loading required package: Matrix
##
## Attaching package: 'fda'
## The following object is masked from 'package:forecast':
##
## fourier
## The following object is masked from 'package:graphics':
##
## matplot
## Loading required package: fields
## Warning: package 'fields' was built under R version 3.5.3
## Loading required package: spam
## Warning: package 'spam' was built under R version 3.5.3
## Loading required package: dotCall64
## Warning: package 'dotCall64' was built under R version 3.5.3
## Loading required package: grid
## Spam version 2.5-1 (2019-12-12) is loaded.
## Type 'help( Spam)' or 'demo( spam)' for a short introduction
## and overview of this package.
## Help for individual functions is also obtained by adding the
## suffix '.spam' to the function name, e.g. 'help( chol.spam)'.
##
## Attaching package: 'spam'
## The following object is masked from 'package:Matrix':
##
## det
## The following objects are masked from 'package:base':
##
## backsolve, forwardsolve
## Loading required package: maps
## Warning: package 'maps' was built under R version 3.5.1
##
## Attaching package: 'maps'
## The following object is masked from 'package:fma':
##
## ozone
## See https://github.com/NCAR/Fields for
## an extensive vignette, other supplements and source code
correl <- matcor(X, Y )
img.matcor(correl, type = 2)
correl
## $Xcor
## togo kicker ydline
## togo 1.00000000 -0.01387536 0.31449850
## kicker -0.01387536 1.00000000 -0.01737314
## ydline 0.31449850 -0.01737314 1.00000000
##
## $Ycor
## distance homekick
## distance 1.00000000 0.04921831
## homekick 0.04921831 1.00000000
##
## $XYcor
## togo kicker ydline distance homekick
## togo 1.00000000 -0.013875355 0.31449850 0.315641454 -0.04838438
## kicker -0.01387536 1.000000000 -0.01737314 -0.008798867 -0.02573680
## ydline 0.31449850 -0.017373137 1.00000000 0.988479497 0.05373301
## distance 0.31564145 -0.008798867 0.98847950 1.000000000 0.04921831
## homekick -0.04838438 -0.025736803 0.05373301 0.049218308 1.00000000
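From the XYcor block above, the correlation between ydline and distance is about 0.988, i.e. the two variables are almost collinear. The canonical correlations themselves can be obtained with cc() from the CCA package (a supplementary step, assuming complete cases are used):

# canonical correlation analysis proper; cc() requires complete cases
cc_data <- na.omit(subset_data4)
cca_res <- cc(cc_data[, c("togo", "kicker", "ydline")],
              cc_data[, c("distance", "homekick")])
cca_res$cor  # canonical correlations between the X and Y variates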
# Elbow method: total within-group sum of squares for k = 1, ..., 15 clusters
mydata <- subset_data4
mydata2 <- na.omit(mydata)
wss <- (nrow(mydata2) - 1) * sum(apply(mydata2, 2, var))  # k = 1 case
for (i in 2:15) wss[i] <- sum(kmeans(mydata2, centers = i)$withinss)
plot(1:15, wss, type = "b", xlab = "Number of Clusters",
     ylab = "Within groups sum of squares")
Based on the elbow curve, four clusters appear appropriate: the within-group sum of squares falls steeply up to k = 4 and flattens afterwards.
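A final model with four clusters could then be fitted (a minimal sketch; set.seed() and nstart are assumptions added for stability, not part of the original code):

set.seed(123)                                    # reproducible cluster assignment
km <- kmeans(mydata2, centers = 4, nstart = 25)  # 25 restarts to avoid poor local optima
table(km$cluster)                                # cluster sizes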