A <- matrix(c(0, 1/2, 1/2, 0, 0, 0,
1/6, 1/6, 1/6, 1/6, 1/6, 1/6,
1/3, 1/3, 0, 0, 1/3, 0,
0, 0, 0, 0, 1/2, 1/2,
0, 0, 0, 1/2, 0, 1/2,
0, 0, 0, 1, 0, 0), nrow=6)
B <- .85*A + .15/6
B
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0.025 0.1666667 0.3083333 0.025 0.025 0.025
## [2,] 0.450 0.1666667 0.3083333 0.025 0.025 0.025
## [3,] 0.450 0.1666667 0.0250000 0.025 0.025 0.025
## [4,] 0.025 0.1666667 0.0250000 0.025 0.450 0.875
## [5,] 0.025 0.1666667 0.3083333 0.450 0.025 0.025
## [6,] 0.025 0.1666667 0.0250000 0.450 0.450 0.025
r <- matrix(c(1/6,1/6,1/6,1/6,1/6,1/6),nrow=6)
matrix.power(B,100) %*% r
## [,1]
## [1,] 0.05170475
## [2,] 0.07367926
## [3,] 0.05741241
## [4,] 0.34870369
## [5,] 0.19990381
## [6,] 0.26859608
eig_decomp <- eigen(B)
eig_decomp
## eigen() decomposition
## $values
## [1] 1.00000000+0i 0.57619235+0i -0.42500000+0i -0.42500000-0i -0.34991524+0i
## [6] -0.08461044+0i
##
## $vectors
## [,1] [,2] [,3]
## [1,] 0.1044385+0i 0.2931457+0i 2.945054e-15+5.507002e-22i
## [2,] 0.1488249+0i 0.5093703+0i -1.223015e-15-0.000000e+00i
## [3,] 0.1159674+0i 0.3414619+0i -2.241513e-15-6.032865e-22i
## [4,] 0.7043472+0i -0.5890805+0i -7.071068e-01+0.000000e+00i
## [5,] 0.4037861+0i -0.1413606+0i 7.071068e-01+0.000000e+00i
## [6,] 0.5425377+0i -0.4135367+0i 0.000000e+00-2.145851e-08i
## [,4] [,5] [,6]
## [1,] 2.945054e-15-5.507002e-22i -0.06471710+0i -0.212296003+0i
## [2,] -1.223015e-15+0.000000e+00i 0.01388698+0i 0.854071294+0i
## [3,] -2.241513e-15+6.032865e-22i 0.07298180+0i -0.363638739+0i
## [4,] -7.071068e-01+0.000000e+00i -0.66058664+0i 0.018399984+0i
## [5,] 7.071068e-01-0.000000e+00i 0.73761812+0i -0.304719509+0i
## [6,] 0.000000e+00+2.145851e-08i -0.09918316+0i 0.008182973+0i
graph <- graph.adjacency(t(A), weighted = TRUE, mode = "directed")
page.rank(graph)
## $vector
## [1] 0.05170475 0.07367926 0.05741241 0.34870369 0.19990381 0.26859608
##
## $value
## [1] 1
##
## $options
## NULL
trainraw<-read.csv("C:/Users/John Ledesma/Desktop/train.csv/train.csv")
test <- read.csv("C:\\Users\\John Ledesma\\Desktop\\test.csv")
train <- trainraw[,c(2:785)]
#Plot Image Function
plotimage <- function(a){
image <-matrix(train[a,c(1:784)],nrow=28,ncol=28)
image <- apply(image,2,as.numeric)
image <- t(t(image))
image <- flipImage(image)
image(image)
}
#Plot first 10 Images
par(mfrow=c(3,4))
for (i in 1:10){
plotimage(i)
}
### P2 Q4
table(trainraw[,1])
##
## 0 1 2 3 4 5 6 7 8 9
## 4132 4684 4177 4351 4072 3795 4137 4401 4063 4188
trainraw$average <- apply(train,1,mean)
meanpixeltable <- data.frame(matrix(ncol=2, nrow= 10))
meanpixeltable[,1]<- c(0,1,2,3,4,5,6,7,8,9)
colnames(meanpixeltable)[1] <- "Number"
colnames(meanpixeltable)[2] <- "Mean Pixel Density"
for (i in 0:9){
meanpixeltable[i+1,2]<-mean(trainraw[trainraw$label == as.character(i), 'average'])
}
meanpixeltable
## Number Mean Pixel Density
## 1 0 44.17399
## 2 1 19.37304
## 3 2 38.10089
## 4 3 36.12269
## 5 4 30.90908
## 6 5 32.95398
## 7 6 35.37617
## 8 7 29.24904
## 9 8 38.50019
## 10 9 31.31856
The mean pixel intensity tells us the “darkness” of the image. Higher numbers indicate more shaded in pixels.
prinComp_train<-princomp(train)
sd<-prinComp_train$sdev
cum_var <- cumsum(sd^2)/sum(sd^2)
plot(cum_var)
which.max(cum_var)
## Comp.708
## 708
100% of the variance is captured at the 705th component. There are a possible 784 components as there are 784 variables that are available.
tr <- train
tr<-cov(tr/255)
pca <- prcomp(tr)
par(mfrow=c(3,4))
for (i in 1:10) {
image(array(pca$x[,i], dim=c(28,28)))
}
eights <- filter(trainraw, trainraw$label == 8)
eights <- cov(eights/255)
eights_pca <- prcomp(eights)
par(mfrow=c(3,4))
for (i in 1:10) {
image(array(eights_pca$x[,i], dim=c(28,28)))
}
There are clearly defined eights with varying levels of darkness.
MN_model <- nnet::multinom(label~., data = trainraw, MaxNWts = 42000)
## # weights: 7870 (7074 variable)
## initial value 96708.573906
## iter 10 value 27932.928742
## iter 20 value 22897.023793
## iter 30 value 21671.757612
## iter 40 value 21248.433597
## iter 50 value 21070.878338
## iter 60 value 20985.749008
## iter 70 value 20904.370367
## iter 80 value 20840.141748
## iter 90 value 20767.634400
## iter 100 value 20700.473515
## final value 20700.473515
## stopped after 100 iterations
pred1 <- predict(MN_model, trainraw, type = 'class')
ActVSPred_DF <- as.data.frame(pred1)
ActVSPred_DF$actual<-trainraw$label
table(ActVSPred_DF$pred1,ActVSPred_DF$actual)
##
## 0 1 2 3 4 5 6 7 8 9
## 0 3961 0 11 5 10 24 45 6 18 13
## 1 25 4606 153 88 88 83 57 111 384 84
## 2 7 13 3644 88 36 16 64 39 26 9
## 3 8 15 77 3786 13 92 3 9 67 56
## 4 9 3 25 3 3448 12 18 15 20 63
## 5 24 14 23 176 46 3286 92 12 168 52
## 6 10 1 13 5 28 42 3768 3 14 0
## 7 36 11 106 81 26 47 18 4047 41 190
## 8 38 20 104 88 52 139 42 7 3257 23
## 9 14 1 21 31 325 54 30 152 68 3698
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
houseData <-read.csv("C:\\Users\\John Ledesma\\Downloads\\train.csv")
houseTestData <- read.csv("C:\\Users\\John Ledesma\\Downloads\\test.csv")
hist(houseData$SalePrice)
hist(houseData$OverallCond)
hist(houseData$OverallQual)
pairs(~SalePrice+YearBuilt+TotalBsmtSF, data = houseData)
HouseData1 <- (houseData[,c(1:10)])
corrmat <- (houseData[,c(18,5,81)])
corrmat1 <- cor(corrmat)
corrmat1
## OverallQual LotArea SalePrice
## OverallQual 1.0000000 0.1058057 0.7909816
## LotArea 0.1058057 1.0000000 0.2638434
## SalePrice 0.7909816 0.2638434 1.0000000
cor.test(houseData$OverallQual,houseData$LotArea, conf.level = 0.8)
##
## Pearson's product-moment correlation
##
## data: houseData$OverallQual and houseData$LotArea
## t = 4.0629, df = 1458, p-value = 5.106e-05
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.07250156 0.13887424
## sample estimates:
## cor
## 0.1058057
cor.test(houseData$SalePrice,houseData$TotalBsmtSF, conf.level = 0.8)
##
## Pearson's product-moment correlation
##
## data: houseData$SalePrice and houseData$TotalBsmtSF
## t = 29.671, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.5922142 0.6340846
## sample estimates:
## cor
## 0.6135806
cor.test(houseData$LotArea,houseData$SalePrice, conf.level = 0.8)
##
## Pearson's product-moment correlation
##
## data: houseData$LotArea and houseData$SalePrice
## t = 10.445, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.2323391 0.2947946
## sample estimates:
## cor
## 0.2638434
precisionMatrix <- inv(corrmat1)
precisionMatrix
##
## [1,] 2.7550503 0.3046752 -2.2595806
## [2,] 0.3046752 1.1085153 -0.5334669
## [3,] -2.2595806 -0.5334669 2.9280384
precisionMatrix %*% corrmat1
## OverallQual LotArea SalePrice
## [1,] 1.000000e+00 -6.625888e-10 3.683348e-09
## [2,] -5.127806e-09 1.000000e+00 -5.985858e-09
## [3,] 2.320795e-09 -3.950092e-09 1.000000e+00
lu.decomposition(corrmat1)
## $L
## [,1] [,2] [,3]
## [1,] 1.0000000 0.0000000 0
## [2,] 0.1058057 1.0000000 0
## [3,] 0.7909816 0.1821926 1
##
## $U
## [,1] [,2] [,3]
## [1,] 1 0.1058057 0.7909816
## [2,] 0 0.9888051 0.1801530
## [3,] 0 0.0000000 0.3415256
lu.decomposition(precisionMatrix)
## $L
## [,1] [,2] [,3]
## [1,] 1.0000000 0.0000000 0
## [2,] 0.1105879 1.0000000 0
## [3,] -0.8201595 -0.2638434 1
##
## $U
## [,1] [,2] [,3]
## [1,] 2.75505 0.3046752 -2.2595806
## [2,] 0.00000 1.0748219 -0.2835846
## [3,] 0.00000 0.0000000 1.0000000
library(MASS)
unloadNamespace('matlib')
hist(houseData$BsmtFinSF1)
expfitdistr<-fitdistr(houseData$BsmtFinSF1, densfun = 'exponential')
sample_pdf<- rexp(1000,expfitdistr[[1]])
hist(sample_pdf)
#sample_cdf<-ecdf(expfitdistr)
#plot(sample_cdf)
qexp(.95, rate = expfitdistr[[1]])
## [1] 1329.026
qexp(.05, rate = expfitdistr[[1]])
## [1] 22.75574
for (i in 1:80){
if (typeof(houseData[,i]) == "integer") {
houseData[,i][is.na(houseData[,i])] <- mean(houseData[,i],na.rm=TRUE)
}
if (typeof(houseData[,i]) == "character") {
houseData[,i][is.na(houseData[,i])] <- names(which.max(table(houseData$MSZoning)))
}
}
for (i in 1:80){
if (typeof(houseTestData[,i]) == "integer") {
houseTestData[,i][is.na(houseTestData[,i])] <- mean(houseTestData[,i],na.rm=TRUE)
}
if (typeof(houseTestData[,i]) == "character") {
houseTestData[,i][is.na(houseTestData[,i])] <- names(which.max(table(houseTestData$MSZoning)))
}
}
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(caTools)
rf <- randomForest(SalePrice ~., data = houseData, importance = TRUE, na.action=na.omit)
print(rf)
##
## Call:
## randomForest(formula = SalePrice ~ ., data = houseData, importance = TRUE, na.action = na.omit)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 26
##
## Mean of squared residuals: 830603828
## % Var explained: 86.83
round(importance(rf),2)
## %IncMSE IncNodePurity
## Id -1.35 3.393446e+10
## MSSubClass 11.72 2.282385e+10
## MSZoning 12.29 1.831691e+10
## LotFrontage 2.73 6.890800e+10
## LotArea 11.06 1.579514e+11
## Street -1.49 2.846711e+08
## Alley 3.42 3.364694e+09
## LotShape 2.70 1.162938e+10
## LandContour 3.22 1.722370e+10
## Utilities 0.00 6.589722e+06
## LotConfig 0.73 8.682292e+09
## LandSlope 3.77 7.707386e+09
## Neighborhood 11.32 5.691470e+10
## Condition1 1.94 5.455040e+09
## Condition2 -1.57 1.913344e+09
## BldgType 7.31 7.221290e+09
## HouseStyle 3.95 1.308405e+10
## OverallQual 26.30 2.254191e+12
## OverallCond 10.68 3.383604e+10
## YearBuilt 12.93 3.530724e+11
## YearRemodAdd 9.36 1.083801e+11
## RoofStyle 1.21 9.917513e+09
## RoofMatl -1.57 1.096999e+10
## Exterior1st 5.44 2.313752e+10
## Exterior2nd 3.50 1.911336e+10
## MasVnrType 1.75 1.037130e+10
## MasVnrArea 5.78 7.997824e+10
## ExterQual 11.51 5.720840e+11
## ExterCond 1.86 5.325032e+09
## Foundation 4.94 7.105335e+09
## BsmtQual 8.04 2.766947e+11
## BsmtCond 1.52 3.996489e+09
## BsmtExposure 3.80 1.773269e+10
## BsmtFinType1 11.02 2.436534e+10
## BsmtFinSF1 13.17 2.194429e+11
## BsmtFinType2 -0.04 4.534027e+09
## BsmtFinSF2 1.19 6.648553e+09
## BsmtUnfSF 6.90 5.531877e+10
## TotalBsmtSF 17.60 4.578960e+11
## Heating 0.30 1.514670e+09
## HeatingQC 5.17 9.374696e+09
## CentralAir 11.29 2.092308e+10
## Electrical 1.80 1.913857e+09
## X1stFlrSF 18.47 3.296222e+11
## X2ndFlrSF 15.91 2.444642e+11
## LowQualFinSF 1.50 1.752478e+09
## GrLivArea 34.23 1.078665e+12
## BsmtFullBath 7.53 1.756785e+10
## BsmtHalfBath 2.24 8.622298e+09
## FullBath 9.89 2.112542e+11
## HalfBath 8.88 1.184166e+10
## BedroomAbvGr 6.41 3.074772e+10
## KitchenAbvGr 4.81 6.551235e+09
## KitchenQual 7.44 1.516073e+11
## TotRmsAbvGrd 4.51 9.815129e+10
## Functional 4.15 5.354783e+09
## Fireplaces 10.34 8.121757e+10
## FireplaceQu 3.42 1.351328e+10
## GarageType 13.48 8.446160e+10
## GarageYrBlt 7.15 8.633559e+10
## GarageFinish 9.75 6.662426e+10
## GarageCars 15.44 1.043045e+12
## GarageArea 13.63 3.379889e+11
## GarageQual 4.51 8.188648e+09
## GarageCond 3.72 6.282434e+09
## PavedDrive 2.36 4.607520e+09
## WoodDeckSF 5.55 4.393990e+10
## OpenPorchSF 6.58 4.708913e+10
## EnclosedPorch 0.41 7.604939e+09
## X3SsnPorch 1.07 1.855209e+09
## ScreenPorch -0.96 1.190769e+10
## PoolArea -0.26 9.376911e+09
## PoolQC -1.42 1.225174e+10
## Fence 2.21 6.096321e+09
## MiscFeature -2.49 7.608679e+08
## MiscVal 0.35 7.853535e+08
## MoSold -1.02 3.262937e+10
## YrSold 0.45 1.326004e+10
## SaleType 1.66 8.392796e+09
## SaleCondition 0.96 1.436121e+10
prediction<-predict(rf, newdata = houseTestData)
pred <- as.data.frame(prediction)
pred$Id <- houseTestData$Id
names(pred)[2]<-"Id"
names(pred)[1]<-"SalePrice"
pred<- pred[c('Id','SalePrice')]
write.csv(pred,'prediction.csv')
kaggleScore