A <- matrix(c(0, 1/2, 1/2, 0, 0, 0,
             1/6, 1/6, 1/6, 1/6, 1/6, 1/6,
             1/3, 1/3, 0, 0, 1/3, 0,
             0, 0, 0, 0, 1/2, 1/2,
             0, 0, 0, 1/2, 0, 1/2,
             0, 0, 0, 1, 0, 0), nrow=6)

B <- .85*A + .15/6

B
##       [,1]      [,2]      [,3]  [,4]  [,5]  [,6]
## [1,] 0.025 0.1666667 0.3083333 0.025 0.025 0.025
## [2,] 0.450 0.1666667 0.3083333 0.025 0.025 0.025
## [3,] 0.450 0.1666667 0.0250000 0.025 0.025 0.025
## [4,] 0.025 0.1666667 0.0250000 0.025 0.450 0.875
## [5,] 0.025 0.1666667 0.3083333 0.450 0.025 0.025
## [6,] 0.025 0.1666667 0.0250000 0.450 0.450 0.025
r <- matrix(c(1/6,1/6,1/6,1/6,1/6,1/6),nrow=6)

matrix.power(B,100) %*% r
##            [,1]
## [1,] 0.05170475
## [2,] 0.07367926
## [3,] 0.05741241
## [4,] 0.34870369
## [5,] 0.19990381
## [6,] 0.26859608
eig_decomp <- eigen(B)
eig_decomp
## eigen() decomposition
## $values
## [1]  1.00000000+0i  0.57619235+0i -0.42500000+0i -0.42500000-0i -0.34991524+0i
## [6] -0.08461044+0i
## 
## $vectors
##              [,1]          [,2]                        [,3]
## [1,] 0.1044385+0i  0.2931457+0i  2.945054e-15+5.507002e-22i
## [2,] 0.1488249+0i  0.5093703+0i -1.223015e-15-0.000000e+00i
## [3,] 0.1159674+0i  0.3414619+0i -2.241513e-15-6.032865e-22i
## [4,] 0.7043472+0i -0.5890805+0i -7.071068e-01+0.000000e+00i
## [5,] 0.4037861+0i -0.1413606+0i  7.071068e-01+0.000000e+00i
## [6,] 0.5425377+0i -0.4135367+0i  0.000000e+00-2.145851e-08i
##                             [,4]           [,5]            [,6]
## [1,]  2.945054e-15-5.507002e-22i -0.06471710+0i -0.212296003+0i
## [2,] -1.223015e-15+0.000000e+00i  0.01388698+0i  0.854071294+0i
## [3,] -2.241513e-15+6.032865e-22i  0.07298180+0i -0.363638739+0i
## [4,] -7.071068e-01+0.000000e+00i -0.66058664+0i  0.018399984+0i
## [5,]  7.071068e-01-0.000000e+00i  0.73761812+0i -0.304719509+0i
## [6,]  0.000000e+00+2.145851e-08i -0.09918316+0i  0.008182973+0i

1 is clearly the largest eigenvalue though the corresponding eigenvector is not the same as expected.

graph <- graph.adjacency(t(A), weighted = TRUE, mode = "directed")

page.rank(graph)
## $vector
## [1] 0.05170475 0.07367926 0.05741241 0.34870369 0.19990381 0.26859608
## 
## $value
## [1] 1
## 
## $options
## NULL

The pageRank vector is the same as the above vectors sans the vector calculated through eigenvalue method.

Problem 2

trainraw<-read.csv("C:/Users/John Ledesma/Desktop/train.csv/train.csv")
test <- read.csv("C:\\Users\\John Ledesma\\Desktop\\test.csv")
train <- trainraw[,c(2:785)]

#Plot Image Function
plotimage <- function(a){
 
  image <-matrix(train[a,c(1:784)],nrow=28,ncol=28)
  image <- apply(image,2,as.numeric)
  image <- t(t(image))
  image <- flipImage(image)
  image(image)
  
}

#Plot first 10 Images
par(mfrow=c(3,4))
for (i in 1:10){

  plotimage(i)
}

### P2 Q4

table(trainraw[,1])
## 
##    0    1    2    3    4    5    6    7    8    9 
## 4132 4684 4177 4351 4072 3795 4137 4401 4063 4188

P2 Q5

trainraw$average <- apply(train,1,mean)

meanpixeltable <- data.frame(matrix(ncol=2, nrow= 10))

meanpixeltable[,1]<- c(0,1,2,3,4,5,6,7,8,9)
colnames(meanpixeltable)[1] <- "Number"
colnames(meanpixeltable)[2] <- "Mean Pixel Density"


for (i in 0:9){
  meanpixeltable[i+1,2]<-mean(trainraw[trainraw$label == as.character(i), 'average'])
}

meanpixeltable
##    Number Mean Pixel Density
## 1       0           44.17399
## 2       1           19.37304
## 3       2           38.10089
## 4       3           36.12269
## 5       4           30.90908
## 6       5           32.95398
## 7       6           35.37617
## 8       7           29.24904
## 9       8           38.50019
## 10      9           31.31856

The mean pixel intensity tells us the “darkness” of the image. Higher numbers indicate more shaded in pixels.

P2 Q6

prinComp_train<-princomp(train)
sd<-prinComp_train$sdev
cum_var <- cumsum(sd^2)/sum(sd^2)


plot(cum_var)

which.max(cum_var)
## Comp.708 
##      708

100% of the variance is captured at the 705th component. There are a possible 784 components as there are 784 variables that are available.

P2 Q7

tr <- train
tr<-cov(tr/255)
pca <- prcomp(tr)

par(mfrow=c(3,4))
for (i in 1:10) {
  image(array(pca$x[,i], dim=c(28,28)))
}

P2 Q8

eights <- filter(trainraw, trainraw$label == 8)
eights <- cov(eights/255)
eights_pca <- prcomp(eights)


par(mfrow=c(3,4))
for (i in 1:10) {
  image(array(eights_pca$x[,i], dim=c(28,28)))
}

There are clearly defined eights with varying levels of darkness.

P2 Q9

MN_model <- nnet::multinom(label~., data = trainraw, MaxNWts = 42000)
## # weights:  7870 (7074 variable)
## initial  value 96708.573906 
## iter  10 value 27932.928742
## iter  20 value 22897.023793
## iter  30 value 21671.757612
## iter  40 value 21248.433597
## iter  50 value 21070.878338
## iter  60 value 20985.749008
## iter  70 value 20904.370367
## iter  80 value 20840.141748
## iter  90 value 20767.634400
## iter 100 value 20700.473515
## final  value 20700.473515 
## stopped after 100 iterations
pred1 <- predict(MN_model, trainraw, type = 'class')
ActVSPred_DF <- as.data.frame(pred1)
ActVSPred_DF$actual<-trainraw$label

table(ActVSPred_DF$pred1,ActVSPred_DF$actual)
##    
##        0    1    2    3    4    5    6    7    8    9
##   0 3961    0   11    5   10   24   45    6   18   13
##   1   25 4606  153   88   88   83   57  111  384   84
##   2    7   13 3644   88   36   16   64   39   26    9
##   3    8   15   77 3786   13   92    3    9   67   56
##   4    9    3   25    3 3448   12   18   15   20   63
##   5   24   14   23  176   46 3286   92   12  168   52
##   6   10    1   13    5   28   42 3768    3   14    0
##   7   36   11  106   81   26   47   18 4047   41  190
##   8   38   20  104   88   52  139   42    7 3257   23
##   9   14    1   21   31  325   54   30  152   68 3698

Question 3

library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
houseData <-read.csv("C:\\Users\\John Ledesma\\Downloads\\train.csv")
houseTestData <- read.csv("C:\\Users\\John Ledesma\\Downloads\\test.csv")

P3 Descriptive and Inferential Statistics

hist(houseData$SalePrice)

hist(houseData$OverallCond)

hist(houseData$OverallQual)

pairs(~SalePrice+YearBuilt+TotalBsmtSF, data = houseData)

HouseData1 <- (houseData[,c(1:10)])
corrmat <- (houseData[,c(18,5,81)])
corrmat1 <- cor(corrmat)
corrmat1
##             OverallQual   LotArea SalePrice
## OverallQual   1.0000000 0.1058057 0.7909816
## LotArea       0.1058057 1.0000000 0.2638434
## SalePrice     0.7909816 0.2638434 1.0000000
  cor.test(houseData$OverallQual,houseData$LotArea, conf.level = 0.8)
## 
##  Pearson's product-moment correlation
## 
## data:  houseData$OverallQual and houseData$LotArea
## t = 4.0629, df = 1458, p-value = 5.106e-05
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.07250156 0.13887424
## sample estimates:
##       cor 
## 0.1058057
  cor.test(houseData$SalePrice,houseData$TotalBsmtSF, conf.level = 0.8)
## 
##  Pearson's product-moment correlation
## 
## data:  houseData$SalePrice and houseData$TotalBsmtSF
## t = 29.671, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.5922142 0.6340846
## sample estimates:
##       cor 
## 0.6135806
  cor.test(houseData$LotArea,houseData$SalePrice, conf.level = 0.8)
## 
##  Pearson's product-moment correlation
## 
## data:  houseData$LotArea and houseData$SalePrice
## t = 10.445, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.2323391 0.2947946
## sample estimates:
##       cor 
## 0.2638434

P3 Linear ALgebra and Correlation

precisionMatrix <- inv(corrmat1)
precisionMatrix
##                                      
## [1,]  2.7550503  0.3046752 -2.2595806
## [2,]  0.3046752  1.1085153 -0.5334669
## [3,] -2.2595806 -0.5334669  2.9280384
precisionMatrix %*% corrmat1
##        OverallQual       LotArea     SalePrice
## [1,]  1.000000e+00 -6.625888e-10  3.683348e-09
## [2,] -5.127806e-09  1.000000e+00 -5.985858e-09
## [3,]  2.320795e-09 -3.950092e-09  1.000000e+00
lu.decomposition(corrmat1)
## $L
##           [,1]      [,2] [,3]
## [1,] 1.0000000 0.0000000    0
## [2,] 0.1058057 1.0000000    0
## [3,] 0.7909816 0.1821926    1
## 
## $U
##      [,1]      [,2]      [,3]
## [1,]    1 0.1058057 0.7909816
## [2,]    0 0.9888051 0.1801530
## [3,]    0 0.0000000 0.3415256
lu.decomposition(precisionMatrix)
## $L
##            [,1]       [,2] [,3]
## [1,]  1.0000000  0.0000000    0
## [2,]  0.1105879  1.0000000    0
## [3,] -0.8201595 -0.2638434    1
## 
## $U
##         [,1]      [,2]       [,3]
## [1,] 2.75505 0.3046752 -2.2595806
## [2,] 0.00000 1.0748219 -0.2835846
## [3,] 0.00000 0.0000000  1.0000000

P3 Calculus Based Probability and Statistics

library(MASS)
unloadNamespace('matlib')

hist(houseData$BsmtFinSF1)

expfitdistr<-fitdistr(houseData$BsmtFinSF1, densfun = 'exponential')
sample_pdf<- rexp(1000,expfitdistr[[1]])
hist(sample_pdf)

#sample_cdf<-ecdf(expfitdistr)
#plot(sample_cdf)
 
qexp(.95, rate = expfitdistr[[1]])
## [1] 1329.026
qexp(.05, rate = expfitdistr[[1]])
## [1] 22.75574

P3 Modeling

Cleaning Data by replacing NA values with the mean of the column for numeric values and the mode for character values.

for (i in 1:80){
  
  if (typeof(houseData[,i]) == "integer") {
    
   houseData[,i][is.na(houseData[,i])] <- mean(houseData[,i],na.rm=TRUE)
   
  }
  
    if (typeof(houseData[,i]) == "character") {
    
   houseData[,i][is.na(houseData[,i])] <- names(which.max(table(houseData$MSZoning)))
   
  }
  
}

for (i in 1:80){
  
  if (typeof(houseTestData[,i]) == "integer") {
    
   houseTestData[,i][is.na(houseTestData[,i])] <- mean(houseTestData[,i],na.rm=TRUE)
   
  }
  
    if (typeof(houseTestData[,i]) == "character") {
    
   houseTestData[,i][is.na(houseTestData[,i])] <- names(which.max(table(houseTestData$MSZoning)))
   
  }
  
}

Using Random Forest to create model

library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(caTools)
rf <- randomForest(SalePrice ~., data = houseData, importance = TRUE, na.action=na.omit)
print(rf)
## 
## Call:
##  randomForest(formula = SalePrice ~ ., data = houseData, importance = TRUE,      na.action = na.omit) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 26
## 
##           Mean of squared residuals: 830603828
##                     % Var explained: 86.83
round(importance(rf),2)
##               %IncMSE IncNodePurity
## Id              -1.35  3.393446e+10
## MSSubClass      11.72  2.282385e+10
## MSZoning        12.29  1.831691e+10
## LotFrontage      2.73  6.890800e+10
## LotArea         11.06  1.579514e+11
## Street          -1.49  2.846711e+08
## Alley            3.42  3.364694e+09
## LotShape         2.70  1.162938e+10
## LandContour      3.22  1.722370e+10
## Utilities        0.00  6.589722e+06
## LotConfig        0.73  8.682292e+09
## LandSlope        3.77  7.707386e+09
## Neighborhood    11.32  5.691470e+10
## Condition1       1.94  5.455040e+09
## Condition2      -1.57  1.913344e+09
## BldgType         7.31  7.221290e+09
## HouseStyle       3.95  1.308405e+10
## OverallQual     26.30  2.254191e+12
## OverallCond     10.68  3.383604e+10
## YearBuilt       12.93  3.530724e+11
## YearRemodAdd     9.36  1.083801e+11
## RoofStyle        1.21  9.917513e+09
## RoofMatl        -1.57  1.096999e+10
## Exterior1st      5.44  2.313752e+10
## Exterior2nd      3.50  1.911336e+10
## MasVnrType       1.75  1.037130e+10
## MasVnrArea       5.78  7.997824e+10
## ExterQual       11.51  5.720840e+11
## ExterCond        1.86  5.325032e+09
## Foundation       4.94  7.105335e+09
## BsmtQual         8.04  2.766947e+11
## BsmtCond         1.52  3.996489e+09
## BsmtExposure     3.80  1.773269e+10
## BsmtFinType1    11.02  2.436534e+10
## BsmtFinSF1      13.17  2.194429e+11
## BsmtFinType2    -0.04  4.534027e+09
## BsmtFinSF2       1.19  6.648553e+09
## BsmtUnfSF        6.90  5.531877e+10
## TotalBsmtSF     17.60  4.578960e+11
## Heating          0.30  1.514670e+09
## HeatingQC        5.17  9.374696e+09
## CentralAir      11.29  2.092308e+10
## Electrical       1.80  1.913857e+09
## X1stFlrSF       18.47  3.296222e+11
## X2ndFlrSF       15.91  2.444642e+11
## LowQualFinSF     1.50  1.752478e+09
## GrLivArea       34.23  1.078665e+12
## BsmtFullBath     7.53  1.756785e+10
## BsmtHalfBath     2.24  8.622298e+09
## FullBath         9.89  2.112542e+11
## HalfBath         8.88  1.184166e+10
## BedroomAbvGr     6.41  3.074772e+10
## KitchenAbvGr     4.81  6.551235e+09
## KitchenQual      7.44  1.516073e+11
## TotRmsAbvGrd     4.51  9.815129e+10
## Functional       4.15  5.354783e+09
## Fireplaces      10.34  8.121757e+10
## FireplaceQu      3.42  1.351328e+10
## GarageType      13.48  8.446160e+10
## GarageYrBlt      7.15  8.633559e+10
## GarageFinish     9.75  6.662426e+10
## GarageCars      15.44  1.043045e+12
## GarageArea      13.63  3.379889e+11
## GarageQual       4.51  8.188648e+09
## GarageCond       3.72  6.282434e+09
## PavedDrive       2.36  4.607520e+09
## WoodDeckSF       5.55  4.393990e+10
## OpenPorchSF      6.58  4.708913e+10
## EnclosedPorch    0.41  7.604939e+09
## X3SsnPorch       1.07  1.855209e+09
## ScreenPorch     -0.96  1.190769e+10
## PoolArea        -0.26  9.376911e+09
## PoolQC          -1.42  1.225174e+10
## Fence            2.21  6.096321e+09
## MiscFeature     -2.49  7.608679e+08
## MiscVal          0.35  7.853535e+08
## MoSold          -1.02  3.262937e+10
## YrSold           0.45  1.326004e+10
## SaleType         1.66  8.392796e+09
## SaleCondition    0.96  1.436121e+10
prediction<-predict(rf, newdata = houseTestData)
pred <- as.data.frame(prediction)
pred$Id <- houseTestData$Id
names(pred)[2]<-"Id"
names(pred)[1]<-"SalePrice"
pred<- pred[c('Id','SalePrice')]
write.csv(pred,'prediction.csv')

kaggleScore