
Source

https://sites.google.com/site/cme250winter2015/mini-project

The goal of the competition, from the Kaggle website, is to “predict your classmates’ age”. The data set profiled here is the training data set.

Data Fields

ID - an anonymous random ID unique to each sample; you should not use it to fit your model

Age - student’s age

Height - student’s height in inches

Siblings - number of siblings

Fbfriends - number of Facebook friends

Credits - number of enrolled credits this quarter

Contacts - number of contacts on phone

Exercise - number of times exercising per week

News - number of news articles read per week

ParentContact - number of times per month the student speaks with their parents

Texts - number of texts/mobile messages sent yesterday

Emails - number of emails received yesterday

Pets - pets the student has or has had

Twitter - does student have a twitter account?

Field - area of study/research

Origin - birthplace

Comments

There are five missing values in the data set. I do not want to simply delete those observations, because I have only a small number of observations to fit models with; every piece of information is precious to me. Instead, I will use linear regression to predict the missing values. There were no raw character columns; the categorical variables were already read in as factors. However, I found that the Field variable has 6 levels even though it can only take 5 possible values; the extra level comes from the missing value being coded as an empty string. Given the small amount of data, I will use the leave-one-out method to calculate all the MSEs.
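Because the data set is small, this leave-one-out pattern is used repeatedly below. A minimal sketch of the idea, written as a hypothetical helper (loo.mse is not part of the original code, and the formula is only an example):

## leave-one-out MSE for a model of Age specified by `formula` (hypothetical helper)
loo.mse <- function(formula, data) {
  pred <- rep(NA, nrow(data))
  for(i in 1:nrow(data))
  {
    fit <- lm(formula, data=data[-i,])           ## refit without observation i
    pred[i] <- predict(fit, newdata=data[i,])    ## predict the held-out observation
  }
  mean((pred-data$Age)^2)
}
## example: loo.mse(Age~Fbfriends+Credits+Contacts+News, data=train)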

##Load data into R 
setwd("C:/Users/CP Business 3/Desktop")
train <- read.csv("traindata.csv",header=T) 
test <- read.csv("testdata.csv", header=T)  
df <- as.data.frame(rbind(train[,-2],test[,-2])) 
##which lines include missing values
train[!complete.cases(train),] 
##    ID Age Height Siblings Fbfriends Credits Contacts Exercise News
## 55 78  30   74.5        1        NA       0      999        1   15
## 60  8  34     NA        1      1144      10     1294        1   20
## 64 62  22   66.0        0      1649      NA      516        7   50
## 65 75  28   67.3        2       840       0       NA        3    2
##    ParentContact Texts Emails    Pets Twitter
## 55             3    10     50 Neither     Yes
## 60            35    30     30     Dog     Yes
## 64             8   200     50 Neither     Yes
## 65             3     5     90 Neither     Yes
##                             Field                               Origin
## 55                    Engineering      Europe, Russia, or Central Asia
## 60 Social Sciences and Humanities  Caribbean, Central or South America
## 64                    Engineering                        USA or Canada
## 65                    Engineering South and West Asia and North Africa
## Change the level of Field  
levels(train$Field) 
## [1] ""                                                   
## [2] "Engineering"                                        
## [3] "Math or Statistics"                                 
## [4] "Medical or Life Sciences"                           
## [5] "Other Sciences (Physics, Chemistry, Earth Sciences)"
## [6] "Social Sciences and Humanities"
train$Field <- factor(train$Field, levels=c("Engineering","Math or Statistics","Medical or Life Sciences","Other Sciences (Physics, Chemistry, Earth Sciences)","Social Sciences and Humanities")) 

Estimating Missing Values

First, I plot the covariance graph to evaluate possible relationships between the covariates, and then build linear regression models to predict the missing values. It turned out that this method was not suitable for every missing value. When no regression is useful, I assign the mean of the variable to the missing entry instead, and if the missing value is categorical, I estimate it from the clustering result.

According to the above plot, I guess that Height is related to News, Texts, Emails, Field and Origin. I tried different models to see which one is meaningful for estimating Height; it turns out that the regression of Height on Field and Origin, together with all of its coefficients, is significant, so I estimate the missing Height from this linear model. I estimate two other missing values in the same way. When I try to estimate Fbfriends, none of the models is significant, so I simply assign the mean of that column to the missing value. I also guess that observation 62's major is Social Sciences and Humanities, based on the hierarchical clustering result.
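A condensed sketch of these imputation steps (the full code, with the models actually used, is in the Appendix; the row and column indices follow the missing rows printed above):

## regression imputation: predict the missing Height (row 60) from Field and Origin
height.fit <- lm(Height~Field+Origin, data=train)
train[60,"Height"] <- predict(height.fit, newdata=train[60,])
## mean imputation when no regression is significant (Fbfriends, row 55)
train[55,"Fbfriends"] <- mean(train$Fbfriends, na.rm=TRUE)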
Summary Statistics

summary(train)
##        ID            Age            Height         Siblings    
##  Min.   : 1.0   Min.   :18.00   Min.   :58.00   Min.   :0.000  
##  1st Qu.:26.0   1st Qu.:23.00   1st Qu.:65.00   1st Qu.:1.000  
##  Median :53.0   Median :24.00   Median :69.00   Median :1.000  
##  Mean   :50.4   Mean   :25.86   Mean   :68.13   Mean   :1.308  
##  3rd Qu.:73.0   3rd Qu.:27.00   3rd Qu.:71.00   3rd Qu.:2.000  
##  Max.   :95.0   Max.   :63.00   Max.   :75.00   Max.   :5.000  
##                                 NA's   :1                      
##    Fbfriends         Credits          Contacts         Exercise    
##  Min.   :   0.0   Min.   : 0.000   Min.   :  21.0   Min.   :1.000  
##  1st Qu.: 330.8   1st Qu.: 6.250   1st Qu.: 123.0   1st Qu.:1.000  
##  Median : 533.5   Median :10.000   Median : 239.5   Median :3.000  
##  Mean   : 618.7   Mean   : 9.938   Mean   : 362.0   Mean   :2.954  
##  3rd Qu.: 849.0   3rd Qu.:14.000   3rd Qu.: 453.8   3rd Qu.:4.000  
##  Max.   :1683.0   Max.   :34.000   Max.   :1543.0   Max.   :7.000  
##  NA's   :1        NA's   :1        NA's   :1                       
##       News        ParentContact        Texts            Emails      
##  Min.   :  0.00   Min.   : 0.000   Min.   :  0.00   Min.   :  5.00  
##  1st Qu.:  4.00   1st Qu.: 3.000   1st Qu.:  7.00   1st Qu.: 15.00  
##  Median : 10.00   Median : 4.000   Median : 15.00   Median : 21.00  
##  Mean   : 20.18   Mean   : 8.015   Mean   : 30.72   Mean   : 30.88  
##  3rd Qu.: 20.00   3rd Qu.:10.000   3rd Qu.: 40.00   3rd Qu.: 35.00  
##  Max.   :150.00   Max.   :67.000   Max.   :200.00   Max.   :140.00  
##                                                                     
##       Pets    Twitter 
##  Both   : 9   No :28  
##  Cat    : 3   Yes:37  
##  Dog    :21           
##  Neither:32           
##                       
##                       
##                       
##                                                  Field   
##  Engineering                                        :37  
##  Math or Statistics                                 : 4  
##  Medical or Life Sciences                           : 5  
##  Other Sciences (Physics, Chemistry, Earth Sciences):11  
##  Social Sciences and Humanities                     : 7  
##  NA's                                               : 1  
##                                                          
##                                   Origin  
##  Caribbean, Central or South America : 3  
##  East Asia or Southeast Asia         :12  
##  Europe, Russia, or Central Asia     : 9  
##  South and West Asia and North Africa: 9  
##  USA or Canada                       :32  
##                                           
## 

Graphical Model

plot(huge(x))
## Conducting Meinshausen & Buhlmann graph estimation (mb)....done

The above plot shows that when the sparsity is about 1%, the regularization parameter is around 0.399. In other words, among the three values of lambda shown, the overfitting problem is smallest when lambda equals 0.399.
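To read the sparsity level off the fit rather than the plot, the lambda path and the corresponding sparsity levels stored in the huge object can be printed directly (a minimal sketch; the object name and the 1% target are only illustrative):

library(huge)
out <- huge(x)   ## Meinshausen & Buhlmann estimation on the default lambda path
round(cbind(lambda=out$lambda, sparsity=out$sparsity), 3)
out$lambda[which.min(abs(out$sparsity-0.01))]   ## lambda closest to 1% sparsity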

x.npn=huge.npn(x,npn.func="truncation")
## Conducting nonparanormal (npn) transformation via truncated ECDF....done.
out.npn <-huge(x.npn,nlambda=40,lambda.min.ratio=0.4)
## Conducting Meinshausen & Buhlmann graph estimation (mb)....done
plot(out.npn)

I then apply a nonparanormal transformation to the data; the plot indicates that the graph estimated after the transformation is much sparser at a similar sparsity level. Based on this result, I also believe that suitable transformations of the covariates, together with model selection, may improve the accuracy of my model.

Outlier Analysis

It is important to remove outliers before further analysis. First, I fit a full model; the p-values show that only Fbfriends, Credits, Contacts and News are significant. Then I build a new model by removing the covariates that do not explain the response variable well. It turns out that the new model and all of its coefficients are significant. Then I check Cook's distance to identify outliers.

According to the above plot, I will remove observations 13, 48 and 63. I also try another method to detect outliers: outlier detection by clustering. The idea is to detect outliers with the k-means algorithm. With k-means, the data are partitioned into k groups by assigning each observation to the closest cluster center. After that, we can calculate the distance between each observation and its cluster center and flag those with the largest distances as outliers.

## [1] 29 60 34

For this data set, I prefer Cook's distance for detecting outliers, since the predictive models that follow seem to perform better with it.
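For reference, a minimal sketch of the Cook's-distance screening on the reduced model (the 4/n cutoff shown is a common rule of thumb, not necessarily the rule used to pick observations 13, 48 and 63):

fit2 <- lm(Age~Fbfriends+News+Credits+Contacts, data=train)
cd <- cooks.distance(fit2)
sort(cd, decreasing=TRUE)[1:5]    ## largest Cook's distances
which(cd > 4/nrow(train))         ## 4/n rule-of-thumb flag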

PCA

## Importance of components:
##                           Comp.1    Comp.2     Comp.3     Comp.4
## Standard deviation     1.7797151 1.5205135 1.38314080 1.32264553
## Proportion of Variance 0.1377124 0.1005201 0.08317733 0.07606049
## Cumulative Proportion  0.1377124 0.2382325 0.32140982 0.39747030
##                            Comp.5    Comp.6     Comp.7     Comp.8
## Standard deviation     1.30309259 1.2466598 1.15988728 1.12975003
## Proportion of Variance 0.07382827 0.0675722 0.05849298 0.05549283
## Cumulative Proportion  0.47129858 0.5388708 0.59736375 0.65285659
##                            Comp.9    Comp.10    Comp.11    Comp.12
## Standard deviation     1.09202320 1.02281338 0.96525882 0.93417648
## Proportion of Variance 0.05184846 0.04548466 0.04050976 0.03794286
## Cumulative Proportion  0.70470505 0.75018971 0.79069947 0.82864233
##                           Comp.13    Comp.14   Comp.15    Comp.16
## Standard deviation     0.83018687 0.75430018 0.7384116 0.69619469
## Proportion of Variance 0.02996566 0.02473777 0.0237066 0.02107335
## Cumulative Proportion  0.85860799 0.88334577 0.9070524 0.92812571
##                           Comp.17    Comp.18    Comp.19    Comp.20
## Standard deviation     0.68079961 0.60611810 0.58500807 0.49247786
## Proportion of Variance 0.02015166 0.01597301 0.01487976 0.01054498
## Cumulative Proportion  0.94827737 0.96425038 0.97913014 0.98967511
##                            Comp.21     Comp.22      Comp.23
## Standard deviation     0.448393238 0.190829638 2.932997e-08
## Proportion of Variance 0.008741587 0.001583302 3.740205e-17
## Cumulative Proportion  0.998416698 1.000000000 1.000000e+00

The first thirteen principal components explain around 80% of the variance in the data, which is a large share, so I choose the first thirteen as the significant principal components. From the scree plot, each of the first few principal components explains a substantial amount of variance, the amount explained drops steadily through roughly the thirteenth component, and the cumulative proportion grows only slowly after that. This suggests there is little benefit to examining more than thirteen principal components.
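The proportions quoted above can be reproduced directly from the princomp fit; a minimal sketch (fit.pca matches the object built later for the k-means plot):

fit.pca <- princomp(x, cor=TRUE)
summary(fit.pca)                               ## the importance table shown above
plot(fit.pca, type="lines")                    ## scree plot
cumsum(fit.pca$sdev^2)/sum(fit.pca$sdev^2)     ## cumulative proportion of variance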

Hierarchical Clustering

The above results show that the choice of linkage certainly affects the clusters obtained. Typically, single linkage tends to yield trailing clusters: very large clusters onto which individual observations attach one by one. Complete linkage, on the other hand, tends to yield more balanced clusters, so I prefer complete linkage to single linkage. Clearly, people with similar ages tend to be clustered together, but the groups differ in their other properties.
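A minimal sketch of the single- versus complete-linkage comparison, built on the scaled design matrix used elsewhere (labelling the leaves with Age makes the age grouping visible; the exact call used for the dendrograms above is not shown in the Appendix, so this is only illustrative):

d <- dist(scale(x))                            ## Euclidean distance on scaled covariates
hc.complete <- hclust(d, method="complete")
hc.single <- hclust(d, method="single")
par(mfrow=c(1,2))
plot(hc.complete, labels=train$Age, main="Complete Linkage", xlab="", sub="")
plot(hc.single, labels=train$Age, main="Single Linkage", xlab="", sub="")
par(mfrow=c(1,1))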

K-means Clustering

Since I have a small number of observations, I divide them into two groups. The above plot shows the observations separated into two groups based on the first two principal component scores.

Model Selection

For variable selection, I use best subset selection, forward stepwise selection, backward stepwise selection and lasso regression. Best subset selection returns different answers under different criteria such as Cp, adjusted R-squared and AIC, so I wrote a loop to calculate the leave-one-out MSE for each subset size. The model with 5 variables is the best subset, since it has the smallest MSE:

Subset size:         1        2        3        4        5        6        7        8
LOO MSE:      24.99610 19.61245 19.76116 13.56199 10.34055 13.16404 13.26169 11.91733

Both forward and backward stepwise selection choose the same 5 variables, Contacts, News, Credits, Field and Fbfriends, which is consistent with the best subset. Finally, I use lasso regression; its MSE is 13.4838, higher than that of the best model from the other selection methods. The lasso chooses 6 variables: Fbfriends, Credits, Contacts, ParentContact, Emails and Field.

Check for Non-linear Effect

##         resid variable value
## 677  5.432651   Emails    30
## 678  7.179480   Emails    30
## 679  1.273511   Emails    50
## 680  2.719926   Emails   140
## 681  2.378459   Emails    50
## 682 14.267250   Emails    90

From the above plot, I guess that Fbfriends, News, Credits and Height have non-linear relationships with the residuals.

Splines

First, I build an additive model with all the variables except ID, using splines on Fbfriends, Contacts and Credits. The model shows that only Fbfriends, News, Contacts, Field and Credits are significant, so I remove the other unimportant variables; they do not contribute much to explaining the response. This smaller model performs much better: the leave-one-out MSE decreases from 23.71507 to 8.482493. To try to improve the small model further, I replace the two separate splines with an interaction spline, but the MSE changes only slightly. Adding Emails to the model also fails to reduce the MSE. Another finding is that additive models fitted with a Poisson family perform much better than the ones without it.

Polynomial Regression

To choose the degree of the polynomial terms, I fit a separate polynomial regression of Age on each independent variable and select the degree by cross-validation. Then I build a full model including all the variables except ID together with the chosen polynomial terms. The summary shows that only a few terms are significant, so I remove the unimportant terms as before. In the end, the model without polynomial terms reaches the smallest MSE. After that, I add an interaction term between Fbfriends and Credits, which does improve the model. The smallest MSE I get from polynomial regression is 10.06359.

Linear Model

## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 3.588131    Df = 1     p = 0.05819366
## 
##  Shapiro-Wilk normality test
## 
## data:  ols.model1$residual
## W = 0.97242, p-value = 0.2083
##  lag Autocorrelation D-W Statistic p-value
##    1       0.3778928      1.106719       0
##  Alternative hypothesis: rho != 0

First, I build a full model with all the variables, an interaction term and a polynomial term. Then I keep only the variables that are significant and fit a relatively small model. After that, I diagnose this model with plots and three tests to check whether it satisfies the linear-model assumptions. The residual plot suggests there are still non-linear effects not captured by the model, since the red line in the first panel is not flat. The tests indicate that the constant-variance assumption is borderline (score test p = 0.058), the residuals look approximately normal (Shapiro-Wilk p = 0.21), and the Durbin-Watson test rejects independence. Hence, I consider transformations to address the remaining non-linearity.

Transformation for Response

## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 0.6594613    Df = 1     p = 0.4167502
## 
##  Shapiro-Wilk normality test
## 
## data:  ols.model.trans$residual
## W = 0.99018, p-value = 0.9205

I use the boxcox function in the MASS library to choose a possible transformation for the response variable. The above plot shows that the log-likelihood is largest when lambda is close to -1, so I use 1/Y as the new response. Although the tests on the transformed model no longer reject constant variance or normality, its predictions are no better. In short, transforming the response fails to improve the model.

Weighted Linear Regression

WLS is usually used to address non-constant residual variance in an OLS model; I just want to see whether it can improve my model.

ols.model <- lm(Age~.-ID,data=train)
plot(ols.model, which=1)

A right-opening megaphone shape appears in the plot, suggesting non-constant variance. I therefore regress the absolute values of the residuals from the OLS model on the predictors to estimate sigma, and define the weights as the inverse of the squared estimate. With these weights I fit the WLS model. To check its performance, I calculate its leave-one-out MSE, which is 25.2038, not very good. I then try another WLS model based on the variables chosen by forward/backward stepwise selection; its MSE is 23.6517. That does decrease the MSE, but it is still not good. Perhaps WLS is not suitable for my data set.

Ridge Regression

I also try ridge regression; its MSE is 15.19955, so the lasso does better. It seems that a smaller model suits my data set.

Support Vector Regression

SVM is best known as a classification method, but it can also be used to build regression models. I try two different kernel functions. It is also important to choose the cost parameter, which determines the number of support vectors. The results indicate that the linear kernel reaches the smallest leave-one-out cross-validation MSE, 13.26392.

KNN Regression

This is another common classification method that can also be used for regression. The most important choice when fitting kNN regression is the number of neighbors used to predict a new observation. Leave-one-out cross-validation shows that 7 neighbors gives the smallest MSE, 18.15405.

Random Forest

Finally, I fit a random forest regression with ntree = 100. First I use the full model, again computing the leave-one-out MSE, which is 17.25867; then I try a smaller model with Fbfriends, Credits, Contacts, News, Field, Origin, Pets and Twitter. The MSE declines from 17.25867 to 14.20598.

Bootstrap to Estimate Confidence Interval

x <- model.matrix(Age~.-ID-1,data=train)
xmean <- apply(x,2,mean)
names(xmean)
##  [1] "Height"                                                  
##  [2] "Siblings"                                                
##  [3] "Fbfriends"                                               
##  [4] "Credits"                                                 
##  [5] "Contacts"                                                
##  [6] "Exercise"                                                
##  [7] "News"                                                    
##  [8] "ParentContact"                                           
##  [9] "Texts"                                                   
## [10] "Emails"                                                  
## [11] "PetsBoth"                                                
## [12] "PetsCat"                                                 
## [13] "PetsDog"                                                 
## [14] "PetsNeither"                                             
## [15] "TwitterYes"                                              
## [16] "FieldMath or Statistics"                                 
## [17] "FieldMedical or Life Sciences"                           
## [18] "FieldOther Sciences (Physics, Chemistry, Earth Sciences)"
## [19] "FieldSocial Sciences and Humanities"                     
## [20] "OriginEast Asia or Southeast Asia"                       
## [21] "OriginEurope, Russia, or Central Asia"                   
## [22] "OriginSouth and West Asia and North Africa"              
## [23] "OriginUSA or Canada"
boots <- matrix(0,nrow=10000,1)
set.seed(1)
for(i in 1:length(xmean))
{
  for(j in 1:10000)
  {
    y <- sample(nrow(x),10000,replace=T)
    boots[j] <- mean(x[y,i])
    
  }
  print(names(xmean)[i])
  print(xmean[i] + quantile(boots-xmean[i],c(0.025,0.975)))
}
## [1] "Height"
##    2.5%   97.5% 
## 68.1235 68.2767 
## [1] "Siblings"
##   2.5%  97.5% 
## 1.2618 1.2999 
## [1] "Fbfriends"
##     2.5%    97.5% 
## 578.3245 592.2903 
## [1] "Credits"
##     2.5%    97.5% 
## 10.62379 10.88840 
## [1] "Contacts"
##     2.5%    97.5% 
## 300.4506 311.6892 
## [1] "Exercise"
##   2.5%  97.5% 
## 2.7719 2.8429 
## [1] "News"
##     2.5%    97.5% 
## 14.82989 15.52822 
## [1] "ParentContact"
##     2.5%    97.5% 
## 7.562898 7.947902 
## [1] "Texts"
##    2.5%   97.5% 
## 28.6677 29.9883 
## [1] "Emails"
##     2.5%    97.5% 
## 25.89209 26.63610 
## [1] "PetsBoth"
##   2.5%  97.5% 
## 0.1508 0.1650 
## [1] "PetsCat"
##   2.5%  97.5% 
## 0.0484 0.0570 
## [1] "PetsDog"
##   2.5%  97.5% 
## 0.3241 0.3426 
## [1] "PetsNeither"
##      2.5%     97.5% 
## 0.4462000 0.4658025 
## [1] "TwitterYes"
##   2.5%  97.5% 
## 0.5342 0.5539 
## [1] "FieldMath or Statistics"
##   2.5%  97.5% 
## 0.0483 0.0571 
## [1] "FieldMedical or Life Sciences"
##   2.5%  97.5% 
## 0.0820 0.0933 
## [1] "FieldOther Sciences (Physics, Chemistry, Earth Sciences)"
##      2.5%     97.5% 
## 0.1852000 0.2008025 
## [1] "FieldSocial Sciences and Humanities"
##   2.5%  97.5% 
## 0.0993 0.1112 
## [1] "OriginEast Asia or Southeast Asia"
##   2.5%  97.5% 
## 0.2027 0.2186 
## [1] "OriginEurope, Russia, or Central Asia"
##   2.5%  97.5% 
## 0.1334 0.1471 
## [1] "OriginSouth and West Asia and North Africa"
##   2.5%  97.5% 
## 0.0993 0.1113 
## [1] "OriginUSA or Canada"
##   2.5%  97.5% 
## 0.4991 0.5185

The Confidence Interval for glm.fit3

glm.fit3 <- glm(Age~News+Contacts+Field+Credits+Fbfriends,data=train)
confint(glm.fit3)
## Waiting for profiling to be done...
##                                                                 2.5 %
## (Intercept)                                              23.388355445
## News                                                      0.008053169
## Contacts                                                  0.002028710
## FieldMath or Statistics                                  -4.973896856
## FieldMedical or Life Sciences                            -1.722032963
## FieldOther Sciences (Physics, Chemistry, Earth Sciences) -1.167046667
## FieldSocial Sciences and Humanities                       1.482556233
## Credits                                                  -0.249855590
## Fbfriends                                                -0.005229628
##                                                                 97.5 %
## (Intercept)                                              26.9454377610
## News                                                      0.0858691132
## Contacts                                                  0.0076588042
## FieldMath or Statistics                                   1.5773995621
## FieldMedical or Life Sciences                             3.5627163578
## FieldOther Sciences (Physics, Chemistry, Earth Sciences)  2.5325380195
## FieldSocial Sciences and Humanities                       6.3117985796
## Credits                                                  -0.0181863905
## Fbfriends                                                -0.0007883498

My best model is the additive model, but I cannot easily obtain confidence intervals from it, so I use my second-best model, glm.fit3. It turns out that the confidence intervals obtained from the different approaches vary greatly. Perhaps I need more data to reach a more reliable conclusion.
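The bootstrap above resamples covariate means; to compare with confint(glm.fit3) more directly, one could also bootstrap the coefficients of glm.fit3 themselves, for example with the boot package (a sketch, not what was run above):

library(boot)
coef.fn <- function(data, indices)
{
  coef(glm(Age~News+Contacts+Field+Credits+Fbfriends, data=data[indices,]))
}
set.seed(1)
boot.out <- boot(train, coef.fn, R=1000)
boot.ci(boot.out, type="perc", index=2)   ## percentile interval for the News coefficient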

Conclusion

Currently, my best model is the generalized additive model gam2, with formula Age ~ s(Fbfriends) + s(Credits) + s(Contacts) + News + Field, fitted with a Poisson family; its leave-one-out MSE is 8.482493. All of the R code is in the Appendix.
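For completeness, a minimal sketch of refitting gam2 on the cleaned training data and predicting the test set (this assumes the test file has the same columns and no missing values in the predictors used; it is not part of the original code):

library(mgcv)
gam2 <- gam(Age~s(Fbfriends)+s(Credits)+s(Contacts)+News+Field, data=train, family=poisson)
test$Field <- factor(test$Field, levels=levels(train$Field))   ## align factor levels with train
age.pred <- predict(gam2, newdata=test, type="response")
head(age.pred)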

Appendix

##import data set into R
setwd("C:/Users/ytr/Desktop/6210/hw6/")
train <- read.csv("traindata.csv",header=T)
test <- read.csv("testdata.csv", header=T)
str(train)
df <- as.data.frame(rbind(train[,-2],test[,-2]))
##which lines include missing values
train[!complete.cases(train),]
## Change the level of Field 
levels(train$Field)
train$Field <- factor(train$Field, levels=c("Engineering","Math or Statistics","Medical or Life Sciences","Other Sciences (Physics, Chemistry, Earth Sciences)","Social Sciences and Humanities"))
##plot the covariance matrix graph
x <- model.matrix(Age~.-ID-1,data=train)
library(lattice)
levelplot(cor(x))

###I guess observation 62's major is social science and Humanities based on the hierarchical Clustering method 
train[62,15] <- "Social Sciences and Humanities"
train[62,]

### I guess height relates to news, text, emails, field, origin
height.fit <- lm(Height~Field+Origin,data=train)
summary(height.fit)
height.pred <- predict(height.fit, newdata=train[60,], type="response")
height.pred
train[60,3] <- height.pred

##I guess that Fbfriends relates to Exercise, News and Pets
fbfriends.fit <- lm(Fbfriends~Exercise+News+Pets,data=train)
summary(fbfriends.fit)
## the model is not significant, so I just assign the mean value of Fbfriends to observation 55
summary(train$Fbfriends)
train[55,"Fbfriends"] <- 619
train[55,]
### I guess Credits relates to Contacts, Exercise, News and Origin
credits.fit <- lm(Credits~Fbfriends,data=train)
summary(credits.fit)
## but after fitting models, I found that only Fbfriends is significant
credits.pred <- predict(credits.fit, newdata=train[64,], type="response")
credits.pred
train[64,"Credits"] <- 15
### observation 65 has a missing value about contact
contacts.fit <- lm(Contacts~Age+Fbfriends+Field,data=train)
summary(contacts.fit)
contacts.pred <- predict(contacts.fit, newdata=train[65,], type="response")
contacts.pred
train[65,"Contacts"]=531

##outliers analysis##
fit1 <- lm(Age~.-ID, data=train)
summary(fit1)
fit2 <- lm(Age~Fbfriends+News+Credits+Contacts,data=train)
par(mfrow=c(3,1))
plot(fit2, which=1)
plot(fit2, which=4)
plot(fit2, which=6)
par(mfrow=c(1,1))

## outlier detection by clustering ##
set.seed(1)
sd.x <- scale(x)
km.out <- kmeans(sd.x,2)
##cluster center
km.out$center
## calculate distance between objects and cluster centers
centers <- km.out$centers[km.out$cluster,]
distances <- sqrt(rowSums((sd.x-centers)^2))
outliers <- order(distances, decreasing=T)[1:3]
outliers

train <- train[-c(13,48,63),]
dim(train)

## k-means clustering ##
set.seed(1)
x <- model.matrix(Age~.-ID-1,data=train)
sd.x <- scale(x)
km.out <- kmeans(sd.x,2)
km.clusters=km.out$cluster
km.clusters
fit.pca <- princomp(x, cor=T)
plot(fit.pca$scores[,1:2],col=(km.out$cluster+1),main="k-Means Clustering Results with K=2", xlab="", ylab="", pch=20, cex=2)
library(cluster)
clusplot(sd.x,km.clusters,color=T,shade=T,labels=2,lines=0)
train_1 <- train[km.clusters==2,]
train_2 <- train[km.clusters==1,]
library(plyr)
ddply(train_1,~Age, summarise, length_Age=length(Age))
ddply(train_2,~Age, summarise, length_Age=length(Age))
dim(train_1)
dim(train_2)

## lasso regression ##
set.seed(1)
library(glmnet)
x <- model.matrix(Age~.-ID-1,data=train)
dim(x)
y <- as.matrix(train$Age)
length(y)
lasso.fit <- glmnet(x,y,alpha=1)
cv.lasso <- cv.glmnet(x=x, y=y, alpha=1,nfolds=20)
mse.lasso <- min(cv.lasso$cvm)
mse.lasso
bestlam <- cv.lasso$lambda.min
coef <- coef(lasso.fit, s=bestlam)
coef
library(lattice)
levelplot(t(as.matrix(coef)))

##ridge regression##
set.seed(1)
ridge.fit <- glmnet(x,y,alpha=0)
cv.ridge <- cv.glmnet(x=x, y=y, alpha=0,nfolds=20)
mse.ridge <- min(cv.ridge$cvm)
mse.ridge
bestlam1 <- cv.ridge$lambda.min
coef <- coef(ridge.fit, s=bestlam1)
coef
levelplot(t(as.matrix(coef)))

###model selection###
library(leaps)
regfit.full <- regsubsets(Age~.-ID, data=train)
regfit.full
reg.summary <- summary(regfit.full)
which.max(reg.summary$adjr2)
which.min(reg.summary$cp)
test$Age <- 0
test.mat <- model.matrix(Age~.-ID, data=test)
x <- model.matrix(Age~.-ID,data=train)
errors <- matrix(NA, 62, 8, dimnames=list(NULL, paste(1:8)))
for(j in 1:62)
{
  regfit.full <- regsubsets(Age~.-ID, data=train[-j,])
  for(i in 1:8)
  {
    coefi <- coef(regfit.full, id=i)
    pred <- x[j,names(coefi)]%*%coefi
    errors[j,i] <- mean((pred-train$Age[j])^2)
  }
}
errors
apply(errors,2,mean)

##so I would like to choose the coef5 based on the test mse
coef5 <- coef(regfit.full, 5)
coef5


##forward stepwise selection##
library("MASS")
null <- lm(Age~1, data=train)
full <- lm(Age~.-ID,data=train)
step.forward.aic <- stepAIC(null, scope=formula(full), direction="forward")
summary(step.forward.aic)
##backward stepwise selection##
step.backward.aic <- stepAIC(full, scope=formula(null), direction="backward", trace=F)
summary(step.backward.aic)


## check for non-linear effect ##
library(ggplot2)
library(reshape)
fit2 <- lm(Age~Fbfriends+News+Credits+Contacts,data=train)
resid <- summary(fit2)$residuals
train1 <- cbind(train,resid)
train.melt <- melt(train1[,c(2:12,17)],id=c("resid"))
tail(train.melt)
ggplot(train.melt, aes(x=value, y=resid))+geom_point()+facet_wrap(~variable, scale="free")
library("mgcv")
library("gamclass")
pred.gam1 <- rep(0,62)
for(i in 1:62)
{
  gam1 <- gam(Age~s(Fbfriends)+s(Contacts)+s(Credits)+News+Height+Siblings+Texts+Emails+Pets+Field+Origin+Twitter,data=train[-i,],family=poisson)
  pred.gam1[i] <- predict(gam1, newdata=train[i,], type="response")
}
mse.gam1 <- mean((pred.gam1-train$Age)^2)
mse.gam1
gam1 <- gam(Age~s(Fbfriends)+s(Contacts)+s(Credits)+News+Height+Siblings+Texts+Emails+Pets+Field+Origin+Twitter,data=train,family=poisson)
summary(gam1)


pred.gam2 <- rep(0,62)
for(i in 1:62)
{
  gam2 <- gam(Age~s(Fbfriends)+s(Credits)+s(Contacts)+News+Field,data=train[-i,],family=poisson)
  pred.gam2[i] <- predict(gam2, newdata=train[i,], type="response")
}
mse.gam2 <- mean((pred.gam2-train$Age)^2)
mse.gam2
gam2 <- gam(Age~s(Fbfriends)+s(Credits)+s(Contacts)+News+Field,data=train,family=poisson)
summary(gam2)


pred.gam3 <- rep(0,62)
for(i in 1:62)
{
  gam3 <- gam(Age~s(Contacts)+News+s(Fbfriends,Credits)+Field+Emails,data=train[-i,],family=poisson)
  pred.gam3[i] <- predict(gam3, newdata=train[i,], type="response") 
}
mse.gam3 <- mean((pred.gam3-train$Age)^2)
mse.gam3

gam3 <- gam(Age~s(Contacts)+News+s(Fbfriends,Credits)+Field+Emails,data=train,family=poisson)
summary(gam3)

##polynomial regression##
library("boot")
##for Fbfriends
cv.error=rep(0,5)
for(i in 1:5){
  glm.fit <- glm(Age~poly(Fbfriends,i),data=train)
  cv.error[i] <- cv.glm(train,glm.fit,K=10)$delta[1]
}
which(cv.error==min(cv.error))

##for News
cv.error=rep(0,5)
for(i in 1:5){
  glm.fit <- glm(Age~poly(News,i),data=train)
  cv.error[i] <- cv.glm(train,glm.fit,K=10)$delta[1]
}
which(cv.error==min(cv.error))

##for Credits
cv.error=rep(0,5)
for(i in 1:5){
  glm.fit <- glm(Age~poly(Credits,i),data=train)
  cv.error[i] <- cv.glm(train,glm.fit,K=10)$delta[1]
}
which(cv.error==min(cv.error))
##3

##for Emails
cv.error=rep(0,5)
for(i in 1:5){
  glm.fit <- glm(Age~poly(Emails,i),data=train)
  cv.error[i] <- cv.glm(train,glm.fit,K=10)$delta[1]
}
which(cv.error==min(cv.error))
##for Contacts
cv.error=rep(0,5)
for(i in 1:5){
  glm.fit <- glm(Age~poly(Contacts,i),data=train)
  cv.error[i] <- cv.glm(train,glm.fit,K=10)$delta[1]
}
which(cv.error==min(cv.error))
##3

#for ParentContact
cv.error=rep(0,5)
for(i in 1:5){
  glm.fit <- glm(Age~poly(ParentContact,i),data=train)
  cv.error[i] <- cv.glm(train,glm.fit,K=10)$delta[1]
}
which(cv.error==min(cv.error))


set.seed(1)
glm.fit1 <- glm(Age~Height+Siblings+Fbfriends+poly(Credits,3)+poly(Contacts,3)+Exercise+News+ParentContact+Texts+Emails+Pets+Twitter+Field+Origin,data=train)
summary(glm.fit1)
cv.glm.fit1 <- cv.glm(train,glm.fit1,K=10)$delta[1]
cv.glm.fit1
set.seed(1)
glm.fit2 <- glm(Age~News+Contacts+I(Contacts^3)+Field+Credits,data=train)
summary(glm.fit2)
cv.glm.fit2 <- cv.glm(train, glm.fit2, K=10)$delta[1]
cv.glm.fit2
set.seed(1)
glm.fit3 <- glm(Age~News+Contacts+Field+Credits+Fbfriends,data=train)
summary(glm.fit3)
cv.glm.fit3 <- cv.glm(train,glm.fit3,K=10)$delta[1]
cv.glm.fit3
glm.fit4 <- glm(Age~News+Contacts+Field+Credits+Fbfriends+I(Fbfriends*Credits),data=train)
summary(glm.fit3)
cv.glm.fit4 <- cv.glm(train,glm.fit4,K=10)$delta[1]
cv.glm.fit4


### Linear Model ###
ols.model <- lm(Age~.-ID+I(Contacts^2)+I(Credits*Fbfriends),data=train)
summary(ols.model)
ols.model1 <- lm(Age~I(Contacts^2)+I(Credits*Fbfriends)+Field+News,data=train)
summary(ols.model1)
par(mfrow=c(2,2))
plot(ols.model1)
library(car)
ncvTest(ols.model1)
shapiro.test(ols.model1$residual)
durbinWatsonTest(ols.model1)
### transformation for response variable ###
library(MASS)
boxcox(ols.model1)
ols.model.trans <- lm((1/Age)~Fbfriends+Credits+Field+News+Contacts,data=train)
summary(ols.model.trans)
plot(ols.model.trans)
ncvTest(ols.model.trans)
shapiro.test(ols.model.trans$residual)

##weighted linear regression##
ols.model <- lm(Age~.-ID,data=train)
plot(ols.model, which=1)
plot(train$Age, abs(ols.model$resid), xlab="Age", ylab="absolute value of Residual")
## regression absolute value of residuals on x to get estimate of SD
sd.model <-lm(abs(ols.model$resid)~.-ID,data=train)
sd.hat <- sd.model$fitted
sd.hat
w <- 1/(sd.hat^2)
pred <- rep(0,62)
for(i in 1:62)
{
  wls.model <- lm(Age~.-ID,data=train[-i,],weights=w[-i])
  pred[i] <- predict(wls.model, newdata=train[i,], type="response")
}
mse.wls <- mean((pred-train$Age)^2)
mse.wls
## linear model based on the best model subset##
ols.model1 <- lm(Age~Fbfriends+Credits+Contacts+News+Field,data=train)
plot(ols.model1,which=1)
summary(ols.model1)
plot(train$Age, abs(ols.model1$resid), xlab="Age", ylab="absolute value of Residual")
## regression absolute value of residuals on x to get estimate of SD
sd.model1 <- lm(abs(ols.model1$resid)~Fbfriends+Credits+Contacts+News+Field,data=train)
sd.hat1 <- sd.model1$fitted
sd.hat1
##Estimate weights as the inverse of the estimated variance
w <- 1/(sd.hat1^2)
pred <- rep(0,62)
for(i in 1:62)
{
  wls.model <- lm(Age~Contacts+Credits+Fbfriends+News+Field,data=train[-i,],weights=w[-i])
  pred[i] <- predict(wls.model, newdata=train[i,], type="response")
}
mse.wls <- mean((pred-train$Age)^2)
mse.wls



###kNN regression 
library(FNN)
y <- train$Age
mse.knn <- rep(0,20)
x1 <- model.matrix(Age~.-ID,data=train)
pred <- rep(0,62)
for (j in 1:20)
{
  for (i in 1:62)
  {
    knn.fit <- knn.reg(x1[-i,],x1[i,,drop=FALSE],y[-i],k=j)
    pred[i]<- knn.fit$pred
  }
  mse.knn[j] <- mean((pred-train$Age)^2)
}
which.min(mse.knn)
## k=7 gives the smallest MSE
mse.knn[7]

## support vector regression ##
library(e1071)
svmfit <- svm(Age~.-ID, data=train, kernel="linear",cost=10)
summary(svmfit)
set.seed(1)
pred <- rep(0,62)
for (i in 1:62)
{
  tune.out <- tune(svm,Age~.-ID,data=train[-i,], kernel="linear", ranges=list(cost=c(0.001,0.05,0.1,0.5,1,10,100)))
  bestmodel <- tune.out$best.model
  pred[i] <- predict(bestmodel, newdata=train[i,],type="response")  
}
mse.svm.linear <- mean((pred-train$Age)^2)
mse.svm.linear



svmfit1 <- svm(Age~.-ID, data=train, kernel="radial",cost=0.1)
summary(svmfit1)
tune.out1 <- tune(svm,Age~.-ID,data=train, kernel="radial", ranges=list(cost=c(0.01,0.1,1,10,100,500,1000),gamma=c(0.1,0.005,0.05,0.5,1,2,3,4)))
bestmodel1 <- tune.out1$best.model
summary(bestmodel1)
set.seed(1)
pred <- rep(0,62)
for (i in 1:62)
{
  tune.out1 <- tune(svm,Age~.-ID,data=train[-i,], kernel="radial", ranges=list(cost=c(0.01,0.1,1,10,100,500,1000),gamma=c(0.1,0.005,0.05,0.5,1,2,3,4)))
  bestmodel1 <- tune.out1$best.model
  pred[i] <- predict(bestmodel1, newdata=train[i,])
}
mse.svm.radial <- mean((pred-train$Age)^2)
mse.svm.radial

### random forest ###
set.seed(1)
library(randomForest)
pred <- rep(0,62)
for(i in 1:62)
{
  rf <- randomForest(Age~.,data=train[-i,],ntree=100)
  pred[i] <- predict(rf,train[i,])
}
mse.rf <- mean((pred-train$Age)^2)
mse.rf
set.seed(1)
train1 <- train[,c(2,5,6,7,9,15,16,13,14)]
head(train1)
for(i in 1:62)
{
  rf <- randomForest(Age~.,data=train1[-i,],ntree=100)
  pred[i] <- predict(rf,train1[i,])
}
mse.rf <- mean((pred-train1$Age)^2)
mse.rf

### bootstrap to predict the confidence interval ###

x <- model.matrix(Age~.-ID-1,data=train)
colnames(x)
xmean <- apply(x,2,mean)
names(xmean)
boots <- matrix(0,nrow=10000,1)
set.seed(1)
for(i in 1:length(xmean))
{
  for(j in 1:10000)
  {
    y <- sample(nrow(x),10000,replace=T)
    boots[j] <- mean(x[y,i])
    
  }
  print(names(xmean)[i])
  print(xmean[i] + quantile(boots-xmean[i],c(0.025,0.975)))
}
confint(glm.fit3)
library(huge)
plot(huge(x))
x.npn=huge.npn(x,npn.func="truncation")
out.npn <-huge(x.npn,nlambda=40,lambda.min.ratio=0.4)
plot(out.npn)