Import Data

credit <- read.csv("~/530/Lab1/credit.csv")
str(credit)
## 'data.frame':    1000 obs. of  21 variables:
##  $ Creditability                    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Account.Balance                  : int  1 1 2 1 1 1 1 1 4 2 ...
##  $ Duration.of.Credit..month.       : int  18 9 12 12 12 10 8 6 18 24 ...
##  $ Payment.Status.of.Previous.Credit: int  4 4 2 4 4 4 4 4 4 2 ...
##  $ Purpose                          : int  2 0 9 0 0 0 0 0 3 3 ...
##  $ Credit.Amount                    : int  1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
##  $ Value.Savings.Stocks             : int  1 1 2 1 1 1 1 1 1 3 ...
##  $ Length.of.current.employment     : int  2 3 4 3 3 2 4 2 1 1 ...
##  $ Instalment.per.cent              : int  4 2 2 3 4 1 1 2 4 1 ...
##  $ Sex...Marital.Status             : int  2 3 2 3 3 3 3 3 2 2 ...
##  $ Guarantors                       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Duration.in.Current.address      : int  4 2 4 2 4 3 4 4 4 4 ...
##  $ Most.valuable.available.asset    : int  2 1 1 1 2 1 1 1 3 4 ...
##  $ Age..years.                      : int  21 36 23 39 38 48 39 40 65 23 ...
##  $ Concurrent.Credits               : int  3 3 3 3 1 3 3 3 3 3 ...
##  $ Type.of.apartment                : int  1 1 1 1 2 1 2 2 2 1 ...
##  $ No.of.Credits.at.this.Bank       : int  1 2 1 2 2 2 2 1 2 1 ...
##  $ Occupation                       : int  3 3 2 2 2 2 2 2 1 1 ...
##  $ No.of.dependents                 : int  1 2 1 2 1 2 1 2 1 1 ...
##  $ Telephone                        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Foreign.Worker                   : int  1 1 1 2 2 2 2 2 1 1 ...

Explore the data

summary(credit$Credit.Amount)   # the column is named Credit.Amount; `Credit Amount` does not exist and would return NULL
table(credit$Creditability)
## 
##   0   1 
## 300 700

Split data into training and test records

set.seed(12350)
credit_rand <- credit[order(runif(1000)), ]
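
Ordering by runif() simply shuffles the rows. An equivalent, arguably clearer idiom is sample(); this is only an alternative sketch, not the call the results below are based on:

credit_rand <- credit[sample(nrow(credit)), ]   # shuffle rows; same effect as ordering by runif()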

Check the randomization

summary(credit_rand$Credit.Amount)   # the shuffled data should show the same distribution as the original

Subset

train_set <- credit_rand[1:900, ]
test_set <- credit_rand[901:1000, ]

Check class percentages in each split

prop.table(table(train_set$Creditability))
## 
##         0         1 
## 0.3011111 0.6988889
prop.table(table(test_set$Creditability))
## 
##    0    1 
## 0.29 0.71
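
Both splits are close to the full data set's 30/70 balance, so the shuffle worked. If the proportions had drifted badly, a stratified split would guarantee they match; a minimal sketch, assuming the caret package is available (it is not used elsewhere in this lab):

library(caret)
# createDataPartition() samples within each class, preserving the 0/1 ratio
idx <- createDataPartition(factor(credit$Creditability), p = 0.9, list = FALSE)
strat_train <- credit[idx, ]
strat_test  <- credit[-idx, ]
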
library('C50')
train_set$Creditability <- as.factor(train_set$Creditability)
train_model <- C5.0(x = train_set[-1], y = train_set$Creditability)
train_model
## 
## Call:
## C5.0.default(x = train_set[-1], y = train_set$Creditability)
## 
## Classification Tree
## Number of samples: 900 
## Number of predictors: 20 
## 
## Tree size: 77 
## 
## Non-standard options: attempt to group attributes

Evaluate model performance

cred_pred <- predict(train_model, test_set)
library('gmodels')
CrossTable(test_set$Creditability, cred_pred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('Actual Creditability', 'Predicted Creditability'))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  100 
## 
##  
##                      | Predicted Creditability 
## Actual Creditability |         0 |         1 | Row Total | 
## ---------------------|-----------|-----------|-----------|
##                    0 |        18 |        11 |        29 | 
##                      |     0.180 |     0.110 |           | 
## ---------------------|-----------|-----------|-----------|
##                    1 |        17 |        54 |        71 | 
##                      |     0.170 |     0.540 |           | 
## ---------------------|-----------|-----------|-----------|
##         Column Total |        35 |        65 |       100 | 
## ---------------------|-----------|-----------|-----------|
## 
## 
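
For comparison with the models below, the same result can be condensed into an accuracy figure; a minimal sketch using the objects already in memory:

(p0 <- table(cred_pred, test_set$Creditability))
sum(diag(p0)) / sum(p0) * 100   # (18 + 54) / 100 = 72% correct on this split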

#Q1 - If you see an accuracy of 100%, what does it mean? Does this mean that we designed a perfect model? This is something that needs more discussion. Write a few sentences about an accuracy of 100%.

An accuracy of 100% most likely means that the test set overlaps the training set. Decision trees are prone to overfitting, so they can effectively memorize the records they were trained on and classify them perfectly. To get a realistic estimate of performance on unknown cases, the model must always be evaluated on a test set that is completely independent of the training data.
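
One quick sanity check is to confirm that the two sets share no rows; a minimal sketch (row names carry over from the original data frame, so any overlap would show up here):

intersect(rownames(train_set), rownames(test_set))   # should return character(0)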

Model 2: Random Forest

library('randomForest')
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
credit <- data.frame(credit)
set.seed(12350)
credit_rand <- credit[order(runif(1000)), ]
train_set <- credit_rand[1:900, ]
test_set <- credit_rand[901:1000, ]
train_set$Creditability <- as.factor(train_set$Creditability)
random_model <- randomForest(Creditability ~ ., data = train_set)
summary(random_model)
##                 Length Class  Mode     
## call               3   -none- call     
## type               1   -none- character
## predicted        900   factor numeric  
## err.rate        1500   -none- numeric  
## confusion          6   -none- numeric  
## votes           1800   matrix numeric  
## oob.times        900   -none- numeric  
## classes            2   -none- character
## importance        20   -none- numeric  
## importanceSD       0   -none- NULL     
## localImportance    0   -none- NULL     
## proximity          0   -none- NULL     
## ntree              1   -none- numeric  
## mtry               1   -none- numeric  
## forest            14   -none- list     
## y                900   factor numeric  
## test               0   -none- NULL     
## inbag              0   -none- NULL     
## terms              3   terms  call
cred_pred <- predict(random_model, test_set)
(p <- table(cred_pred, test_set$Creditability))
##          
## cred_pred  0  1
##         0 19 10
##         1 10 61
(Accuracy <- sum(diag(p))/sum(p)*100)
## [1] 80
importance(random_model)
##                                   MeanDecreaseGini
## Account.Balance                          41.263854
## Duration.of.Credit..month.               38.529634
## Payment.Status.of.Previous.Credit        20.982267
## Purpose                                  23.090131
## Credit.Amount                            50.535178
## Value.Savings.Stocks                     18.545158
## Length.of.current.employment             20.033049
## Instalment.per.cent                      15.897317
## Sex...Marital.Status                     13.698222
## Guarantors                                7.472563
## Duration.in.Current.address              15.453941
## Most.valuable.available.asset            17.345769
## Age..years.                              38.450508
## Concurrent.Credits                        9.217700
## Type.of.apartment                         9.726398
## No.of.Credits.at.this.Bank                8.048862
## Occupation                               12.256265
## No.of.dependents                          5.369362
## Telephone                                 7.450824
## Foreign.Worker                            1.616620
# Top three predictors by MeanDecreaseGini:
# Credit.Amount                            50.535178
# Account.Balance                          41.263854
# Duration.of.Credit..month.               38.529634
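# The ranking above can also be inspected graphically; varImpPlot() from the
# randomForest package plots MeanDecreaseGini for each predictor (an optional
# extra, not part of the original lab output):
varImpPlot(random_model)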
set.seed(23458)
credit_rand1 <- credit[order(runif(1000)), ]
train_set1<-credit_rand1[1:900,]
test_set1<-credit_rand1[901:1000,]
train_set1$Creditability <- as.factor(train_set1$Creditability)
random_model1 <- randomForest(Creditability ~ ., data = train_set1)
cred_pred1 <- predict(random_model, test_set1)   # note: this applies random_model from the first split, not random_model1
(p1 <- table(cred_pred1, test_set1$Creditability))
##           
## cred_pred1  0  1
##          0 28  3
##          1  3 66
(Accuracy <- sum(diag(p1))/sum(p1)*100)
## [1] 94
# The reported 94% is not evidence of a better model: the prediction above reuses random_model, which was trained on the first split, and most rows of test_set1 also appear in that training data, so this accuracy is inflated by train/test overlap rather than improved by the new split.
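
A fairer comparison would evaluate each forest on its own held-out split, and ideally repeat the whole split/train/test cycle over several seeds to see how much the accuracy varies; a minimal sketch of that idea (the seed values are arbitrary, and sample() replaces the runif() ordering used above):

accs <- sapply(c(12350, 23458, 31415), function(s) {
  set.seed(s)
  shuffled <- credit[sample(nrow(credit)), ]
  tr <- shuffled[1:900, ]
  te <- shuffled[901:1000, ]
  tr$Creditability <- as.factor(tr$Creditability)
  m <- randomForest(Creditability ~ ., data = tr)
  mean(as.character(predict(m, te)) == te$Creditability)
})
accs   # one held-out accuracy per seed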

Adding regression to trees

wine <- read.csv("~/530/Lab1/whitewines.csv")
str(wine)
## 'data.frame':    4898 obs. of  12 variables:
##  $ fixed.acidity       : num  6.7 5.7 5.9 5.3 6.4 7 7.9 6.6 7 6.5 ...
##  $ volatile.acidity    : num  0.62 0.22 0.19 0.47 0.29 0.14 0.12 0.38 0.16 0.37 ...
##  $ citric.acid         : num  0.24 0.2 0.26 0.1 0.21 0.41 0.49 0.28 0.3 0.33 ...
##  $ residual.sugar      : num  1.1 16 7.4 1.3 9.65 0.9 5.2 2.8 2.6 3.9 ...
##  $ chlorides           : num  0.039 0.044 0.034 0.036 0.041 0.037 0.049 0.043 0.043 0.027 ...
##  $ free.sulfur.dioxide : num  6 41 33 11 36 22 33 17 34 40 ...
##  $ total.sulfur.dioxide: num  62 113 123 74 119 95 152 67 90 130 ...
##  $ density             : num  0.993 0.999 0.995 0.991 0.993 ...
##  $ pH                  : num  3.41 3.22 3.49 3.48 2.99 3.25 3.18 3.21 2.88 3.28 ...
##  $ sulphates           : num  0.32 0.46 0.42 0.54 0.34 0.43 0.47 0.47 0.47 0.39 ...
##  $ alcohol             : num  10.4 8.9 10.1 11.2 10.9 ...
##  $ quality             : int  5 6 6 4 6 6 6 6 6 7 ...
hist(wine$quality)

Exploring and Preparing the Data

wine_train <- wine[1:3750, ]
wine_test <- wine[3751:4898, ]
library('rpart')
m.rpart <- rpart(quality ~ ., data=wine_train)
m.rpart
## n= 3750 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 3750 2945.53200 5.870933  
##    2) alcohol< 10.85 2372 1418.86100 5.604975  
##      4) volatile.acidity>=0.2275 1611  821.30730 5.432030  
##        8) volatile.acidity>=0.3025 688  278.97670 5.255814 *
##        9) volatile.acidity< 0.3025 923  505.04230 5.563380 *
##      5) volatile.acidity< 0.2275 761  447.36400 5.971091 *
##    3) alcohol>=10.85 1378 1070.08200 6.328737  
##      6) free.sulfur.dioxide< 10.5 84   95.55952 5.369048 *
##      7) free.sulfur.dioxide>=10.5 1294  892.13600 6.391036  
##       14) alcohol< 11.76667 629  430.11130 6.173291  
##         28) volatile.acidity>=0.465 11   10.72727 4.545455 *
##         29) volatile.acidity< 0.465 618  389.71680 6.202265 *
##       15) alcohol>=11.76667 665  403.99400 6.596992 *
library('rpart.plot')
rpart.plot(m.rpart, digits=3)

rpart.plot(m.rpart, digits=4, fallen.leaves = TRUE, type = 3, extra = 101)

Evaluate model performance

p.rpart <- predict(m.rpart, wine_test)
summary(p.rpart)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   4.545   5.563   5.971   5.893   6.202   6.597
summary(wine_test$quality)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.000   5.000   6.000   5.901   6.000   9.000
cor(p.rpart, wine_test$quality)
## [1] 0.5369525

Question #3

#What is your interpretation of this RMSE value?

#The lower the RMSE, the better the model fits. An RMSE of about 0.84 means the predicted quality scores are off by almost a full point on average, which is a fairly weak fit for prediction.
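
The RMSE of about 0.84 referenced above is not computed in the code shown; a minimal sketch of how it could be obtained from the existing p.rpart and wine_test objects:

# root mean squared error between predicted and actual quality
sqrt(mean((p.rpart - wine_test$quality)^2))
# mean absolute error, often reported alongside RMSE
mean(abs(p.rpart - wine_test$quality))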

Question #4

news <- read.csv("~/530/Lab1/OnlineNewsPopularity_for_R.csv")
#remove the first two columns, which are non-predictive variables
news <- news[,-(1:2)]
View(news)
#keep only the columns used as predictors, plus shares
news1 <- data.frame(news$n_tokens_title, news$n_tokens_content, news$n_unique_tokens, news$n_non_stop_words, news$num_hrefs, news$num_imgs, news$num_videos, news$num_keywords, news$kw_max_max, news$global_sentiment_polarity, news$avg_positive_polarity, news$title_subjectivity, news$title_sentiment_polarity, news$abs_title_subjectivity, news$abs_title_sentiment_polarity, news$shares)
#rename the columns
colnames(news1) <- c("title", "content", "unique_tokens", "non_stop_words", "num_hrefs", "num_imgs", "num_videos", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")
#articles with more than 1400 shares are labeled popular (1), the rest unpopular (0)
news1$shares <- as.factor(ifelse(news1$shares > 1400,1,0))
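
It is worth checking that the 1400-share cutoff produces reasonably balanced classes before modeling; a minimal check on the column just created:

# proportion of unpopular (0) vs popular (1) articles
prop.table(table(news1$shares))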

Random Forest

set.seed(12345)

news_rand <- news1[order(runif(39643)), ]

news_train <- news_rand[1:7928, ]
news_test <- news_rand[7929:39643, ]   # start at 7929 so the training and test sets do not overlap
news_train$shares <- as.factor(news_train$shares)   # shares is already a factor, so this is a harmless no-op
random_News <- randomForest(shares ~ . , data = news_train)
news_pred_RF <- predict(random_News, news_test)
(p3 <- table(news_pred_RF, news_test$shares))
##             
## news_pred_RF    0    1
##            0 9516 6805
##            1 6574 8821
(Accuracy <- sum(diag(p3))/sum(p3)*100)
## [1] 57.81624
#57.8% accuracy is better than chance, but far from ideal
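
A useful reference point is the majority-class baseline on the same test set; a minimal sketch using the objects already in memory:

max(prop.table(table(news_test$shares))) * 100   # accuracy of always predicting the most common class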

importance(random_News)
##                              MeanDecreaseGini
## title                                239.5514
## content                              387.3460
## unique_tokens                        425.9253
## non_stop_words                       383.3808
## num_hrefs                            301.2238
## num_imgs                             210.4306
## num_videos                           119.6143
## num_keywords                         204.3838
## kw_max_max                           107.5791
## global_sentiment_polarity            451.8444
## avg_positive_polarity                412.4018
## title_subjectivity                   167.8869
## title_sentiment_polarity             178.5317
## abs_title_subjectivity               151.4992
## abs_title_sentiment_polarity         147.9789
# Top three features by MeanDecreaseGini:
# global_sentiment_polarity, unique_tokens, avg_positive_polarity

Decision Trees

dt_model <- C5.0(news_train[-16], news_train$shares)
summary(dt_model)
## 
## Call:
## C5.0.default(x = news_train[-16], y = news_train$shares)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Sun Sep 20 16:11:05 2020
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 7928 cases (16 attributes) from undefined.data
## 
## Decision tree:
## 
## num_imgs > 3:
## :...num_keywords > 4: 1 (1977/769)
## :   num_keywords <= 4:
## :   :...avg_positive_polarity <= 0.235552: 1 (5)
## :       avg_positive_polarity > 0.235552: 0 (85/28)
## num_imgs <= 3:
## :...unique_tokens <= 0.4312321:
##     :...num_keywords > 9: 1 (105/33)
##     :   num_keywords <= 9:
##     :   :...kw_max_max <= 690400: 1 (75/19)
##     :       kw_max_max > 690400:
##     :       :...num_videos > 2:
##     :           :...avg_positive_polarity <= 0.2989583: 1 (4)
##     :           :   avg_positive_polarity > 0.2989583: 0 (19/2)
##     :           num_videos <= 2:
##     :           :...global_sentiment_polarity > 0.121371:
##     :               :...num_hrefs <= 4: 0 (8)
##     :               :   num_hrefs > 4: 1 (123/35)
##     :               global_sentiment_polarity <= 0.121371:
##     :               :...non_stop_words <= 0: 1 (145/64)
##     :                   non_stop_words > 0:
##     :                   :...global_sentiment_polarity <= -0.005681818: 0 (9)
##     :                       global_sentiment_polarity > -0.005681818:
##     :                       :...title > 13: 0 (14/2)
##     :                           title <= 13:
##     :                           :...num_keywords <= 4: 1 (11/2)
##     :                               num_keywords > 4:
##     :                               :...num_videos <= 1: 0 (76/32)
##     :                                   num_videos > 1: 1 (7/2)
##     unique_tokens > 0.4312321:
##     :...kw_max_max <= 617900: 1 (790/358)
##         kw_max_max > 617900:
##         :...global_sentiment_polarity <= 0.08353174:
##             :...num_imgs > 0:
##             :   :...num_imgs <= 2: 0 (1207/405)
##             :   :   num_imgs > 2:
##             :   :   :...num_keywords <= 6:
##             :   :       :...num_videos <= 0: 0 (23/4)
##             :   :       :   num_videos > 0: 1 (11/4)
##             :   :       num_keywords > 6:
##             :   :       :...content <= 1038: 1 (43/15)
##             :   :           content > 1038: 0 (4)
##             :   num_imgs <= 0:
##             :   :...title <= 9: 0 (96/30)
##             :       title > 9:
##             :       :...kw_max_max <= 690400:
##             :           :...num_videos <= 1: 1 (15/3)
##             :           :   num_videos > 1: 0 (11/4)
##             :           kw_max_max > 690400:
##             :           :...num_keywords > 7:
##             :               :...title <= 11: 0 (42/20)
##             :               :   title > 11:
##             :               :   :...abs_title_sentiment_polarity <= 0.2467532: 1 (34/7)
##             :               :       abs_title_sentiment_polarity > 0.2467532: 0 (10/2)
##             :               num_keywords <= 7:
##             :               :...title > 11: 0 (53/17)
##             :                   title <= 11:
##             :                   :...num_hrefs > 13: 1 (4)
##             :                       num_hrefs <= 13:
##             :                       :...title <= 10: 0 (30/10)
##             :                           title > 10:
##             :                           :...title_subjectivity <= 0.6333333: 1 (37/15)
##             :                               title_subjectivity > 0.6333333: 0 (5)
##             global_sentiment_polarity > 0.08353174:
##             :...num_videos > 2: 1 (207/78)
##                 num_videos <= 2:
##                 :...num_imgs > 1:
##                     :...num_videos <= 0: 0 (292/143)
##                     :   num_videos > 0:
##                     :   :...abs_title_sentiment_polarity <= 0.1305556: 1 (73/19)
##                     :       abs_title_sentiment_polarity > 0.1305556:
##                     :       :...title <= 9: 1 (6)
##                     :           title > 9: 0 (38/13)
##                     num_imgs <= 1:
##                     :...kw_max_max > 690400: 0 (1825/781)
##                         kw_max_max <= 690400:
##                         :...num_hrefs > 11: 1 (88/32)
##                             num_hrefs <= 11:
##                             :...num_videos <= 0:
##                                 :...num_imgs > 0: 0 (181/67)
##                                 :   num_imgs <= 0:
##                                 :   :...abs_title_subjectivity <= 0.1583333: 0 (21/2)
##                                 :       abs_title_subjectivity > 0.1583333: 1 (38/12)
##                                 num_videos > 0:
##                                 :...abs_title_subjectivity <= 0.08522727: 1 (7)
##                                     abs_title_subjectivity > 0.08522727:
##                                     :...num_videos > 1: 1 (8/2)
##                                         num_videos <= 1:
##                                         :...num_keywords > 8: 0 (9/1)
##                                             num_keywords <= 8:
##                                             :...num_imgs > 0: [S1]
##                                                 num_imgs <= 0:
##                                                 :...num_keywords > 7: 0 (4)
##                                                     num_keywords <= 7:
##                                                     :...content <= 171: 1 (19/4)
##                                                         content > 171: 0 (17/5)
## 
## SubTree [S1]
## 
## avg_positive_polarity <= 0.3797951: 1 (13/3)
## avg_positive_polarity > 0.3797951: 0 (4)
## 
## 
## Evaluation on training data (7928 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##      50 3044(38.4%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##    2515  1476    (a): class 0
##    1568  2369    (b): class 1
## 
## 
##  Attribute usage:
## 
##  100.00% num_imgs
##   73.93% unique_tokens
##   72.60% kw_max_max
##   61.40% global_sentiment_polarity
##   41.95% num_videos
##   38.16% num_keywords
##    7.77% num_hrefs
##    6.17% title
##    3.30% non_stop_words
##    2.03% abs_title_sentiment_polarity
##    1.77% abs_title_subjectivity
##    1.64% avg_positive_polarity
##    1.05% content
##    0.53% title_subjectivity
## 
## 
## Time: 0.1 secs
dt_pred <- predict(dt_model, news_test)
library('gmodels')
CrossTable(news_test$shares, dt_pred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('actual', 'prediction'))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  31716 
## 
##  
##              | prediction 
##       actual |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##            0 |      9400 |      6690 |     16090 | 
##              |     0.296 |     0.211 |           | 
## -------------|-----------|-----------|-----------|
##            1 |      6853 |      8773 |     15626 | 
##              |     0.216 |     0.277 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |     16253 |     15463 |     31716 | 
## -------------|-----------|-----------|-----------|
## 
## 
(p4 <- table(dt_pred, news_test$shares))
##        
## dt_pred    0    1
##       0 9400 6853
##       1 6690 8773
(Accuracy <- sum(diag(p4))/sum(p4)*100)
## [1] 57.29916
#The decision tree reaches 57.3%, so the two models give essentially the same prediction accuracy on this data.