credit <- read.csv("~/530/Lab1/credit.csv")
str(credit)
## 'data.frame': 1000 obs. of 21 variables:
## $ Creditability : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Account.Balance : int 1 1 2 1 1 1 1 1 4 2 ...
## $ Duration.of.Credit..month. : int 18 9 12 12 12 10 8 6 18 24 ...
## $ Payment.Status.of.Previous.Credit: int 4 4 2 4 4 4 4 4 4 2 ...
## $ Purpose : int 2 0 9 0 0 0 0 0 3 3 ...
## $ Credit.Amount : int 1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
## $ Value.Savings.Stocks : int 1 1 2 1 1 1 1 1 1 3 ...
## $ Length.of.current.employment : int 2 3 4 3 3 2 4 2 1 1 ...
## $ Instalment.per.cent : int 4 2 2 3 4 1 1 2 4 1 ...
## $ Sex...Marital.Status : int 2 3 2 3 3 3 3 3 2 2 ...
## $ Guarantors : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Duration.in.Current.address : int 4 2 4 2 4 3 4 4 4 4 ...
## $ Most.valuable.available.asset : int 2 1 1 1 2 1 1 1 3 4 ...
## $ Age..years. : int 21 36 23 39 38 48 39 40 65 23 ...
## $ Concurrent.Credits : int 3 3 3 3 1 3 3 3 3 3 ...
## $ Type.of.apartment : int 1 1 1 1 2 1 2 2 2 1 ...
## $ No.of.Credits.at.this.Bank : int 1 2 1 2 2 2 2 1 2 1 ...
## $ Occupation : int 3 3 2 2 2 2 2 2 1 1 ...
## $ No.of.dependents : int 1 2 1 2 1 2 1 2 1 1 ...
## $ Telephone : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Foreign.Worker : int 1 1 1 2 2 2 2 2 1 1 ...
summary(credit$`Credit Amount`)
## Length Class Mode
## 0 NULL NULL
table(credit$Creditability)
##
## 0 1
## 300 700
set.seed(12350)
credit_rand <- credit[order(runif(1000)), ]
summary(credit$`Credit Amount`)
## Length Class Mode
## 0 NULL NULL
train_set<-credit_rand[1:900,]
test_set<-credit_rand[901:1000,]
prop.table(table(train_set$'Creditability'))
##
## 0 1
## 0.3011111 0.6988889
prop.table(table(test_set$'Creditability'))
##
## 0 1
## 0.29 0.71
library('C50')
train_set$'Creditability'<-as.factor(train_set$'Creditability')
train_model <- C5.0(x = train_set[-1], y = train_set$'Creditability')
train_model
##
## Call:
## C5.0.default(x = train_set[-1], y = train_set$Creditability)
##
## Classification Tree
## Number of samples: 900
## Number of predictors: 20
##
## Tree size: 77
##
## Non-standard options: attempt to group attributes
cred_pred <- predict(train_model, test_set)
library('gmodels')
CrossTable(test_set$'Creditability', cred_pred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('Actual Creditability', 'Predicted Creditability'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 100
##
##
## | Predicted Creditability
## Actual Creditability | 0 | 1 | Row Total |
## ---------------------|-----------|-----------|-----------|
## 0 | 18 | 11 | 29 |
## | 0.180 | 0.110 | |
## ---------------------|-----------|-----------|-----------|
## 1 | 17 | 54 | 71 |
## | 0.170 | 0.540 | |
## ---------------------|-----------|-----------|-----------|
## Column Total | 35 | 65 | 100 |
## ---------------------|-----------|-----------|-----------|
##
##
#Q1- If you see an accuracy of 100%, what does it mean? Does this mean that we design a perfect model? This is some thing that needs more discussion. Write a few sentences about accuracy of 100%.
It probably means that your test set is part of your training set.The decision tree is prone to overfit. One should always use a completed independent test data set from training set to gain real prediction for unkown values.
library('randomForest')
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
credit <- data.frame(credit)
set.seed(12350)
credit_rand <- credit[order(runif(1000)), ]
train_set<-credit_rand[1:900,]
test_set<-credit_rand[901:1000,]
train_set$'Creditability' <- as.factor(train_set$'Creditability')
random_model <- randomForest(Creditability~., data= train_set)
summary(random_model)
## Length Class Mode
## call 3 -none- call
## type 1 -none- character
## predicted 900 factor numeric
## err.rate 1500 -none- numeric
## confusion 6 -none- numeric
## votes 1800 matrix numeric
## oob.times 900 -none- numeric
## classes 2 -none- character
## importance 20 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 14 -none- list
## y 900 factor numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
cred_pred <- predict(random_model, test_set)
(p <- table(cred_pred, test_set$Creditability))
##
## cred_pred 0 1
## 0 19 10
## 1 10 61
(Accuracy <- sum(diag(p))/sum(p)*100)
## [1] 80
importance(random_model)
## MeanDecreaseGini
## Account.Balance 41.263854
## Duration.of.Credit..month. 38.529634
## Payment.Status.of.Previous.Credit 20.982267
## Purpose 23.090131
## Credit.Amount 50.535178
## Value.Savings.Stocks 18.545158
## Length.of.current.employment 20.033049
## Instalment.per.cent 15.897317
## Sex...Marital.Status 13.698222
## Guarantors 7.472563
## Duration.in.Current.address 15.453941
## Most.valuable.available.asset 17.345769
## Age..years. 38.450508
## Concurrent.Credits 9.217700
## Type.of.apartment 9.726398
## No.of.Credits.at.this.Bank 8.048862
## Occupation 12.256265
## No.of.dependents 5.369362
## Telephone 7.450824
## Foreign.Worker 1.616620
#Credit.Amount 50.535178
#Account.Balance 41.263854
#Duration.of.Credit..month. 38.529634
set.seed(23458)
credit_rand1 <- credit[order(runif(1000)), ]
train_set1<-credit_rand1[1:900,]
test_set1<-credit_rand1[901:1000,]
train_set1$'Creditability' <- as.factor(train_set1$'Creditability')
random_model1 <- randomForest(Creditability~., data= train_set1)
cred_pred1 <- predict(random_model, test_set1)
(p1 <- table(cred_pred1, test_set1$Creditability))
##
## cred_pred1 0 1
## 0 28 3
## 1 3 66
(Accuracy <- sum(diag(p1))/sum(p1)*100)
## [1] 94
# New accuracy is 94 which is better than previous value 80
wine <- read.csv("~/530/Lab1/whitewines.csv")
str(wine)
## 'data.frame': 4898 obs. of 12 variables:
## $ fixed.acidity : num 6.7 5.7 5.9 5.3 6.4 7 7.9 6.6 7 6.5 ...
## $ volatile.acidity : num 0.62 0.22 0.19 0.47 0.29 0.14 0.12 0.38 0.16 0.37 ...
## $ citric.acid : num 0.24 0.2 0.26 0.1 0.21 0.41 0.49 0.28 0.3 0.33 ...
## $ residual.sugar : num 1.1 16 7.4 1.3 9.65 0.9 5.2 2.8 2.6 3.9 ...
## $ chlorides : num 0.039 0.044 0.034 0.036 0.041 0.037 0.049 0.043 0.043 0.027 ...
## $ free.sulfur.dioxide : num 6 41 33 11 36 22 33 17 34 40 ...
## $ total.sulfur.dioxide: num 62 113 123 74 119 95 152 67 90 130 ...
## $ density : num 0.993 0.999 0.995 0.991 0.993 ...
## $ pH : num 3.41 3.22 3.49 3.48 2.99 3.25 3.18 3.21 2.88 3.28 ...
## $ sulphates : num 0.32 0.46 0.42 0.54 0.34 0.43 0.47 0.47 0.47 0.39 ...
## $ alcohol : num 10.4 8.9 10.1 11.2 10.9 ...
## $ quality : int 5 6 6 4 6 6 6 6 6 7 ...
hist(wine$quality)
wine_train <- wine[1:3750, ]
wine_test <- wine[3751:4898, ]
library('rpart')
m.rpart <- rpart(quality ~ ., data=wine_train)
m.rpart
## n= 3750
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 3750 2945.53200 5.870933
## 2) alcohol< 10.85 2372 1418.86100 5.604975
## 4) volatile.acidity>=0.2275 1611 821.30730 5.432030
## 8) volatile.acidity>=0.3025 688 278.97670 5.255814 *
## 9) volatile.acidity< 0.3025 923 505.04230 5.563380 *
## 5) volatile.acidity< 0.2275 761 447.36400 5.971091 *
## 3) alcohol>=10.85 1378 1070.08200 6.328737
## 6) free.sulfur.dioxide< 10.5 84 95.55952 5.369048 *
## 7) free.sulfur.dioxide>=10.5 1294 892.13600 6.391036
## 14) alcohol< 11.76667 629 430.11130 6.173291
## 28) volatile.acidity>=0.465 11 10.72727 4.545455 *
## 29) volatile.acidity< 0.465 618 389.71680 6.202265 *
## 15) alcohol>=11.76667 665 403.99400 6.596992 *
library('rpart.plot')
rpart.plot(m.rpart, digits=3)
rpart.plot(m.rpart, digits=4, fallen.leaves = TRUE, type = 3, extra = 101)
p.rpart<- predict(m.rpart, wine_test)
summary(p.rpart)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.545 5.563 5.971 5.893 6.202 6.597
summary(wine_test$quality)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 5.000 6.000 5.901 6.000 9.000
cor(p.rpart, wine_test$quality)
## [1] 0.5369525
#What is your interpretation about this amount of RMSE?
#The lower the RMSE value is, the better fit the model is. 0.84 indicates a poor fit of our model for prediction.
news <- read.csv("~/530/Lab1/OnlineNewsPopularity_for_R.csv")
#remove first two columns which are non prdictive variables
news <- news[,-(1:2)]
View(news)
#select only useful data
news1 <- data.frame(news$n_tokens_title, news$n_tokens_content, news$n_unique_tokens, news$n_non_stop_words, news$num_hrefs, news$num_imgs, news$num_videos, news$num_keywords, news$kw_max_max, news$global_sentiment_polarity, news$avg_positive_polarity, news$title_subjectivity, news$title_sentiment_polarity, news$abs_title_subjectivity, news$abs_title_sentiment_polarity, news$shares)
#rename the columns
colnames(news1) <- c("title", "content", "unique_tokens", "non_stop_words", "num_hrefs", "num_imgs", "num_videos", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")
#1400 to be a popular article
news1$shares <- as.factor(ifelse(news1$shares > 1400,1,0))
set.seed(12345)
news_rand <- news1[order(runif(39643)), ]
news_train <- news_rand[1:7928, ]
news_test <- news_rand[7928:39643, ]
news_train$shares <- as.factor(news_train$shares)
random_News <- randomForest(shares ~ . , data= news_train)
news_pred_RF <- predict(random_News, news_test)
(p3 <- table(news_pred_RF, news_test$shares))
##
## news_pred_RF 0 1
## 0 9516 6805
## 1 6574 8821
(Accuracy <- sum(diag(p3))/sum(p3)*100)
## [1] 57.81624
#57.8% it's okay but not ideal
importance(random_News)
## MeanDecreaseGini
## title 239.5514
## content 387.3460
## unique_tokens 425.9253
## non_stop_words 383.3808
## num_hrefs 301.2238
## num_imgs 210.4306
## num_videos 119.6143
## num_keywords 204.3838
## kw_max_max 107.5791
## global_sentiment_polarity 451.8444
## avg_positive_polarity 412.4018
## title_subjectivity 167.8869
## title_sentiment_polarity 178.5317
## abs_title_subjectivity 151.4992
## abs_title_sentiment_polarity 147.9789
# top three important features:
#Global sentiment polarity,number of unique tokens,avg_positive_polarity
dt_model <- C5.0(news_train[-16], news_train$shares)
summary(dt_model)
##
## Call:
## C5.0.default(x = news_train[-16], y = news_train$shares)
##
##
## C5.0 [Release 2.07 GPL Edition] Sun Sep 20 16:11:05 2020
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 7928 cases (16 attributes) from undefined.data
##
## Decision tree:
##
## num_imgs > 3:
## :...num_keywords > 4: 1 (1977/769)
## : num_keywords <= 4:
## : :...avg_positive_polarity <= 0.235552: 1 (5)
## : avg_positive_polarity > 0.235552: 0 (85/28)
## num_imgs <= 3:
## :...unique_tokens <= 0.4312321:
## :...num_keywords > 9: 1 (105/33)
## : num_keywords <= 9:
## : :...kw_max_max <= 690400: 1 (75/19)
## : kw_max_max > 690400:
## : :...num_videos > 2:
## : :...avg_positive_polarity <= 0.2989583: 1 (4)
## : : avg_positive_polarity > 0.2989583: 0 (19/2)
## : num_videos <= 2:
## : :...global_sentiment_polarity > 0.121371:
## : :...num_hrefs <= 4: 0 (8)
## : : num_hrefs > 4: 1 (123/35)
## : global_sentiment_polarity <= 0.121371:
## : :...non_stop_words <= 0: 1 (145/64)
## : non_stop_words > 0:
## : :...global_sentiment_polarity <= -0.005681818: 0 (9)
## : global_sentiment_polarity > -0.005681818:
## : :...title > 13: 0 (14/2)
## : title <= 13:
## : :...num_keywords <= 4: 1 (11/2)
## : num_keywords > 4:
## : :...num_videos <= 1: 0 (76/32)
## : num_videos > 1: 1 (7/2)
## unique_tokens > 0.4312321:
## :...kw_max_max <= 617900: 1 (790/358)
## kw_max_max > 617900:
## :...global_sentiment_polarity <= 0.08353174:
## :...num_imgs > 0:
## : :...num_imgs <= 2: 0 (1207/405)
## : : num_imgs > 2:
## : : :...num_keywords <= 6:
## : : :...num_videos <= 0: 0 (23/4)
## : : : num_videos > 0: 1 (11/4)
## : : num_keywords > 6:
## : : :...content <= 1038: 1 (43/15)
## : : content > 1038: 0 (4)
## : num_imgs <= 0:
## : :...title <= 9: 0 (96/30)
## : title > 9:
## : :...kw_max_max <= 690400:
## : :...num_videos <= 1: 1 (15/3)
## : : num_videos > 1: 0 (11/4)
## : kw_max_max > 690400:
## : :...num_keywords > 7:
## : :...title <= 11: 0 (42/20)
## : : title > 11:
## : : :...abs_title_sentiment_polarity <= 0.2467532: 1 (34/7)
## : : abs_title_sentiment_polarity > 0.2467532: 0 (10/2)
## : num_keywords <= 7:
## : :...title > 11: 0 (53/17)
## : title <= 11:
## : :...num_hrefs > 13: 1 (4)
## : num_hrefs <= 13:
## : :...title <= 10: 0 (30/10)
## : title > 10:
## : :...title_subjectivity <= 0.6333333: 1 (37/15)
## : title_subjectivity > 0.6333333: 0 (5)
## global_sentiment_polarity > 0.08353174:
## :...num_videos > 2: 1 (207/78)
## num_videos <= 2:
## :...num_imgs > 1:
## :...num_videos <= 0: 0 (292/143)
## : num_videos > 0:
## : :...abs_title_sentiment_polarity <= 0.1305556: 1 (73/19)
## : abs_title_sentiment_polarity > 0.1305556:
## : :...title <= 9: 1 (6)
## : title > 9: 0 (38/13)
## num_imgs <= 1:
## :...kw_max_max > 690400: 0 (1825/781)
## kw_max_max <= 690400:
## :...num_hrefs > 11: 1 (88/32)
## num_hrefs <= 11:
## :...num_videos <= 0:
## :...num_imgs > 0: 0 (181/67)
## : num_imgs <= 0:
## : :...abs_title_subjectivity <= 0.1583333: 0 (21/2)
## : abs_title_subjectivity > 0.1583333: 1 (38/12)
## num_videos > 0:
## :...abs_title_subjectivity <= 0.08522727: 1 (7)
## abs_title_subjectivity > 0.08522727:
## :...num_videos > 1: 1 (8/2)
## num_videos <= 1:
## :...num_keywords > 8: 0 (9/1)
## num_keywords <= 8:
## :...num_imgs > 0: [S1]
## num_imgs <= 0:
## :...num_keywords > 7: 0 (4)
## num_keywords <= 7:
## :...content <= 171: 1 (19/4)
## content > 171: 0 (17/5)
##
## SubTree [S1]
##
## avg_positive_polarity <= 0.3797951: 1 (13/3)
## avg_positive_polarity > 0.3797951: 0 (4)
##
##
## Evaluation on training data (7928 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 50 3044(38.4%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 2515 1476 (a): class 0
## 1568 2369 (b): class 1
##
##
## Attribute usage:
##
## 100.00% num_imgs
## 73.93% unique_tokens
## 72.60% kw_max_max
## 61.40% global_sentiment_polarity
## 41.95% num_videos
## 38.16% num_keywords
## 7.77% num_hrefs
## 6.17% title
## 3.30% non_stop_words
## 2.03% abs_title_sentiment_polarity
## 1.77% abs_title_subjectivity
## 1.64% avg_positive_polarity
## 1.05% content
## 0.53% title_subjectivity
##
##
## Time: 0.1 secs
dt_pred <- predict(dt_model, news_test)
library('gmodels')
CrossTable(news_test$shares, dt_pred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('actual', 'prediction'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 31716
##
##
## | prediction
## actual | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 9400 | 6690 | 16090 |
## | 0.296 | 0.211 | |
## -------------|-----------|-----------|-----------|
## 1 | 6853 | 8773 | 15626 |
## | 0.216 | 0.277 | |
## -------------|-----------|-----------|-----------|
## Column Total | 16253 | 15463 | 31716 |
## -------------|-----------|-----------|-----------|
##
##
(p4 <- table(dt_pred, news_test$shares))
##
## dt_pred 0 1
## 0 9400 6853
## 1 6690 8773
(Accuracy <- sum(diag(p4))/sum(p4)*100)
## [1] 57.29916
#we got 57.3% from decision tree so these two models basically have the same prediction accuracy.