Credit Data
Step 1: Loading Data into R
library(readr)
creditData <- read.csv("C:/Users/gmutya048/Downloads/creditData.csv")
str(creditData)
## 'data.frame': 1000 obs. of 21 variables:
## $ Creditability : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Account.Balance : int 1 1 2 1 1 1 1 1 4 2 ...
## $ Duration.of.Credit..month. : int 18 9 12 12 12 10 8 6 18 24 ...
## $ Payment.Status.of.Previous.Credit: int 4 4 2 4 4 4 4 4 4 2 ...
## $ Purpose : int 2 0 9 0 0 0 0 0 3 3 ...
## $ Credit.Amount : int 1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
## $ Value.Savings.Stocks : int 1 1 2 1 1 1 1 1 1 3 ...
## $ Length.of.current.employment : int 2 3 4 3 3 2 4 2 1 1 ...
## $ Instalment.per.cent : int 4 2 2 3 4 1 1 2 4 1 ...
## $ Sex...Marital.Status : int 2 3 2 3 3 3 3 3 2 2 ...
## $ Guarantors : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Duration.in.Current.address : int 4 2 4 2 4 3 4 4 4 4 ...
## $ Most.valuable.available.asset : int 2 1 1 1 2 1 1 1 3 4 ...
## $ Age..years. : int 21 36 23 39 38 48 39 40 65 23 ...
## $ Concurrent.Credits : int 3 3 3 3 1 3 3 3 3 3 ...
## $ Type.of.apartment : int 1 1 1 1 2 1 2 2 2 1 ...
## $ No.of.Credits.at.this.Bank : int 1 2 1 2 2 2 2 1 2 1 ...
## $ Occupation : int 3 3 2 2 2 2 2 2 1 1 ...
## $ No.of.dependents : int 1 2 1 2 1 2 1 2 1 1 ...
## $ Telephone : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Foreign.Worker : int 1 1 1 2 2 2 2 2 1 1 ...
Step 2: Data Preparation
creditData$Creditability <- as.factor(creditData$Creditability)
sum(is.na(creditData))
## [1] 0
Step 3: Split the data into training and test datasets
set.seed(12345)
credit_rand <- creditData[order(runif(1000)), ]
credit_train <- credit_rand[1:750, ]
credit_test <- credit_rand[751:1000, ]
prop.table(table(credit_train$Creditability))
##
## 0 1
## 0.3146667 0.6853333
prop.table(table(credit_test$Creditability))
##
## 0 1
## 0.256 0.744
Step 4: Model Design
library(naivebayes)
## naivebayes 0.9.7 loaded
naive_model <- naive_bayes(Creditability ~ ., data= credit_train)
naive_model
##
## ================================== Naive Bayes ==================================
##
## Call:
## naive_bayes.formula(formula = Creditability ~ ., data = credit_train)
##
## ---------------------------------------------------------------------------------
##
## Laplace smoothing: 0
##
## ---------------------------------------------------------------------------------
##
## A priori probabilities:
##
## 0 1
## 0.3146667 0.6853333
##
## ---------------------------------------------------------------------------------
##
## Tables:
##
## ---------------------------------------------------------------------------------
## ::: Account.Balance (Gaussian)
## ---------------------------------------------------------------------------------
##
## Account.Balance 0 1
## mean 1.923729 2.793774
## sd 1.036826 1.252008
##
## ---------------------------------------------------------------------------------
## ::: Duration.of.Credit..month. (Gaussian)
## ---------------------------------------------------------------------------------
##
## Duration.of.Credit..month. 0 1
## mean 24.46610 19.20039
## sd 13.82208 11.13433
##
## ---------------------------------------------------------------------------------
## ::: Payment.Status.of.Previous.Credit (Gaussian)
## ---------------------------------------------------------------------------------
##
## Payment.Status.of.Previous.Credit 0 1
## mean 2.161017 2.665370
## sd 1.071649 1.045219
##
## ---------------------------------------------------------------------------------
## ::: Purpose (Gaussian)
## ---------------------------------------------------------------------------------
##
## Purpose 0 1
## mean 2.927966 2.803502
## sd 2.944722 2.633253
##
## ---------------------------------------------------------------------------------
## ::: Credit.Amount (Gaussian)
## ---------------------------------------------------------------------------------
##
## Credit.Amount 0 1
## mean 3964.195 2984.177
## sd 3597.093 2379.685
##
## ---------------------------------------------------------------------------------
##
## # ... and 15 more tables
##
## ---------------------------------------------------------------------------------
step 5: Evaluate the Model
(conf_nat <- table(predict(naive_model, credit_test), credit_test$Creditability))
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
##
## 0 1
## 0 42 35
## 1 22 151
Accuracy in percentage
(Accuracy <- sum(diag(conf_nat))/sum(conf_nat)*100)
## [1] 77.2
Lab- Part 2 Model Performance improvement
Step2: Data Exploration
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
creditDataScaled <- scale(credit_rand[,2:ncol(credit_rand)], center=TRUE, scale = TRUE)
m <- cor(creditDataScaled)
(highlycor <- findCorrelation(m, 0.30))
## [1] 5 12 19 15 3
Removing highly correlated data and divide into train and test sets
filteredData <- credit_rand[, -(highlycor[4]+1)]
filteredTraining <- filteredData[1:750, ]
filteredTest <- filteredData[751:1000, ]
Step 3: Training Data
nb_model <- naive_bayes(Creditability ~ ., data=filteredTraining)
step 4:Model Evaluation
filteredTestPred <- predict(nb_model, newdata = filteredTest)
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
table(filteredTestPred, filteredTest$Creditability)
##
## filteredTestPred 0 1
## 0 41 34
## 1 23 152
step 5: Evaluate the Model
(conf_nat <- table(filteredTestPred, filteredTest$Creditability))
##
## filteredTestPred 0 1
## 0 41 34
## 1 23 152
Accuracy in percentage
(Accuracy <- sum(diag(conf_nat))/sum(conf_nat)*100)
## [1] 77.2
Part 3- Support Vector Machine
Letters Data
Step 1: Loading Data into R
library(readr)
letters <- read.csv("C:/Users/gmutya048/Downloads/letterdata.csv",stringsAsFactors=TRUE)
Step 2: Data preparation
str(letters)
## 'data.frame': 20000 obs. of 17 variables:
## $ letter: Factor w/ 26 levels "A","B","C","D",..: 20 9 4 14 7 19 2 1 10 13 ...
## $ xbox : int 2 5 4 7 2 4 4 1 2 11 ...
## $ ybox : int 8 12 11 11 1 11 2 1 2 15 ...
## $ width : int 3 3 6 6 3 5 5 3 4 13 ...
## $ height: int 5 7 8 6 1 8 4 2 4 9 ...
## $ onpix : int 1 2 6 3 1 3 4 1 2 7 ...
## $ xbar : int 8 10 10 5 8 8 8 8 10 13 ...
## $ ybar : int 13 5 6 9 6 8 7 2 6 2 ...
## $ x2bar : int 0 5 2 4 6 6 6 2 2 6 ...
## $ y2bar : int 6 4 6 6 6 9 6 2 6 2 ...
## $ xybar : int 6 13 10 4 6 5 7 8 12 12 ...
## $ x2ybar: int 10 3 3 4 5 6 6 2 4 1 ...
## $ xy2bar: int 8 9 7 10 9 6 6 8 8 9 ...
## $ xedge : int 0 2 3 6 1 0 2 1 1 8 ...
## $ xedgey: int 8 8 7 10 7 8 8 6 6 1 ...
## $ yedge : int 0 4 3 2 5 9 7 2 1 1 ...
## $ yedgex: int 8 10 9 8 10 7 10 7 7 8 ...
Step 3: Split the data into training and test datasets
letters_train <- letters[1:18000,]
letters_test <- letters[18001:20000,]
step 4: Training the model on Data
library(kernlab)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
letter_classifier<-ksvm(letter ~ ., data=letters_train, kernel="vanilladot")
## Setting default kernel parameters
summary(letter_classifier)
## Length Class Mode
## 1 ksvm S4
step 5: Evaluate the Model
letter_predictions <- predict(letter_classifier, letters_test)
(p <- table(letter_predictions,letters_test$letter))
##
## letter_predictions A B C D E F G H I J K L M N O P Q R S T
## A 73 0 0 0 0 0 0 0 0 1 0 0 0 0 3 0 4 0 0 1
## B 0 61 0 3 2 0 1 1 0 0 1 1 0 0 0 2 0 1 3 0
## C 0 0 64 0 2 0 4 2 1 0 1 2 0 0 1 0 0 0 0 0
## D 2 1 0 67 0 0 1 3 3 2 1 2 0 3 4 2 1 2 0 0
## E 0 0 1 0 64 1 1 0 0 0 2 2 0 0 0 0 2 0 6 0
## F 0 0 0 0 0 70 1 1 4 0 0 0 0 0 0 5 1 0 2 0
## G 1 1 2 1 3 2 68 1 0 0 0 1 0 0 0 0 4 1 3 2
## H 0 0 0 1 0 1 0 46 0 2 3 1 1 1 9 0 0 5 0 3
## I 0 0 0 0 0 0 0 0 65 3 0 0 0 0 0 0 0 0 2 0
## J 0 1 0 0 0 1 0 0 3 61 0 0 0 0 1 0 0 0 1 0
## K 0 1 4 0 0 0 0 5 0 0 56 0 0 2 0 0 0 4 0 1
## L 0 0 0 0 1 0 0 1 0 0 0 63 0 0 0 0 0 0 0 0
## M 0 0 1 0 0 0 1 0 0 0 0 0 70 2 0 0 0 0 0 0
## N 0 0 0 0 0 0 0 0 0 0 0 0 0 77 0 0 0 1 0 0
## O 0 0 1 1 0 0 0 1 0 1 0 0 0 0 49 1 2 0 0 0
## P 0 0 0 0 0 3 0 0 0 0 0 0 0 0 2 69 0 0 0 0
## Q 0 0 0 0 0 0 3 1 0 0 0 2 0 0 2 1 52 0 1 0
## R 0 4 0 0 1 0 0 3 0 0 3 0 0 0 1 0 0 64 0 1
## S 0 1 0 0 1 1 1 0 1 1 0 0 0 0 0 0 6 0 47 1
## T 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 1 83
## U 0 0 2 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## V 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0
## W 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## X 0 1 0 0 1 0 0 1 0 0 1 4 0 0 0 0 0 1 0 0
## Y 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 0 0 0 1
## Z 1 0 0 0 2 0 0 0 0 2 0 0 0 0 0 0 0 0 5 1
##
## letter_predictions U V W X Y Z
## A 2 0 1 0 0 0
## B 0 0 0 0 0 0
## C 0 0 0 0 0 0
## D 0 0 0 0 1 0
## E 0 0 0 1 0 0
## F 0 1 0 0 2 0
## G 0 0 0 0 0 0
## H 0 2 0 0 1 0
## I 0 0 0 2 1 0
## J 0 0 0 1 0 4
## K 2 0 0 4 0 0
## L 0 0 0 0 0 0
## M 1 0 6 0 0 0
## N 1 0 2 0 0 0
## O 1 0 0 0 0 0
## P 0 0 0 0 1 0
## Q 0 0 0 0 0 0
## R 0 1 0 0 0 0
## S 0 0 0 1 0 6
## T 1 0 0 0 2 2
## U 83 0 0 0 0 0
## V 0 64 1 0 1 0
## W 0 3 59 0 0 0
## X 0 0 0 76 1 0
## Y 0 0 0 1 58 0
## Z 0 0 0 0 0 70
Accuracy in percentage
(Accuracy <- sum(diag(p))/sum(p)*100)
## [1] 83.95
News Popularity Data
Step1: Loading data into r
library(readr)
news<- read.csv("C:/Users/gmutya048/Downloads/OnlineNewsPopularity_for_R.csv")
str(news)
## 'data.frame': 39644 obs. of 61 variables:
## $ url : chr "http://mashable.com/2013/01/07/amazon-instant-video-browser/" "http://mashable.com/2013/01/07/ap-samsung-sponsored-tweets/" "http://mashable.com/2013/01/07/apple-40-billion-app-downloads/" "http://mashable.com/2013/01/07/astronaut-notre-dame-bcs/" ...
## $ timedelta : num 731 731 731 731 731 731 731 731 731 731 ...
## $ n_tokens_title : num 12 9 9 9 13 10 8 12 11 10 ...
## $ n_tokens_content : num 219 255 211 531 1072 ...
## $ n_unique_tokens : num 0.664 0.605 0.575 0.504 0.416 ...
## $ n_non_stop_words : num 1 1 1 1 1 ...
## $ n_non_stop_unique_tokens : num 0.815 0.792 0.664 0.666 0.541 ...
## $ num_hrefs : num 4 3 3 9 19 2 21 20 2 4 ...
## $ num_self_hrefs : num 2 1 1 0 19 2 20 20 0 1 ...
## $ num_imgs : num 1 1 1 1 20 0 20 20 0 1 ...
## $ num_videos : num 0 0 0 0 0 0 0 0 0 1 ...
## $ average_token_length : num 4.68 4.91 4.39 4.4 4.68 ...
## $ num_keywords : num 5 4 6 7 7 9 10 9 7 5 ...
## $ data_channel_is_lifestyle : num 0 0 0 0 0 0 1 0 0 0 ...
## $ data_channel_is_entertainment: num 1 0 0 1 0 0 0 0 0 0 ...
## $ data_channel_is_bus : num 0 1 1 0 0 0 0 0 0 0 ...
## $ data_channel_is_socmed : num 0 0 0 0 0 0 0 0 0 0 ...
## $ data_channel_is_tech : num 0 0 0 0 1 1 0 1 1 0 ...
## $ data_channel_is_world : num 0 0 0 0 0 0 0 0 0 1 ...
## $ kw_min_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ self_reference_min_shares : num 496 0 918 0 545 8500 545 545 0 0 ...
## $ self_reference_max_shares : num 496 0 918 0 16000 8500 16000 16000 0 0 ...
## $ self_reference_avg_sharess : num 496 0 918 0 3151 ...
## $ weekday_is_monday : num 1 1 1 1 1 1 1 1 1 1 ...
## $ weekday_is_tuesday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_wednesday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_thursday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_friday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_saturday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_sunday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ is_weekend : num 0 0 0 0 0 0 0 0 0 0 ...
## $ LDA_00 : num 0.5003 0.7998 0.2178 0.0286 0.0286 ...
## $ LDA_01 : num 0.3783 0.05 0.0333 0.4193 0.0288 ...
## $ LDA_02 : num 0.04 0.0501 0.0334 0.4947 0.0286 ...
## $ LDA_03 : num 0.0413 0.0501 0.0333 0.0289 0.0286 ...
## $ LDA_04 : num 0.0401 0.05 0.6822 0.0286 0.8854 ...
## $ global_subjectivity : num 0.522 0.341 0.702 0.43 0.514 ...
## $ global_sentiment_polarity : num 0.0926 0.1489 0.3233 0.1007 0.281 ...
## $ global_rate_positive_words : num 0.0457 0.0431 0.0569 0.0414 0.0746 ...
## $ global_rate_negative_words : num 0.0137 0.01569 0.00948 0.02072 0.01213 ...
## $ rate_positive_words : num 0.769 0.733 0.857 0.667 0.86 ...
## $ rate_negative_words : num 0.231 0.267 0.143 0.333 0.14 ...
## $ avg_positive_polarity : num 0.379 0.287 0.496 0.386 0.411 ...
## $ min_positive_polarity : num 0.1 0.0333 0.1 0.1364 0.0333 ...
## $ max_positive_polarity : num 0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
## $ avg_negative_polarity : num -0.35 -0.119 -0.467 -0.37 -0.22 ...
## $ min_negative_polarity : num -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
## $ max_negative_polarity : num -0.2 -0.1 -0.133 -0.167 -0.05 ...
## $ title_subjectivity : num 0.5 0 0 0 0.455 ...
## $ title_sentiment_polarity : num -0.188 0 0 0 0.136 ...
## $ abs_title_subjectivity : num 0 0.5 0.5 0.5 0.0455 ...
## $ abs_title_sentiment_polarity : num 0.188 0 0 0 0.136 ...
## $ shares : int 593 711 1500 1200 505 855 556 891 3600 710 ...
Step2: Data Preparation
newsShort <- data.frame(news$n_tokens_title, news$n_tokens_content, news$n_unique_tokens, news$n_non_stop_words, news$num_hrefs, news$num_imgs, news$num_videos, news$average_token_length, news$num_keywords, news$kw_max_max, news$global_sentiment_polarity, news$avg_positive_polarity, news$title_subjectivity, news$title_sentiment_polarity, news$abs_title_subjectivity, news$abs_title_sentiment_polarity, news$shares)
colnames(newsShort) <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")
newsShort$popular = rep('na', nrow(newsShort))
for(i in 1:39644) {
if(newsShort$shares[i] >= 1400) {
newsShort$popular[i] = "yes"}
else {newsShort$popular[i] = "no"}
}
newsShort$shares = newsShort$popular
newsShort$shares <- as.factor(newsShort$shares)
news_rand <- newsShort[order(runif(10000)), ]
set.seed(12345)
Step 3: Split the data into training and test datasets
news_train <- news_rand[1:9000, ]
news_test <- news_rand[9001:10000, ]
Step 4: Model Design
nb_model <- naive_bayes(shares ~ ., data=news_train)
## Warning: naive_bayes(): Feature popular - zero probabilities are present.
## Consider Laplace smoothing.
nb_model
##
## ================================== Naive Bayes ==================================
##
## Call:
## naive_bayes.formula(formula = shares ~ ., data = news_train)
##
## ---------------------------------------------------------------------------------
##
## Laplace smoothing: 0
##
## ---------------------------------------------------------------------------------
##
## A priori probabilities:
##
## no yes
## 0.4291111 0.5708889
##
## ---------------------------------------------------------------------------------
##
## Tables:
##
## ---------------------------------------------------------------------------------
## ::: n_tokens_title (Gaussian)
## ---------------------------------------------------------------------------------
##
## n_tokens_title no yes
## mean 9.820559 9.695991
## sd 1.929249 1.987754
##
## ---------------------------------------------------------------------------------
## ::: n_tokens_content (Gaussian)
## ---------------------------------------------------------------------------------
##
## n_tokens_content no yes
## mean 452.2315 515.1051
## sd 347.1779 450.0206
##
## ---------------------------------------------------------------------------------
## ::: n_unique_tokens (Gaussian)
## ---------------------------------------------------------------------------------
##
## n_unique_tokens no yes
## mean 0.5702437 0.5542023
## sd 0.1127776 0.1232687
##
## ---------------------------------------------------------------------------------
## ::: n_non_stop_words (Gaussian)
## ---------------------------------------------------------------------------------
##
## n_non_stop_words no yes
## mean 0.99404453 0.99124172
## sd 0.07695147 0.09318398
##
## ---------------------------------------------------------------------------------
## ::: num_hrefs (Gaussian)
## ---------------------------------------------------------------------------------
##
## num_hrefs no yes
## mean 9.147851 10.570650
## sd 8.644083 11.540711
##
## ---------------------------------------------------------------------------------
##
## # ... and 12 more tables
##
## ---------------------------------------------------------------------------------
step 5: Evaluate the Model
news_Pred <- predict(nb_model, newdata = news_test)
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
(conf_nat <- table(news_Pred, news_test$shares))
##
## news_Pred no yes
## no 419 0
## yes 11 570
Accuracy in percentage
(Accuracy <- sum(diag(conf_nat))/sum(conf_nat)*100)
## [1] 98.9