ANLY 530 Lab 2

Credit Data

Step 1: Loading Data into R

library(readr)
creditData <- read.csv("C:/Users/gmutya048/Downloads/creditData.csv")
str(creditData)

## 'data.frame':    1000 obs. of  21 variables:
##  $ Creditability                    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Account.Balance                  : int  1 1 2 1 1 1 1 1 4 2 ...
##  $ Duration.of.Credit..month.       : int  18 9 12 12 12 10 8 6 18 24 ...
##  $ Payment.Status.of.Previous.Credit: int  4 4 2 4 4 4 4 4 4 2 ...
##  $ Purpose                          : int  2 0 9 0 0 0 0 0 3 3 ...
##  $ Credit.Amount                    : int  1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
##  $ Value.Savings.Stocks             : int  1 1 2 1 1 1 1 1 1 3 ...
##  $ Length.of.current.employment     : int  2 3 4 3 3 2 4 2 1 1 ...
##  $ Instalment.per.cent              : int  4 2 2 3 4 1 1 2 4 1 ...
##  $ Sex...Marital.Status             : int  2 3 2 3 3 3 3 3 2 2 ...
##  $ Guarantors                       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Duration.in.Current.address      : int  4 2 4 2 4 3 4 4 4 4 ...
##  $ Most.valuable.available.asset    : int  2 1 1 1 2 1 1 1 3 4 ...
##  $ Age..years.                      : int  21 36 23 39 38 48 39 40 65 23 ...
##  $ Concurrent.Credits               : int  3 3 3 3 1 3 3 3 3 3 ...
##  $ Type.of.apartment                : int  1 1 1 1 2 1 2 2 2 1 ...
##  $ No.of.Credits.at.this.Bank       : int  1 2 1 2 2 2 2 1 2 1 ...
##  $ Occupation                       : int  3 3 2 2 2 2 2 2 1 1 ...
##  $ No.of.dependents                 : int  1 2 1 2 1 2 1 2 1 1 ...
##  $ Telephone                        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Foreign.Worker                   : int  1 1 1 2 2 2 2 2 1 1 ...

Step 2: Data Preparation

creditData$Creditability <- as.factor(creditData$Creditability)
sum(is.na(creditData))

## [1] 0

Step 3: Split the data into training and test datasets

set.seed(12345)
credit_rand <- creditData[order(runif(1000)), ]
credit_train <- credit_rand[1:750, ]
credit_test <- credit_rand[751:1000, ]

prop.table(table(credit_train$Creditability))

## 
##         0         1 
## 0.3146667 0.6853333

prop.table(table(credit_test$Creditability))

## 
##     0     1 
## 0.256 0.744

Step 4: Model Design

library(naivebayes)

## naivebayes 0.9.7 loaded

naive_model <- naive_bayes(Creditability ~ ., data= credit_train)
naive_model

## 
## ================================== Naive Bayes ================================== 
##  
##  Call: 
## naive_bayes.formula(formula = Creditability ~ ., data = credit_train)
## 
## --------------------------------------------------------------------------------- 
##  
## Laplace smoothing: 0
## 
## --------------------------------------------------------------------------------- 
##  
##  A priori probabilities: 
## 
##         0         1 
## 0.3146667 0.6853333 
## 
## --------------------------------------------------------------------------------- 
##  
##  Tables: 
## 
## --------------------------------------------------------------------------------- 
##  ::: Account.Balance (Gaussian) 
## --------------------------------------------------------------------------------- 
##                
## Account.Balance        0        1
##            mean 1.923729 2.793774
##            sd   1.036826 1.252008
## 
## --------------------------------------------------------------------------------- 
##  ::: Duration.of.Credit..month. (Gaussian) 
## --------------------------------------------------------------------------------- 
##                           
## Duration.of.Credit..month.        0        1
##                       mean 24.46610 19.20039
##                       sd   13.82208 11.13433
## 
## --------------------------------------------------------------------------------- 
##  ::: Payment.Status.of.Previous.Credit (Gaussian) 
## --------------------------------------------------------------------------------- 
##                                  
## Payment.Status.of.Previous.Credit        0        1
##                              mean 2.161017 2.665370
##                              sd   1.071649 1.045219
## 
## --------------------------------------------------------------------------------- 
##  ::: Purpose (Gaussian) 
## --------------------------------------------------------------------------------- 
##        
## Purpose        0        1
##    mean 2.927966 2.803502
##    sd   2.944722 2.633253
## 
## --------------------------------------------------------------------------------- 
##  ::: Credit.Amount (Gaussian) 
## --------------------------------------------------------------------------------- 
##              
## Credit.Amount        0        1
##          mean 3964.195 2984.177
##          sd   3597.093 2379.685
## 
## ---------------------------------------------------------------------------------
## 
## # ... and 15 more tables
## 
## ---------------------------------------------------------------------------------

step 5: Evaluate the Model

(conf_nat <- table(predict(naive_model, credit_test), credit_test$Creditability))

## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.

##    
##       0   1
##   0  42  35
##   1  22 151

Accuracy in percentage

(Accuracy <- sum(diag(conf_nat))/sum(conf_nat)*100)

## [1] 77.2

Lab- Part 2 Model Performance improvement

Step2: Data Exploration

library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

creditDataScaled <- scale(credit_rand[,2:ncol(credit_rand)], center=TRUE, scale = TRUE)
m <- cor(creditDataScaled)
(highlycor <- findCorrelation(m, 0.30))

## [1]  5 12 19 15  3

Removing highly correlated data and divide into train and test sets

filteredData <- credit_rand[, -(highlycor[4]+1)]
filteredTraining <- filteredData[1:750, ]
filteredTest <- filteredData[751:1000, ]

Step 3: Training Data

nb_model <- naive_bayes(Creditability ~ ., data=filteredTraining)

step 4:Model Evaluation

filteredTestPred <- predict(nb_model, newdata = filteredTest)

## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.

table(filteredTestPred, filteredTest$Creditability)

##                 
## filteredTestPred   0   1
##                0  41  34
##                1  23 152

step 5: Evaluate the Model

(conf_nat <- table(filteredTestPred, filteredTest$Creditability))

##                 
## filteredTestPred   0   1
##                0  41  34
##                1  23 152

Accuracy in percentage

(Accuracy <- sum(diag(conf_nat))/sum(conf_nat)*100)

## [1] 77.2

Part 3- Support Vector Machine

Letters Data

Step 1: Loading Data into R

library(readr)
letters <- read.csv("C:/Users/gmutya048/Downloads/letterdata.csv",stringsAsFactors=TRUE)

Step 2: Data preparation

str(letters)

## 'data.frame':    20000 obs. of  17 variables:
##  $ letter: Factor w/ 26 levels "A","B","C","D",..: 20 9 4 14 7 19 2 1 10 13 ...
##  $ xbox  : int  2 5 4 7 2 4 4 1 2 11 ...
##  $ ybox  : int  8 12 11 11 1 11 2 1 2 15 ...
##  $ width : int  3 3 6 6 3 5 5 3 4 13 ...
##  $ height: int  5 7 8 6 1 8 4 2 4 9 ...
##  $ onpix : int  1 2 6 3 1 3 4 1 2 7 ...
##  $ xbar  : int  8 10 10 5 8 8 8 8 10 13 ...
##  $ ybar  : int  13 5 6 9 6 8 7 2 6 2 ...
##  $ x2bar : int  0 5 2 4 6 6 6 2 2 6 ...
##  $ y2bar : int  6 4 6 6 6 9 6 2 6 2 ...
##  $ xybar : int  6 13 10 4 6 5 7 8 12 12 ...
##  $ x2ybar: int  10 3 3 4 5 6 6 2 4 1 ...
##  $ xy2bar: int  8 9 7 10 9 6 6 8 8 9 ...
##  $ xedge : int  0 2 3 6 1 0 2 1 1 8 ...
##  $ xedgey: int  8 8 7 10 7 8 8 6 6 1 ...
##  $ yedge : int  0 4 3 2 5 9 7 2 1 1 ...
##  $ yedgex: int  8 10 9 8 10 7 10 7 7 8 ...

Step 3: Split the data into training and test datasets

letters_train <- letters[1:18000,] 
letters_test <- letters[18001:20000,]

step 4: Training the model on Data

library(kernlab)

## 
## Attaching package: 'kernlab'

## The following object is masked from 'package:ggplot2':
## 
##     alpha

letter_classifier<-ksvm(letter ~ ., data=letters_train, kernel="vanilladot")

##  Setting default kernel parameters

summary(letter_classifier)

## Length  Class   Mode 
##      1   ksvm     S4

step 5: Evaluate the Model

letter_predictions <- predict(letter_classifier, letters_test) 
(p <- table(letter_predictions,letters_test$letter))

##                   
## letter_predictions  A  B  C  D  E  F  G  H  I  J  K  L  M  N  O  P  Q  R  S  T
##                  A 73  0  0  0  0  0  0  0  0  1  0  0  0  0  3  0  4  0  0  1
##                  B  0 61  0  3  2  0  1  1  0  0  1  1  0  0  0  2  0  1  3  0
##                  C  0  0 64  0  2  0  4  2  1  0  1  2  0  0  1  0  0  0  0  0
##                  D  2  1  0 67  0  0  1  3  3  2  1  2  0  3  4  2  1  2  0  0
##                  E  0  0  1  0 64  1  1  0  0  0  2  2  0  0  0  0  2  0  6  0
##                  F  0  0  0  0  0 70  1  1  4  0  0  0  0  0  0  5  1  0  2  0
##                  G  1  1  2  1  3  2 68  1  0  0  0  1  0  0  0  0  4  1  3  2
##                  H  0  0  0  1  0  1  0 46  0  2  3  1  1  1  9  0  0  5  0  3
##                  I  0  0  0  0  0  0  0  0 65  3  0  0  0  0  0  0  0  0  2  0
##                  J  0  1  0  0  0  1  0  0  3 61  0  0  0  0  1  0  0  0  1  0
##                  K  0  1  4  0  0  0  0  5  0  0 56  0  0  2  0  0  0  4  0  1
##                  L  0  0  0  0  1  0  0  1  0  0  0 63  0  0  0  0  0  0  0  0
##                  M  0  0  1  0  0  0  1  0  0  0  0  0 70  2  0  0  0  0  0  0
##                  N  0  0  0  0  0  0  0  0  0  0  0  0  0 77  0  0  0  1  0  0
##                  O  0  0  1  1  0  0  0  1  0  1  0  0  0  0 49  1  2  0  0  0
##                  P  0  0  0  0  0  3  0  0  0  0  0  0  0  0  2 69  0  0  0  0
##                  Q  0  0  0  0  0  0  3  1  0  0  0  2  0  0  2  1 52  0  1  0
##                  R  0  4  0  0  1  0  0  3  0  0  3  0  0  0  1  0  0 64  0  1
##                  S  0  1  0  0  1  1  1  0  1  1  0  0  0  0  0  0  6  0 47  1
##                  T  0  0  0  0  1  1  0  0  0  0  1  0  0  0  0  0  0  0  1 83
##                  U  0  0  2  1  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0
##                  V  0  0  0  0  0  0  0  0  0  0  0  0  1  0  1  0  0  0  0  0
##                  W  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0
##                  X  0  1  0  0  1  0  0  1  0  0  1  4  0  0  0  0  0  1  0  0
##                  Y  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  4  0  0  0  1
##                  Z  1  0  0  0  2  0  0  0  0  2  0  0  0  0  0  0  0  0  5  1
##                   
## letter_predictions  U  V  W  X  Y  Z
##                  A  2  0  1  0  0  0
##                  B  0  0  0  0  0  0
##                  C  0  0  0  0  0  0
##                  D  0  0  0  0  1  0
##                  E  0  0  0  1  0  0
##                  F  0  1  0  0  2  0
##                  G  0  0  0  0  0  0
##                  H  0  2  0  0  1  0
##                  I  0  0  0  2  1  0
##                  J  0  0  0  1  0  4
##                  K  2  0  0  4  0  0
##                  L  0  0  0  0  0  0
##                  M  1  0  6  0  0  0
##                  N  1  0  2  0  0  0
##                  O  1  0  0  0  0  0
##                  P  0  0  0  0  1  0
##                  Q  0  0  0  0  0  0
##                  R  0  1  0  0  0  0
##                  S  0  0  0  1  0  6
##                  T  1  0  0  0  2  2
##                  U 83  0  0  0  0  0
##                  V  0 64  1  0  1  0
##                  W  0  3 59  0  0  0
##                  X  0  0  0 76  1  0
##                  Y  0  0  0  1 58  0
##                  Z  0  0  0  0  0 70

Accuracy in percentage

(Accuracy <- sum(diag(p))/sum(p)*100)

## [1] 83.95

News Popularity Data

Step1: Loading data into r

library(readr)
news<- read.csv("C:/Users/gmutya048/Downloads/OnlineNewsPopularity_for_R.csv")
str(news)

## 'data.frame':    39644 obs. of  61 variables:
##  $ url                          : chr  "http://mashable.com/2013/01/07/amazon-instant-video-browser/" "http://mashable.com/2013/01/07/ap-samsung-sponsored-tweets/" "http://mashable.com/2013/01/07/apple-40-billion-app-downloads/" "http://mashable.com/2013/01/07/astronaut-notre-dame-bcs/" ...
##  $ timedelta                    : num  731 731 731 731 731 731 731 731 731 731 ...
##  $ n_tokens_title               : num  12 9 9 9 13 10 8 12 11 10 ...
##  $ n_tokens_content             : num  219 255 211 531 1072 ...
##  $ n_unique_tokens              : num  0.664 0.605 0.575 0.504 0.416 ...
##  $ n_non_stop_words             : num  1 1 1 1 1 ...
##  $ n_non_stop_unique_tokens     : num  0.815 0.792 0.664 0.666 0.541 ...
##  $ num_hrefs                    : num  4 3 3 9 19 2 21 20 2 4 ...
##  $ num_self_hrefs               : num  2 1 1 0 19 2 20 20 0 1 ...
##  $ num_imgs                     : num  1 1 1 1 20 0 20 20 0 1 ...
##  $ num_videos                   : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ average_token_length         : num  4.68 4.91 4.39 4.4 4.68 ...
##  $ num_keywords                 : num  5 4 6 7 7 9 10 9 7 5 ...
##  $ data_channel_is_lifestyle    : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ data_channel_is_entertainment: num  1 0 0 1 0 0 0 0 0 0 ...
##  $ data_channel_is_bus          : num  0 1 1 0 0 0 0 0 0 0 ...
##  $ data_channel_is_socmed       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ data_channel_is_tech         : num  0 0 0 0 1 1 0 1 1 0 ...
##  $ data_channel_is_world        : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ kw_min_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_min_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_min_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ self_reference_min_shares    : num  496 0 918 0 545 8500 545 545 0 0 ...
##  $ self_reference_max_shares    : num  496 0 918 0 16000 8500 16000 16000 0 0 ...
##  $ self_reference_avg_sharess   : num  496 0 918 0 3151 ...
##  $ weekday_is_monday            : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ weekday_is_tuesday           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_wednesday         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_thursday          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_friday            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_saturday          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_sunday            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ is_weekend                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ LDA_00                       : num  0.5003 0.7998 0.2178 0.0286 0.0286 ...
##  $ LDA_01                       : num  0.3783 0.05 0.0333 0.4193 0.0288 ...
##  $ LDA_02                       : num  0.04 0.0501 0.0334 0.4947 0.0286 ...
##  $ LDA_03                       : num  0.0413 0.0501 0.0333 0.0289 0.0286 ...
##  $ LDA_04                       : num  0.0401 0.05 0.6822 0.0286 0.8854 ...
##  $ global_subjectivity          : num  0.522 0.341 0.702 0.43 0.514 ...
##  $ global_sentiment_polarity    : num  0.0926 0.1489 0.3233 0.1007 0.281 ...
##  $ global_rate_positive_words   : num  0.0457 0.0431 0.0569 0.0414 0.0746 ...
##  $ global_rate_negative_words   : num  0.0137 0.01569 0.00948 0.02072 0.01213 ...
##  $ rate_positive_words          : num  0.769 0.733 0.857 0.667 0.86 ...
##  $ rate_negative_words          : num  0.231 0.267 0.143 0.333 0.14 ...
##  $ avg_positive_polarity        : num  0.379 0.287 0.496 0.386 0.411 ...
##  $ min_positive_polarity        : num  0.1 0.0333 0.1 0.1364 0.0333 ...
##  $ max_positive_polarity        : num  0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
##  $ avg_negative_polarity        : num  -0.35 -0.119 -0.467 -0.37 -0.22 ...
##  $ min_negative_polarity        : num  -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
##  $ max_negative_polarity        : num  -0.2 -0.1 -0.133 -0.167 -0.05 ...
##  $ title_subjectivity           : num  0.5 0 0 0 0.455 ...
##  $ title_sentiment_polarity     : num  -0.188 0 0 0 0.136 ...
##  $ abs_title_subjectivity       : num  0 0.5 0.5 0.5 0.0455 ...
##  $ abs_title_sentiment_polarity : num  0.188 0 0 0 0.136 ...
##  $ shares                       : int  593 711 1500 1200 505 855 556 891 3600 710 ...

Step2: Data Preparation

newsShort <- data.frame(news$n_tokens_title, news$n_tokens_content, news$n_unique_tokens, news$n_non_stop_words, news$num_hrefs, news$num_imgs, news$num_videos, news$average_token_length, news$num_keywords, news$kw_max_max, news$global_sentiment_polarity, news$avg_positive_polarity, news$title_subjectivity, news$title_sentiment_polarity, news$abs_title_subjectivity, news$abs_title_sentiment_polarity, news$shares)

colnames(newsShort) <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")

newsShort$popular = rep('na', nrow(newsShort))
for(i in 1:39644) {
     if(newsShort$shares[i] >= 1400) {
         newsShort$popular[i] = "yes"} 
     else {newsShort$popular[i] = "no"}
}
newsShort$shares = newsShort$popular
newsShort$shares <- as.factor(newsShort$shares)

news_rand <- newsShort[order(runif(10000)), ]
set.seed(12345)

Step 3: Split the data into training and test datasets

news_train <- news_rand[1:9000, ]
news_test <- news_rand[9001:10000, ]

Step 4: Model Design

nb_model <- naive_bayes(shares ~ ., data=news_train)

## Warning: naive_bayes(): Feature popular - zero probabilities are present.
## Consider Laplace smoothing.

nb_model

## 
## ================================== Naive Bayes ================================== 
##  
##  Call: 
## naive_bayes.formula(formula = shares ~ ., data = news_train)
## 
## --------------------------------------------------------------------------------- 
##  
## Laplace smoothing: 0
## 
## --------------------------------------------------------------------------------- 
##  
##  A priori probabilities: 
## 
##        no       yes 
## 0.4291111 0.5708889 
## 
## --------------------------------------------------------------------------------- 
##  
##  Tables: 
## 
## --------------------------------------------------------------------------------- 
##  ::: n_tokens_title (Gaussian) 
## --------------------------------------------------------------------------------- 
##               
## n_tokens_title       no      yes
##           mean 9.820559 9.695991
##           sd   1.929249 1.987754
## 
## --------------------------------------------------------------------------------- 
##  ::: n_tokens_content (Gaussian) 
## --------------------------------------------------------------------------------- 
##                 
## n_tokens_content       no      yes
##             mean 452.2315 515.1051
##             sd   347.1779 450.0206
## 
## --------------------------------------------------------------------------------- 
##  ::: n_unique_tokens (Gaussian) 
## --------------------------------------------------------------------------------- 
##                
## n_unique_tokens        no       yes
##            mean 0.5702437 0.5542023
##            sd   0.1127776 0.1232687
## 
## --------------------------------------------------------------------------------- 
##  ::: n_non_stop_words (Gaussian) 
## --------------------------------------------------------------------------------- 
##                 
## n_non_stop_words         no        yes
##             mean 0.99404453 0.99124172
##             sd   0.07695147 0.09318398
## 
## --------------------------------------------------------------------------------- 
##  ::: num_hrefs (Gaussian) 
## --------------------------------------------------------------------------------- 
##          
## num_hrefs        no       yes
##      mean  9.147851 10.570650
##      sd    8.644083 11.540711
## 
## ---------------------------------------------------------------------------------
## 
## # ... and 12 more tables
## 
## ---------------------------------------------------------------------------------

step 5: Evaluate the Model

news_Pred <- predict(nb_model, newdata = news_test)

## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.

(conf_nat <- table(news_Pred, news_test$shares))

##          
## news_Pred  no yes
##       no  419   0
##       yes  11 570

Accuracy in percentage

(Accuracy <- sum(diag(conf_nat))/sum(conf_nat)*100)

## [1] 98.9

ANLY 530 Lab 2

Gayathri Mutyala

11/12/2020