R Notebook

This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.

plot(cars)

Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Ctrl+Alt+I.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).

The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.

library(readxl)
creditData <- read.csv("C:/Users/NehaKatti/Desktop/creditData.csv")
View(creditData)
str(creditData)

## 'data.frame':    1000 obs. of  21 variables:
##  $ Creditability                    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Account.Balance                  : int  1 1 2 1 1 1 1 1 4 2 ...
##  $ Duration.of.Credit..month.       : int  18 9 12 12 12 10 8 6 18 24 ...
##  $ Payment.Status.of.Previous.Credit: int  4 4 2 4 4 4 4 4 4 2 ...
##  $ Purpose                          : int  2 0 9 0 0 0 0 0 3 3 ...
##  $ Credit.Amount                    : int  1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
##  $ Value.Savings.Stocks             : int  1 1 2 1 1 1 1 1 1 3 ...
##  $ Length.of.current.employment     : int  2 3 4 3 3 2 4 2 1 1 ...
##  $ Instalment.per.cent              : int  4 2 2 3 4 1 1 2 4 1 ...
##  $ Sex...Marital.Status             : int  2 3 2 3 3 3 3 3 2 2 ...
##  $ Guarantors                       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Duration.in.Current.address      : int  4 2 4 2 4 3 4 4 4 4 ...
##  $ Most.valuable.available.asset    : int  2 1 1 1 2 1 1 1 3 4 ...
##  $ Age..years.                      : int  21 36 23 39 38 48 39 40 65 23 ...
##  $ Concurrent.Credits               : int  3 3 3 3 1 3 3 3 3 3 ...
##  $ Type.of.apartment                : int  1 1 1 1 2 1 2 2 2 1 ...
##  $ No.of.Credits.at.this.Bank       : int  1 2 1 2 2 2 2 1 2 1 ...
##  $ Occupation                       : int  3 3 2 2 2 2 2 2 1 1 ...
##  $ No.of.dependents                 : int  1 2 1 2 1 2 1 2 1 1 ...
##  $ Telephone                        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Foreign.Worker                   : int  1 1 1 2 2 2 2 2 1 1 ...

sum(is.na(creditData))

## [1] 0

creditData$Creditability <- as.factor(creditData$Creditability)
set.seed(12345)
creditData_rand <- creditData[order(runif(1000)), ]
creditData_train <- creditData_rand[1:750, ]
creditData_test <- creditData_rand[751:1000, ]
library(naivebayes)

## Warning: package 'naivebayes' was built under R version 3.6.2

## naivebayes 0.9.6 loaded

nve <- naive_bayes(creditData_train$Creditability ~., data = creditData_train)
predict <- table(predict(nve, creditData_test), creditData_test$Creditability)

## Warning: predict.naive_bayes(): More features in the newdata are provided
## as there are probability tables in the object. Calculation is performed
## based on features to be found in the tables.

Accu <- sum(diag(predict/sum(predict)*100))
Accu

## [1] 77.2

#Part2
creditDataSc <- scale(creditData_rand[,2:ncol(creditData_rand)], center=TRUE, scale = TRUE)
r <- cor(creditDataSc)
filtered <- creditData_rand[, c(1,2,3,4,5, 6, 7,9, 10, 14)]
filteredtrain <- filtered[1:750, ]
filteredtest <- filtered[751:1000, ]
naive <- naive_bayes(filteredtrain$Creditability ~., data = filteredtrain)
predict1 <- table(predict(naive, filteredtest), filteredtest$Creditability)

## Warning: predict.naive_bayes(): More features in the newdata are provided
## as there are probability tables in the object. Calculation is performed
## based on features to be found in the tables.

Accu2 <- sum(diag(predict1/sum(predict1)*100))
Accu2

## [1] 82

library(readxl)
OnlineNews <- read.csv("C:/Users/NehaKatti/Desktop/OnlineNewsPopularity_for_R.csv")
View(OnlineNews)
str(OnlineNews)

## 'data.frame':    39644 obs. of  61 variables:
##  $ url                          : Factor w/ 39644 levels "http://mashable.com/2013/01/07/amazon-instant-video-browser/",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ timedelta                    : int  731 731 731 731 731 731 731 731 731 731 ...
##  $ n_tokens_title               : int  12 9 9 9 13 10 8 12 11 10 ...
##  $ n_tokens_content             : int  219 255 211 531 1072 370 960 989 97 231 ...
##  $ n_unique_tokens              : num  0.664 0.605 0.575 0.504 0.416 ...
##  $ n_non_stop_words             : num  1 1 1 1 1 ...
##  $ n_non_stop_unique_tokens     : num  0.815 0.792 0.664 0.666 0.541 ...
##  $ num_hrefs                    : int  4 3 3 9 19 2 21 20 2 4 ...
##  $ num_self_hrefs               : int  2 1 1 0 19 2 20 20 0 1 ...
##  $ num_imgs                     : int  1 1 1 1 20 0 20 20 0 1 ...
##  $ num_videos                   : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ average_token_length         : num  4.68 4.91 4.39 4.4 4.68 ...
##  $ num_keywords                 : int  5 4 6 7 7 9 10 9 7 5 ...
##  $ data_channel_is_lifestyle    : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ data_channel_is_entertainment: int  1 0 0 1 0 0 0 0 0 0 ...
##  $ data_channel_is_bus          : int  0 1 1 0 0 0 0 0 0 0 ...
##  $ data_channel_is_socmed       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ data_channel_is_tech         : int  0 0 0 0 1 1 0 1 1 0 ...
##  $ data_channel_is_world        : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ kw_min_min                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_min_max                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_max                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_min_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ self_reference_min_shares    : num  496 0 918 0 545 8500 545 545 0 0 ...
##  $ self_reference_max_shares    : num  496 0 918 0 16000 8500 16000 16000 0 0 ...
##  $ self_reference_avg_sharess   : num  496 0 918 0 3151 ...
##  $ weekday_is_monday            : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ weekday_is_tuesday           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_wednesday         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_thursday          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_friday            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_saturday          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_sunday            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ is_weekend                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ LDA_00                       : num  0.5003 0.7998 0.2178 0.0286 0.0286 ...
##  $ LDA_01                       : num  0.3783 0.05 0.0333 0.4193 0.0288 ...
##  $ LDA_02                       : num  0.04 0.0501 0.0334 0.4947 0.0286 ...
##  $ LDA_03                       : num  0.0413 0.0501 0.0333 0.0289 0.0286 ...
##  $ LDA_04                       : num  0.0401 0.05 0.6822 0.0286 0.8854 ...
##  $ global_subjectivity          : num  0.522 0.341 0.702 0.43 0.514 ...
##  $ global_sentiment_polarity    : num  0.0926 0.1489 0.3233 0.1007 0.281 ...
##  $ global_rate_positive_words   : num  0.0457 0.0431 0.0569 0.0414 0.0746 ...
##  $ global_rate_negative_words   : num  0.0137 0.01569 0.00948 0.02072 0.01213 ...
##  $ rate_positive_words          : num  0.769 0.733 0.857 0.667 0.86 ...
##  $ rate_negative_words          : num  0.231 0.267 0.143 0.333 0.14 ...
##  $ avg_positive_polarity        : num  0.379 0.287 0.496 0.386 0.411 ...
##  $ min_positive_polarity        : num  0.1 0.0333 0.1 0.1364 0.0333 ...
##  $ max_positive_polarity        : num  0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
##  $ avg_negative_polarity        : num  -0.35 -0.119 -0.467 -0.37 -0.22 ...
##  $ min_negative_polarity        : num  -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
##  $ max_negative_polarity        : num  -0.2 -0.1 -0.133 -0.167 -0.05 ...
##  $ title_subjectivity           : num  0.5 0 0 0 0.455 ...
##  $ title_sentiment_polarity     : num  -0.188 0 0 0 0.136 ...
##  $ abs_title_subjectivity       : num  0 0.5 0.5 0.5 0.0455 ...
##  $ abs_title_sentiment_polarity : num  0.188 0 0 0 0.136 ...
##  $ shares                       : int  593 711 1500 1200 505 855 556 891 3600 710 ...

sum(is.na(OnlineNews))

## [1] 0

OnlineNews$popular = rep('na', nrow(OnlineNews))
for(i in 1:39644) {
  if(OnlineNews$shares[i] >= 1400) {
    OnlineNews$popular[i] = 1} 
  else {OnlineNews$popular[i] = 0}
}
OnlineNews$popular <- as.numeric(OnlineNews$popular)
OnlineNews$popular <- as.factor(OnlineNews$popular)
OnlineNewsmod <- OnlineNews[order(runif(10000)), ]
set.seed(12345)
OnlineNewsmodtrain <- OnlineNewsmod[1:8500,]
OnlineNewsmodtest <- OnlineNewsmod[8501:10000,]
naive1 <- naive_bayes(OnlineNewsmodtrain$popular ~., data = OnlineNewsmodtrain)

## Warning: naive_bayes(): Feature url - zero probabilities are present.
## Consider Laplace smoothing.

predict2 <- table(predict(naive1,OnlineNewsmodtest), OnlineNewsmodtest$popular)

## Warning: predict.naive_bayes(): More features in the newdata are provided
## as there are probability tables in the object. Calculation is performed
## based on features to be found in the tables.

Accu3 <- sum(diag(predict/sum(predict2)*100))
Accu3

## [1] 12.86667

#Naives Bayer Part 2 gives better results than Naives Bayer Part 1 due to a wide variety of relevant independent variables.