Scatter Plot

load("C:/Users/USER/Downloads/cdc.Rdata")

head(cdc)
##     genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 1      good       0        1        0     70    175      175  77      m
## 2      good       0        1        1     64    125      115  33      f
## 3      good       1        1        1     60    105      105  49      f
## 4      good       1        1        0     66    132      124  42      f
## 5 very good       0        1        0     61    150      130  55      f
## 6 very good       1        1        0     64    114      114  55      f
hist(cdc$weight)
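# hist() picks bin widths automatically; passing breaks controls the
# resolution (a sketch on the same data):
hist(cdc$weight, breaks = 50)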

plot(cdc$weight, cdc$wtdesire,col=cdc$genhlth)

data(iris)
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
median(iris$Petal.Length)
## [1] 4.35
plot(x=iris$Sepal.Length, y = iris$Petal.Length)
abline(h = median(iris$Petal.Length), col='orange')

plot(x=iris$Sepal.Length, y = iris$Petal.Length, col=iris$Species)

plot(x = iris$Sepal.Length, y = iris$Petal.Length,
     col = ifelse(iris$Petal.Length >= median(iris$Petal.Length), 'red', 'blue'))
abline(h = median(iris$Petal.Length), col='orange')

setosa <- iris[iris$Species=='setosa',]
versicolor <- iris[iris$Species=='versicolor',]

plot(x=iris$Sepal.Length, y = iris$Petal.Length, type='n')
points(x=setosa$Sepal.Length, y= setosa$Petal.Length, col='green')
points(x=versicolor$Sepal.Length, y= versicolor$Petal.Length, col='red')
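# A legend ties the layered points back to their species
# (pch = 1 is the default open circle drawn by points()):
legend('topleft', legend = c('setosa', 'versicolor'), col = c('green', 'red'), pch = 1)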

fit <- lm(cdc$wtdesire ~ cdc$weight)
fit
## 
## Call:
## lm(formula = cdc$wtdesire ~ cdc$weight)
## 
## Coefficients:
## (Intercept)   cdc$weight  
##      46.664        0.639
plot(cdc$weight, cdc$wtdesire)
abline(fit, col='red')
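
# Side note: because fit was built from cdc$... terms, predict() cannot map
# clean newdata onto it; refitting with a data argument fixes that (a sketch):
fit2 <- lm(wtdesire ~ weight, data = cdc)
predict(fit2, newdata = data.frame(weight = c(150, 200)))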

Mosaic Plot

head(cdc)
##     genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 1      good       0        1        0     70    175      175  77      m
## 2      good       0        1        1     64    125      115  33      f
## 3      good       1        1        1     60    105      105  49      f
## 4      good       1        1        0     66    132      124  42      f
## 5 very good       0        1        0     61    150      130  55      f
## 6 very good       1        1        0     64    114      114  55      f
tb <- table(cdc$smoke100)
barplot(tb)

smokers_gender <- table(cdc$gender, cdc$smoke100)
colnames(smokers_gender)
## [1] "0" "1"
colnames(smokers_gender) <- c('no', 'yes')
mosaicplot(smokers_gender, col=c('blue', 'red'))

rainbow(2)
## [1] "#FF0000FF" "#00FFFFFF"
#mosaicplot(smokers_gender, col=rainbow(2))
mosaicplot(smokers_gender, col=rainbow(length(colnames(smokers_gender))))

#smokers_gender

barplot(smokers_gender,col=c("darkblue","red"), beside=TRUE)

barplot(smokers_gender,col=c("darkblue","red"), beside=FALSE)
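
# legend.text labels the gender rows directly on the bar chart (a sketch):
barplot(smokers_gender, col = c("darkblue", "red"), beside = TRUE,
        legend.text = rownames(smokers_gender))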

Boxplot

hist(cdc$weight)

summary(cdc$weight)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    68.0   140.0   165.0   169.7   190.0   500.0
boxplot(cdc$weight)

boxplot(cdc$weight ~ cdc$gender)

salary <- c(45,56,62,70,66,72,54,49,100)

sum(salary) / length(salary)
## [1] 63.77778
mean(salary)
## [1] 63.77778
sort(salary)
## [1]  45  49  54  56  62  66  70  72 100
median(salary)
## [1] 62
quantile(salary, 0.25)
## 25% 
##  54
quantile(salary, 0.75)
## 75% 
##  70
quantile(salary, 0.75) - quantile(salary, 0.25) 
## 75% 
##  16
IQR(salary)
## [1] 16
max(min(salary), quantile(salary, 0.25) - 1.5 * IQR(salary))
## [1] 45
# whiskers (adjacent values) reach the most extreme observations
# still within 1.5 * IQR of the hinges
lower_adjacent_value <- min(salary[salary >= (quantile(salary, 0.25) - 1.5 * IQR(salary))])

upper_adjacent_value <- max(salary[salary <= (quantile(salary, 0.75) + 1.5 * IQR(salary))])

lower_hinge <- quantile(salary, 0.25)
upper_hinge <- quantile(salary, 0.75)
print(paste(lower_adjacent_value, lower_hinge, median(salary), upper_hinge, upper_adjacent_value))
## [1] "45 54 62 70 72"
boxplot(salary)
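
# Sanity check: boxplot.stats() reports the same five numbers
# (lower whisker, lower hinge, median, upper hinge, upper whisker):
boxplot.stats(salary)$stats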

Legend

taipei <- c(92.5,132.6,168.8,159.1,218.7)
tainan <- c(21.2, 30.6, 37.3, 84.6, 184.3)

plot(taipei, type = "o", col = "blue", ylim = c(0, 300), xlim = c(1, 7),
     xlab = "Month", ylab = "Rainfall", main = 'Rainfall in Taipei and Tainan')
lines(tainan, type="o", pch=22, lty=2, col="red")
#legend('topleft',c('taipei', 'tainan'), col=c('blue', 'red'), lwd=2.5 , title= 'Rainfall', lty=c(1,2))

legend(x = 1, y = 250,c('taipei', 'tainan'), col=c('blue', 'red'), lwd=2.5 , title= 'Rainfall', lty=c(1,2))
text(x=5, y= 250, 'Taipei', col='blue')
text(x=5.5, y= 200, 'Tainan', col='red')

house <- read.csv('https://raw.githubusercontent.com/ywchiu/rtibame/master/data/house-prices.csv')


bedroomsTable <- table(house$Bedrooms)
pie(bedroomsTable, main = "Bedroom Type Counts",
    col = rainbow(length(names(bedroomsTable))))
legend('bottomright', legend = names(bedroomsTable),
       fill = rainbow(length(names(bedroomsTable))), title = "units", cex = 0.8)
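
# Percentage labels make the pie easier to read (a sketch):
pct <- round(100 * bedroomsTable / sum(bedroomsTable))
pie(bedroomsTable, labels = paste0(names(bedroomsTable), ' (', pct, '%)'),
    main = "Bedroom Type Counts", col = rainbow(length(names(bedroomsTable))))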

Plot Layout (par)

# draw n blank panels, labeling each with its drawing order
showLayout <- function(n){
  for (i in 1:n){
    plot(1, type = "n", xaxt = "n", yaxt = "n", xlab = "", ylab = "")
    text(1, 1, labels = i, cex = 10)
  }
}

par(mfrow=c(3,2), mar=c(1,1,1,1))
showLayout(6)

par(mfcol=c(3,2), mar=c(1,1,1,1))
showLayout(6)

par(mfrow=c(3,2), mar=c(3,3,3,3))
showLayout(6)
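
# layout() generalizes mfrow/mfcol to unequal panel sizes;
# here panel 1 spans the whole top row (a sketch):
layout(matrix(c(1, 1, 2, 3), nrow = 2, byrow = TRUE))
showLayout(3)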

jpeg("layout.jpg")
par(mfrow=c(3,2), mar=c(3,3,3,3))
showLayout(6)
dev.off()
## png 
##   2
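
# The same open-device / plot / dev.off() pattern works for PDF output (a sketch):
pdf("layout.pdf", width = 8, height = 6)   # size in inches
par(mfrow = c(3, 2), mar = c(3, 3, 3, 3))
showLayout(6)
dev.off()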

Donut Chart

#install.packages('plotly')
library(plotly)
## Warning: package 'plotly' was built under R version 3.4.4
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.4.4
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
ds <- data.frame(labels = c("A", "B", "C"), values = c(10, 20, 30))

plot_ly(ds, labels = ds$labels, values = ds$values, type = "pie", hole = 0.6) %>%
  layout(title = "Donut Chart Example")
## Warning: package 'bindrcpp' was built under R version 3.4.4
plot_ly(ds, labels = ds$labels, values = ds$values, type = "pie")

Area Chart

library(plotly)
month<-c(1,2,3,4,5)
taipei<-c(92.5,132.6,168.8,159.1,218.7)
tainan <-c(21.2, 30.6, 37.3, 84.6, 184.3)

plot_ly(x = month, y = taipei, name = "taipei", type = 'scatter', mode = 'lines')

plot_ly(x = month, y = taipei, fill = "tozeroy", name = "taipei",
        type = 'scatter', mode = 'markers') %>%
  add_trace(x = month, y = tainan, fill = "tozeroy", name = "tainan") %>%
  layout(yaxis = list(title = 'rainfall'))

# stacked area: the second trace must be plotted at the cumulative total
total <- taipei + tainan
plot_ly(x = month, y = taipei, fill = "tozeroy", name = "taipei",
        type = 'scatter', mode = 'markers') %>%
  add_trace(x = month, y = total, fill = "tonexty", name = "tainan") %>%
  layout(yaxis = list(title = 'rainfall'))

Diamonds

data("diamonds")
diamonds
## # A tibble: 53,940 x 10
##    carat cut       color clarity depth table price     x     y     z
##    <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1 0.230 Ideal     E     SI2      61.5  55.0   326  3.95  3.98  2.43
##  2 0.210 Premium   E     SI1      59.8  61.0   326  3.89  3.84  2.31
##  3 0.230 Good      E     VS1      56.9  65.0   327  4.05  4.07  2.31
##  4 0.290 Premium   I     VS2      62.4  58.0   334  4.20  4.23  2.63
##  5 0.310 Good      J     SI2      63.3  58.0   335  4.34  4.35  2.75
##  6 0.240 Very Good J     VVS2     62.8  57.0   336  3.94  3.96  2.48
##  7 0.240 Very Good I     VVS1     62.3  57.0   336  3.95  3.98  2.47
##  8 0.260 Very Good H     SI1      61.9  55.0   337  4.07  4.11  2.53
##  9 0.220 Fair      E     VS2      65.1  61.0   337  3.87  3.78  2.49
## 10 0.230 Very Good H     VS1      59.4  61.0   338  4.00  4.05  2.39
## # ... with 53,930 more rows
d <-diamonds[sample(nrow(diamonds), 1000), ]
plot_ly(d, x = d$carat, y = d$price, color = d$clarity, type = 'scatter',
        mode = 'markers', size = d$carat, text = paste("Clarity", d$clarity))
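
# Newer plotly releases prefer the tilde formula interface, which avoids
# repeating the data frame name (an equivalent sketch):
plot_ly(d, x = ~carat, y = ~price, color = ~clarity, size = ~carat,
        text = ~paste("Clarity", clarity), type = 'scatter', mode = 'markers')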

Multiple Charts

data("economics")

plot_ly(economics, x =economics$date, y =economics$unemploy, type='scatter', mode='lines')
plot_ly(economics, x =economics$date, y =economics$uempmed, type='scatter', mode='lines')
p <-subplot(
  plot_ly(economics, x =economics$date, y =economics$unemploy, type='scatter', mode='lines'),
  plot_ly(economics, x =economics$date, y =economics$uempmed, type='scatter', mode='lines'),
  margin=0.05
)

p %>%layout(showlegend=FALSE)
p <-subplot(
  plot_ly(economics, x =economics$date, y =economics$unemploy, type='scatter', mode='lines'),
  plot_ly(economics, x =economics$date, y =economics$uempmed, type='scatter', mode='lines'),
  margin=0.05,
  nrows = 2
)
p %>%layout(showlegend=FALSE)
#plot_ly(economics, x =economics$date, y =economics$unemploy, type='scatter', mode='lines') %>% add_trace(economics, x =economics$date, y =economics$uempmed, type='scatter', mode='lines')

Sapply Example

a <- data.frame(product = c('牛排煎鍋', '牛排', '煎鍋'), stringsAsFactors = FALSE)  # 'steak frying pan', 'steak', 'frying pan'
a
##    product
## 1 牛排煎鍋
## 2     牛排
## 3     煎鍋
b <- data.frame(words = c('煎鍋', '汽車'), stringsAsFactors = FALSE)  # filter words: 'frying pan', 'car'
b$words
## [1] "煎鍋" "汽車"
# TRUE if the product name contains any of the filter words in b$words
filter_fun <- function(products){
  s <- sum(sapply(b$words, function(e) grepl(pattern = e, x = products))) >= 1
  return(s)
}

a$product[! sapply(a$product, filter_fun)]  # keep products matching none of the words
## [1] "牛排"

Tableau

RPART

data(iris)
library(rpart)
# formula interface: response ~ predictor terms
fit <- rpart(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = iris)

fit
## n= 150 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 150 100 setosa (0.33333333 0.33333333 0.33333333)  
##   2) Petal.Length< 2.45 50   0 setosa (1.00000000 0.00000000 0.00000000) *
##   3) Petal.Length>=2.45 100  50 versicolor (0.00000000 0.50000000 0.50000000)  
##     6) Petal.Width< 1.75 54   5 versicolor (0.00000000 0.90740741 0.09259259) *
##     7) Petal.Width>=1.75 46   1 virginica (0.00000000 0.02173913 0.97826087) *
summary(fit)
## Call:
## rpart(formula = Species ~ Sepal.Length + Sepal.Width + Petal.Length + 
##     Petal.Width, data = iris)
##   n= 150 
## 
##     CP nsplit rel error xerror       xstd
## 1 0.50      0      1.00   1.18 0.05017303
## 2 0.44      1      0.50   0.78 0.06118823
## 3 0.01      2      0.06   0.10 0.03055050
## 
## Variable importance
##  Petal.Width Petal.Length Sepal.Length  Sepal.Width 
##           34           31           21           14 
## 
## Node number 1: 150 observations,    complexity param=0.5
##   predicted class=setosa      expected loss=0.6666667  P(node) =1
##     class counts:    50    50    50
##    probabilities: 0.333 0.333 0.333 
##   left son=2 (50 obs) right son=3 (100 obs)
##   Primary splits:
##       Petal.Length < 2.45 to the left,  improve=50.00000, (0 missing)
##       Petal.Width  < 0.8  to the left,  improve=50.00000, (0 missing)
##       Sepal.Length < 5.45 to the left,  improve=34.16405, (0 missing)
##       Sepal.Width  < 3.35 to the right, improve=19.03851, (0 missing)
##   Surrogate splits:
##       Petal.Width  < 0.8  to the left,  agree=1.000, adj=1.00, (0 split)
##       Sepal.Length < 5.45 to the left,  agree=0.920, adj=0.76, (0 split)
##       Sepal.Width  < 3.35 to the right, agree=0.833, adj=0.50, (0 split)
## 
## Node number 2: 50 observations
##   predicted class=setosa      expected loss=0  P(node) =0.3333333
##     class counts:    50     0     0
##    probabilities: 1.000 0.000 0.000 
## 
## Node number 3: 100 observations,    complexity param=0.44
##   predicted class=versicolor  expected loss=0.5  P(node) =0.6666667
##     class counts:     0    50    50
##    probabilities: 0.000 0.500 0.500 
##   left son=6 (54 obs) right son=7 (46 obs)
##   Primary splits:
##       Petal.Width  < 1.75 to the left,  improve=38.969400, (0 missing)
##       Petal.Length < 4.75 to the left,  improve=37.353540, (0 missing)
##       Sepal.Length < 6.15 to the left,  improve=10.686870, (0 missing)
##       Sepal.Width  < 2.45 to the left,  improve= 3.555556, (0 missing)
##   Surrogate splits:
##       Petal.Length < 4.75 to the left,  agree=0.91, adj=0.804, (0 split)
##       Sepal.Length < 6.15 to the left,  agree=0.73, adj=0.413, (0 split)
##       Sepal.Width  < 2.95 to the left,  agree=0.67, adj=0.283, (0 split)
## 
## Node number 6: 54 observations
##   predicted class=versicolor  expected loss=0.09259259  P(node) =0.36
##     class counts:     0    49     5
##    probabilities: 0.000 0.907 0.093 
## 
## Node number 7: 46 observations
##   predicted class=virginica   expected loss=0.02173913  P(node) =0.3066667
##     class counts:     0     1    45
##    probabilities: 0.000 0.022 0.978
plot(fit, margin =0.1)
text(fit)
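
# The rpart.plot package draws a more readable tree (a sketch,
# assuming the package is installed):
# install.packages('rpart.plot')
library(rpart.plot)
rpart.plot(fit)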

plot(Petal.Width ~ Petal.Length, data = iris, col=iris$Species)
abline(v = 2.45, col='blue')
abline(h = 1.75, col='orange')

predicted <- predict(fit, iris, type= 'class')

table(iris$Species, predicted)
##             predicted
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         49         1
##   virginica       0          5        45
#(50 + 49 + 45 ) / 150
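# Equivalently, overall accuracy straight from the confusion table:
tb <- table(iris$Species, predicted)
sum(diag(tb)) / sum(tb)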

#install.packages('caret')
#install.packages('e1071')
library(caret)
## Warning: package 'caret' was built under R version 3.4.4
## Loading required package: lattice
cm<-table(predict(fit, iris[,1:4], type="class"), iris[,5])
confusionMatrix(cm)
## Confusion Matrix and Statistics
## 
##             
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         49         5
##   virginica       0          1        45
## 
## Overall Statistics
##                                          
##                Accuracy : 0.96           
##                  95% CI : (0.915, 0.9852)
##     No Information Rate : 0.3333         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.94           
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.9800           0.9000
## Specificity                 1.0000            0.9500           0.9900
## Pos Pred Value              1.0000            0.9074           0.9783
## Neg Pred Value              1.0000            0.9896           0.9519
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3267           0.3000
## Detection Prevalence        0.3333            0.3600           0.3067
## Balanced Accuracy           1.0000            0.9650           0.9450
df <- data.frame(x = c(1,2,3), y= c(2,3,4), points = c('a','b', 'a'))
plot(y ~ x, data = df, col=c('red', 'blue', 'green'))

plot(y ~ x, data = df, col=c(1,2,1))

str(df)
## 'data.frame':    3 obs. of  3 variables:
##  $ x     : num  1 2 3
##  $ y     : num  2 3 4
##  $ points: Factor w/ 2 levels "a","b": 1 2 1
plot(y ~ x, data = df, col=df$points)

set.seed(123)
sample.int(42,6)
## [1] 13 33 17 35 36  2
set.seed(123)
idx <- sample.int(2, nrow(iris), replace=TRUE, prob = c(0.7,0.3))
trainset <- iris[idx == 1,]
testset  <- iris[idx == 2,]
dim(trainset)
## [1] 106   5
dim(testset)
## [1] 44  5
fit <- rpart(Species ~ ., data= iris)
plot(fit, margin = 0.1)
text(fit)
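
# Caution: the tree above was fit on the full iris data, so the test rows
# leaked into training; for an honest estimate, fit on trainset only:
# fit <- rpart(Species ~ ., data = trainset)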

predicted <- predict(fit, testset, type= 'class')
tb <- table(testset$Species, predicted)

(15 + 13 + 13) / (15 + 14 + 15)  # correct predictions / test-set size
## [1] 0.9318182
confusionMatrix(tb)
## Confusion Matrix and Statistics
## 
##             predicted
##              setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      0         13         1
##   virginica       0          2        13
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9318          
##                  95% CI : (0.8134, 0.9857)
##     No Information Rate : 0.3409          
##     P-Value [Acc > NIR] : 2.711e-16       
##                                           
##                   Kappa : 0.8978          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.8667           0.9286
## Specificity                 1.0000            0.9655           0.9333
## Pos Pred Value              1.0000            0.9286           0.8667
## Neg Pred Value              1.0000            0.9333           0.9655
## Prevalence                  0.3409            0.3409           0.3182
## Detection Rate              0.3409            0.2955           0.2955
## Detection Prevalence        0.3409            0.3182           0.3409
## Balanced Accuracy           1.0000            0.9161           0.9310

Logistic Regression

dataset <- iris[iris$Species!= 'setosa', ]
dataset$Species <- factor(dataset$Species)

fit <- glm(Species ~., data = dataset, family='binomial')
summary(fit)
## 
## Call:
## glm(formula = Species ~ ., family = "binomial", data = dataset)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.01105  -0.00541  -0.00001   0.00677   1.78065  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)  
## (Intercept)   -42.638     25.707  -1.659   0.0972 .
## Sepal.Length   -2.465      2.394  -1.030   0.3032  
## Sepal.Width    -6.681      4.480  -1.491   0.1359  
## Petal.Length    9.429      4.737   1.991   0.0465 *
## Petal.Width    18.286      9.743   1.877   0.0605 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 138.629  on 99  degrees of freedom
## Residual deviance:  11.899  on 95  degrees of freedom
## AIC: 21.899
## 
## Number of Fisher Scoring iterations: 10
predicted <- predict(fit, dataset, type= 'response')
summary(predicted)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.00e+00 1.48e-05 5.37e-01 5.00e-01 1.00e+00 1.00e+00
res <- factor(ifelse(predicted > 0.5, 'virginica', 'versicolor'))
table(dataset$Species, res)
##             res
##              versicolor virginica
##   versicolor         49         1
##   virginica           1        49
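# Overall accuracy, straight from the table above ((49 + 49) / 100):
mean(res == dataset$Species)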

SVM

#install.packages('e1071')
library(e1071)
## Warning: package 'e1071' was built under R version 3.4.4
fit <- rpart(Species ~ ., data = iris)   # decision tree, for comparison
predicted <- predict(fit, iris, type = 'class')
table(iris$Species, predicted )
##             predicted
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         49         1
##   virginica       0          5        45
fit2 <- svm(Species ~., data = iris)
predicted2 <- predict(fit2, iris)
table(iris$Species, predicted2 )
##             predicted2
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         48         2
##   virginica       0          2        48
?svm
## starting httpd help server ... done
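
# svm() defaults to an RBF kernel; cost and gamma can be grid-searched
# with e1071's tune() (a sketch):
tuned <- tune(svm, Species ~ ., data = iris,
              ranges = list(cost = c(0.1, 1, 10), gamma = c(0.01, 0.1, 1)))
summary(tuned)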

Churn Analysis

library(C50)
## Warning: package 'C50' was built under R version 3.4.4
data(churn)
head(churnTrain)
##   state account_length     area_code international_plan voice_mail_plan
## 1    KS            128 area_code_415                 no             yes
## 2    OH            107 area_code_415                 no             yes
## 3    NJ            137 area_code_415                 no              no
## 4    OH             84 area_code_408                yes              no
## 5    OK             75 area_code_415                yes              no
## 6    AL            118 area_code_510                yes              no
##   number_vmail_messages total_day_minutes total_day_calls total_day_charge
## 1                    25             265.1             110            45.07
## 2                    26             161.6             123            27.47
## 3                     0             243.4             114            41.38
## 4                     0             299.4              71            50.90
## 5                     0             166.7             113            28.34
## 6                     0             223.4              98            37.98
##   total_eve_minutes total_eve_calls total_eve_charge total_night_minutes
## 1             197.4              99            16.78               244.7
## 2             195.5             103            16.62               254.4
## 3             121.2             110            10.30               162.6
## 4              61.9              88             5.26               196.9
## 5             148.3             122            12.61               186.9
## 6             220.6             101            18.75               203.9
##   total_night_calls total_night_charge total_intl_minutes total_intl_calls
## 1                91              11.01               10.0                3
## 2               103              11.45               13.7                3
## 3               104               7.32               12.2                5
## 4                89               8.86                6.6                7
## 5               121               8.41               10.1                3
## 6               118               9.18                6.3                6
##   total_intl_charge number_customer_service_calls churn
## 1              2.70                             1    no
## 2              3.70                             1    no
## 3              3.29                             0    no
## 4              1.78                             2    no
## 5              2.73                             3    no
## 6              1.70                             0    no
churnTrain <- churnTrain[ , ! names(churnTrain) %in% c('state', 'account_length', 'area_code')]

dim(churnTrain)
## [1] 3333   17
set.seed(2)
idx <- sample.int(2, nrow(churnTrain), replace=TRUE, prob=c(0.7,0.3))
trainset <- churnTrain[idx == 1, ]
testset  <- churnTrain[idx == 2, ]

library(rpart)
churn.rp <- rpart(churn ~ ., data = trainset)
predicted <- predict(churn.rp, testset, type = 'class')
table(testset$churn, predicted)
##      predicted
##       yes  no
##   yes 100  41
##   no   18 859
plot(churn.rp, margin= 0.1)
text(churn.rp)

# Manual 10-fold cross-validation: each row is assigned to one of 10 folds

idx <- sample.int(10, nrow(churnTrain), replace=TRUE)

for (i in 1:10){
  trainset <- churnTrain[idx != i, ]
  testset  <- churnTrain[idx == i, ]
  churn.rp <- rpart(churn ~., data = trainset)
  predicted <- predict(churn.rp, testset, type='class')
  #print(predicted)
  print( sum(predicted == testset$churn ) / length(predicted))
}
## [1] 0.9650794
## [1] 0.9309392
## [1] 0.9439528
## [1] 0.9391026
## [1] 0.925
## [1] 0.9345794
## [1] 0.942623
## [1] 0.9589443
## [1] 0.9464883
## [1] 0.9441341
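# To summarize the folds, store each accuracy and average it (a sketch;
# note the loop above reassigned trainset/testset to the last fold):
accs <- sapply(1:10, function(i){
  train_i <- churnTrain[idx != i, ]
  test_i  <- churnTrain[idx == i, ]
  model   <- rpart(churn ~ ., data = train_i)
  mean(predict(model, test_i, type = 'class') == test_i$churn)
})
mean(accs)
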
library(caret)
control <- trainControl(method="repeatedcv", number=10, repeats=3)
model   <- train(churn~., data=trainset, method="rpart", preProcess="scale", trControl=control)
predicted <- predict(model, testset)
table(testset$churn, predicted)
##      predicted
##       yes  no
##   yes  23  26
##   no    5 304

ROC Curve

fit <- rpart(churn ~ ., data = trainset)
predicted <- predict(fit, testset, type = 'prob')
#predicted
fpr <- c(0)   # start the ROC curve at (0, 0)
tpr <- c(0)

head(predicted[, 1])   # P(churn = 'yes') for the first few test rows
##          3         32         36         44         46         53 
## 0.11379310 0.11379310 0.02851426 0.02851426 0.02851426 0.02851426
for (i in seq(1, 0, by = -0.01)){
  # threshold P(yes) at i; as i falls, more customers are labeled 'yes'
  predicted_label <- factor(ifelse(predicted[, 'yes'] > i, 'yes', 'no'),
                            levels = c('yes', 'no'))
  tb <- table(testset$churn, predicted_label)
  TP <- tb['yes', 'yes']
  FP <- tb['no',  'yes']
  FN <- tb['yes', 'no']
  TN <- tb['no',  'no']
  tpr <- c(tpr, TP / (TP + FN))
  fpr <- c(fpr, FP / (FP + TN))
}
tpr <- c(tpr, 1)   # close the curve at (1, 1)
fpr <- c(fpr, 1)

plot(fpr, tpr, type = 'l')

#install.packages('ROCR')

library(ROCR)
## Warning: package 'ROCR' was built under R version 3.4.4
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.4.4
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
predictions <- predict(churn.rp, testset, type = "prob")
pred.to.roc <- predictions[, "yes"]   # score = P(churn = 'yes')

# label.ordering tells ROCR that 'no' is the negative and 'yes' the positive class
pred.rocr <- prediction(pred.to.roc, testset$churn, label.ordering = c("no", "yes"))

perf.rocr <- performance(pred.rocr, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr <- performance(pred.rocr, "tpr", "fpr")
plot(perf.tpr.rocr, colorize = TRUE, main = paste("AUC:", (perf.rocr@y.values)))
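
# The numeric AUC can be pulled out of the performance object:
auc <- perf.rocr@y.values[[1]]
auc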