lvr_data <- read.csv('lvr_prices_windows.csv')
lvr_new <- lvr_data[
lvr_data$total_price < 100000000 &
lvr_data$building_sqmeter< 1000 &
lvr_data$area == '大安區' &
lvr_data$building_sqmeter > 0,]
plot(total_price ~ building_sqmeter, data = lvr_new)
fit <- lm(total_price ~ building_sqmeter, data = lvr_new)
abline(fit, col="red")
class(lvr_data$trading_ymd)
## [1] "factor"
lvr_data$trading_ymd <- as.Date(lvr_data$trading_ymd)
lvr_data$finish_ymd <- as.Date(lvr_data$finish_ymd)
lvr_data$house_age <- as.numeric((lvr_data$trading_ymd - lvr_data$finish_ymd)/ 365)
str(lvr_data)
fit2 <- lm(total_price ~ management + compartment + parking_type + parking_sqmeter + house_age + room + living_room + bath + building_sqmeter + main_purpose + built_with + trading_target + land_sqmeter + city_land_type + area, data = lvr_data)
step(fit2)
library(rpart)
## Warning: package 'rpart' was built under R version 3.2.5
data(iris)
fit <- rpart(formula= Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = iris )
fit
## n= 150
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 150 100 setosa (0.33333333 0.33333333 0.33333333)
## 2) Petal.Length< 2.45 50 0 setosa (1.00000000 0.00000000 0.00000000) *
## 3) Petal.Length>=2.45 100 50 versicolor (0.00000000 0.50000000 0.50000000)
## 6) Petal.Width< 1.75 54 5 versicolor (0.00000000 0.90740741 0.09259259) *
## 7) Petal.Width>=1.75 46 1 virginica (0.00000000 0.02173913 0.97826087) *
plot(fit, margin = 0.1)
text(fit)
plot(Petal.Length ~ Petal.Width, data=iris, col=Species)
abline(h = 2.45, col="blue")
abline(v = 1.75, col="orange")
predicted_prob <- predict(fit, data = iris)
predicted <- predict(fit, data = iris, type ="class")
table(predicted, iris$Species)
##
## predicted setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 49 5
## virginica 0 1 45
#install.packages('caret')
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
cm <- table(predicted, iris$Species)
confusionMatrix(cm)
## Confusion Matrix and Statistics
##
##
## predicted setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 49 5
## virginica 0 1 45
##
## Overall Statistics
##
## Accuracy : 0.96
## 95% CI : (0.915, 0.9852)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.94
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9800 0.9000
## Specificity 1.0000 0.9500 0.9900
## Pos Pred Value 1.0000 0.9074 0.9783
## Neg Pred Value 1.0000 0.9896 0.9519
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3267 0.3000
## Detection Prevalence 0.3333 0.3600 0.3067
## Balanced Accuracy 1.0000 0.9650 0.9450
set.seed(123)
idx <- sample.int(2, nrow(iris), replace=TRUE, prob=c(0.7,0.3))
table(idx)
## idx
## 1 2
## 106 44
trainset <- iris[idx == 1,]
testset <- iris[idx == 2,]
dim(trainset)
## [1] 106 5
dim(testset)
## [1] 44 5
a <- c(1,2,3,4,5)
idx2 <- c(1,0,1,0,1)
idx2 == 1
## [1] TRUE FALSE TRUE FALSE TRUE
a[idx2 == 1]
## [1] 1 3 5
# . 代表所有變數
fit2 <- rpart(Species ~. , data = trainset)
predicted2 <- predict(fit2, testset, type="class")
cm2 <- table(predicted2, testset$Species)
confusionMatrix(cm2)
## Confusion Matrix and Statistics
##
##
## predicted2 setosa versicolor virginica
## setosa 15 0 0
## versicolor 0 10 1
## virginica 0 4 14
##
## Overall Statistics
##
## Accuracy : 0.8864
## 95% CI : (0.7544, 0.9621)
## No Information Rate : 0.3409
## P-Value [Acc > NIR] : 8.552e-14
##
## Kappa : 0.8291
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.7143 0.9333
## Specificity 1.0000 0.9667 0.8621
## Pos Pred Value 1.0000 0.9091 0.7778
## Neg Pred Value 1.0000 0.8788 0.9615
## Prevalence 0.3409 0.3182 0.3409
## Detection Rate 0.3409 0.2273 0.3182
## Detection Prevalence 0.3409 0.2500 0.4091
## Balanced Accuracy 1.0000 0.8405 0.8977
fit2
## n= 106
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 106 70 versicolor (0.33018868 0.33962264 0.33018868)
## 2) Petal.Length< 2.45 35 0 setosa (1.00000000 0.00000000 0.00000000) *
## 3) Petal.Length>=2.45 71 35 versicolor (0.00000000 0.50704225 0.49295775)
## 6) Petal.Length< 4.75 34 0 versicolor (0.00000000 1.00000000 0.00000000) *
## 7) Petal.Length>=4.75 37 2 virginica (0.00000000 0.05405405 0.94594595) *
plot(fit2, margin = 0.1)
text(fit2)
plot(Petal.Length ~ Petal.Width, data=iris, col=Species)
abline(h = 2.45, col="blue")
abline(h = 4.75, col="orange")
#install.packages("C50")
library(C50)
## Warning: package 'C50' was built under R version 3.2.5
data(churn)
str(churnTrain)
## 'data.frame': 3333 obs. of 20 variables:
## $ state : Factor w/ 51 levels "AK","AL","AR",..: 17 36 32 36 37 2 20 25 19 50 ...
## $ account_length : int 128 107 137 84 75 118 121 147 117 141 ...
## $ area_code : Factor w/ 3 levels "area_code_408",..: 2 2 2 1 2 3 3 2 1 2 ...
## $ international_plan : Factor w/ 2 levels "no","yes": 1 1 1 2 2 2 1 2 1 2 ...
## $ voice_mail_plan : Factor w/ 2 levels "no","yes": 2 2 1 1 1 1 2 1 1 2 ...
## $ number_vmail_messages : int 25 26 0 0 0 0 24 0 0 37 ...
## $ total_day_minutes : num 265 162 243 299 167 ...
## $ total_day_calls : int 110 123 114 71 113 98 88 79 97 84 ...
## $ total_day_charge : num 45.1 27.5 41.4 50.9 28.3 ...
## $ total_eve_minutes : num 197.4 195.5 121.2 61.9 148.3 ...
## $ total_eve_calls : int 99 103 110 88 122 101 108 94 80 111 ...
## $ total_eve_charge : num 16.78 16.62 10.3 5.26 12.61 ...
## $ total_night_minutes : num 245 254 163 197 187 ...
## $ total_night_calls : int 91 103 104 89 121 118 118 96 90 97 ...
## $ total_night_charge : num 11.01 11.45 7.32 8.86 8.41 ...
## $ total_intl_minutes : num 10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
## $ total_intl_calls : int 3 3 5 7 3 6 7 6 4 5 ...
## $ total_intl_charge : num 2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
## $ number_customer_service_calls: int 1 1 0 2 3 0 3 0 1 0 ...
## $ churn : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...
churnTrain = churnTrain[,! names(churnTrain) %in% c("state", "area_code", "account_length") ]
set.seed(2)
ind <- sample(2, nrow(churnTrain), replace = TRUE, prob=c(0.7, 0.3))
trainset <- churnTrain[ind == 1,]
testset <- churnTrain[ind == 2,]
dim(trainset)
## [1] 2315 17
dim(testset)
## [1] 1018 17
set.seed(2)
sample(42,6)
## [1] 8 29 23 7 36 35
sample(42,6)
## [1] 6 35 19 22 39 9
sample(42,6)
## [1] 32 8 17 34 38 9
churn.rp <- rpart(churn ~ ., data=trainset)
plot(churn.rp, margin= 0.1)
text(churn.rp, all=TRUE, use.n = TRUE)
min_error_split <- which.min(churn.rp$cptable[,'xerror'])
min_error_split <- 3
churn.cp <- churn.rp$cptable[min_error_split ,"CP"]
prune.tree <- prune(churn.rp, cp= churn.cp)
plot(prune.tree, margin = 0.1)
text(prune.tree)
predictions <- predict(prune.tree, testset, type="class")
churn_table <- table(predictions, testset$churn)
confusionMatrix(churn_table)
## Confusion Matrix and Statistics
##
##
## predictions yes no
## yes 64 10
## no 77 867
##
## Accuracy : 0.9145
## 95% CI : (0.8957, 0.931)
## No Information Rate : 0.8615
## P-Value [Acc > NIR] : 1.266e-07
##
## Kappa : 0.5527
## Mcnemar's Test P-Value : 1.484e-12
##
## Sensitivity : 0.45390
## Specificity : 0.98860
## Pos Pred Value : 0.86486
## Neg Pred Value : 0.91843
## Prevalence : 0.13851
## Detection Rate : 0.06287
## Detection Prevalence : 0.07269
## Balanced Accuracy : 0.72125
##
## 'Positive' Class : yes
##
table(testset$churn)
##
## yes no
## 141 877
877 / (877 + 141)
## [1] 0.8614931
idx <- sample.int(10, nrow(churnTrain), replace=TRUE)
accu_vector = c()
for (i in seq(1,10)){
trainset <- churnTrain[idx == i,]
testset <- churnTrain[idx != i,]
fit <- rpart(churn ~., trainset)
predictions <- predict(fit, testset, type = 'class')
accu_vector <- c(accu_vector, confusionMatrix(predictions, testset$churn)$overall[1])
}
library(caret)
control = trainControl(method="repeatedcv", number=10, repeats=3)
model = train(churn~., data=trainset, method="rpart", preProcess="scale", trControl=control)
model
## CART
##
## 348 samples
## 16 predictor
## 2 classes: 'yes', 'no'
##
## Pre-processing: scaled (16)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 314, 313, 313, 313, 313, 313, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.03333333 0.8687395 0.4797714
## 0.08333333 0.8802241 0.4647340
## 0.35000000 0.8553501 0.2702799
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.08333333.
set.seed(2)
ind <- sample(2, nrow(churnTrain), replace = TRUE, prob=c(0.7, 0.3))
trainset <- churnTrain[ind == 1,]
testset <- churnTrain[ind == 2,]
fit <- rpart(churn ~., trainset)
library(caret)
predictions <- predict(fit, testset)
x <- c()
y <- c()
for (t in seq(0,1,0.1)){
predict_over_threshold <- ifelse(predictions[,1] > t, '0', '1' )
predict_over_threshold <- as.factor(predict_over_threshold)
levels(predict_over_threshold) <- c('yes', 'no')
cm <- table(predict_over_threshold, testset$churn)
y <- c(y, confusionMatrix(cm)$byClass[1])
x <- c(x, 1- confusionMatrix(cm)$byClass[2])
}
plot(x,y, type='p')
lines(sort(x),y[order(x)], col="red")
## use rocr
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.2.5
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.2.5
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
predictions <- predict(churn.rp, testset, type="prob")
pred.to.roc <- predictions[, 1]
pred.rocr <- prediction(pred.to.roc, as.factor(testset[,(dim(testset)[[2]])]))
perf.rocr <- performance(pred.rocr, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr <- performance(pred.rocr, "tpr","fpr")
plot(perf.tpr.rocr, colorize=T,main=paste("AUC:",(perf.rocr@y.values)))