# Load the housing data set from the working directory.
data <- read.csv(file = "MVcat.csv")
library(caret)
## 载入需要的程序包:ggplot2
## 载入需要的程序包:lattice
library(party)
## 载入需要的程序包:grid
## 载入需要的程序包:mvtnorm
## 载入需要的程序包:modeltools
## 载入需要的程序包:stats4
## 载入需要的程序包:strucchange
## 载入需要的程序包:zoo
##
## 载入程序包:'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## 载入需要的程序包:sandwich
# Treat the response as categorical so it is modelled as a set of class labels.
data$MVcat <- as.factor(data$MVcat)

# Convert a column to numeric without corrupting factor input.
# as.numeric() applied directly to a factor returns the internal level codes
# (1, 2, 3, ...) rather than the original values, which can happen when
# read.csv() is run with stringsAsFactors = TRUE. Factors therefore go through
# their character labels first; numeric and character input is converted
# exactly as plain as.numeric() would.
to_numeric <- function(x) {
  if (is.factor(x)) {
    x <- as.character(x)
  }
  as.numeric(x)
}

# Make sure both predictors are numeric before modelling.
data$HouseAge <- to_numeric(data$HouseAge)
data$SquareFeet <- to_numeric(data$SquareFeet)
# Reproducible 70/30 train/test split: fix the RNG seed, then draw 70% of the
# row numbers (rounded down) without replacement for the training set.
set.seed(1234)
n_rows <- nrow(data)
sample_size <- floor(0.7 * n_rows)
train_indices <- sample.int(n_rows, size = sample_size)

# The sampled rows train the model; the rows left over are held out for testing.
training_data <- data[train_indices, ]
testing_data <- data[-train_indices, ]
# Fit a tree with party::ctree() that predicts MVcat from house age and square
# footage, using the training rows only.
ctree_model <- ctree(
  formula = MVcat ~ HouseAge + SquareFeet,
  data = training_data
)

# Predict a class for each held-out row and cross-tabulate the predictions
# against the actual labels.
predictions <- predict(ctree_model, newdata = testing_data)
confusion_matrix <- confusionMatrix(
  data = predictions,
  reference = testing_data$MVcat
)

# Note: summary() on this object only lists its components (see the output
# below); the statistics themselves are shown by print().
summary(confusion_matrix)
## Length Class Mode
## positive 0 -none- NULL
## table 25 table numeric
## overall 7 -none- numeric
## byClass 55 -none- numeric
## mode 1 -none- character
## dots 0 -none- list
# Show the full cross-table together with the overall and per-class statistics.
print(confusion_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction AVE HIGH LOW MEDIUM HIGH MEDIUM LOW
## AVE 0 0 0 0 0
## HIGH 0 0 0 0 0
## LOW 1 0 3 0 0
## MEDIUM HIGH 2 3 0 1 3
## MEDIUM LOW 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.3077
## 95% CI : (0.0909, 0.6143)
## No Information Rate : 0.2308
## P-Value [Acc > NIR] : 0.3515
##
## Kappa : 0.2095
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: AVE Class: HIGH Class: LOW Class: MEDIUM HIGH
## Sensitivity 0.0000 0.0000 1.0000 1.00000
## Specificity 1.0000 1.0000 0.9000 0.33333
## Pos Pred Value NaN NaN 0.7500 0.11111
## Neg Pred Value 0.7692 0.7692 1.0000 1.00000
## Prevalence 0.2308 0.2308 0.2308 0.07692
## Detection Rate 0.0000 0.0000 0.2308 0.07692
## Detection Prevalence 0.0000 0.0000 0.3077 0.69231
## Balanced Accuracy 0.5000 0.5000 0.9500 0.66667
## Class: MEDIUM LOW
## Sensitivity 0.0000
## Specificity 1.0000
## Pos Pred Value NaN
## Neg Pred Value 0.7692
## Prevalence 0.2308
## Detection Rate 0.0000
## Detection Prevalence 0.0000
## Balanced Accuracy 0.5000
# Pull the diagonal cells of the confusion table: for each class, the number of
# test rows whose predicted label equals the actual label.
cm_table <- confusion_matrix$table
correct_count <- function(label) {
  cm_table[label, label]
}

ave_predicted_as_ave <- correct_count("AVE")
high_predicted_as_high <- correct_count("HIGH")
low_predicted_as_low <- correct_count("LOW")
medium_high_predicted_as_medium_high <- correct_count("MEDIUM HIGH")
medium_low_predicted_as_medium_low <- correct_count("MEDIUM LOW")
#How many actual "AVE" from the test data got predicted as "AVE"?
#=0
#How many actual "HIGH" from the test data got predicted as "HIGH"?
#=0
#How many actual "LOW" from the test data got predicted as "LOW"?
#=3
#How many actual "MEDIUM HIGH" from the test data got predicted as "MEDIUM HIGH"?
#=1
#How many actual "MEDIUM LOW" from the test data got predicted as "MEDIUM LOW"?
#=0
#The classification went poorly: the model's accuracy is only 30.77%, which is likely a consequence of the limited amount of data collected.
#To improve the results, I would suggest collecting more data, since the data set currently has only 42 observations; a larger sample should make the model more accurate and help avoid false conclusions.