# Read the housing data; MVcat is the market-value category we will predict
data <- read.csv("MVcat.csv")
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
# Make sure the target is a factor and the predictors are numeric
data$MVcat <- as.factor(data$MVcat)
data$HouseAge <- as.numeric(data$HouseAge)
data$SquareFeet <- as.numeric(data$SquareFeet)
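# With five MVcat levels and few rows, it is worth checking the class
# balance before splitting; a quick base-R sanity check:
table(data$MVcat)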
set.seed(1234)

# Random 70/30 train/test split
sample_size <- floor(0.7 * nrow(data))
train_indices <- sample(seq_len(nrow(data)), size = sample_size, replace = FALSE)
training_data <- data[train_indices, ]
testing_data <- data[-train_indices, ]
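# sample() draws rows without regard to class, so with so few rows and five
# classes a split can easily leave a class underrepresented in training.
# A sketch of a stratified alternative using caret's createDataPartition
# (same 70% proportion; the strat_* variable names are illustrative):
strat_indices <- createDataPartition(data$MVcat, p = 0.7, list = FALSE)
strat_train <- data[strat_indices, ]
strat_test <- data[-strat_indices, ]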
# Fit a conditional inference tree on the training set
ctree_model <- ctree(MVcat ~ HouseAge + SquareFeet, data = training_data)
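# party's ctree objects can be plotted directly, which helps show why so few
# classes ever get predicted; a minimal sketch (plot() here is party's
# standard plotting method for ctree objects, not part of the original script):
plot(ctree_model, main = "Conditional inference tree for MVcat")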
# Predict on the held-out test set and build the confusion matrix
predictions <- predict(ctree_model, newdata = testing_data)
confusion_matrix <- confusionMatrix(predictions, testing_data$MVcat)
# summary() on a confusionMatrix object only lists its internal components;
# print() below gives the readable report
summary(confusion_matrix)
##          Length Class  Mode     
## positive  0     -none- NULL     
## table    25     table  numeric  
## overall   7     -none- numeric  
## byClass  55     -none- numeric  
## mode      1     -none- character
## dots      0     -none- list
print(confusion_matrix)
## Confusion Matrix and Statistics
## 
##              Reference
## Prediction    AVE HIGH LOW MEDIUM HIGH MEDIUM LOW
##   AVE           0    0   0           0          0
##   HIGH          0    0   0           0          0
##   LOW           1    0   3           0          0
##   MEDIUM HIGH   2    3   0           1          3
##   MEDIUM LOW    0    0   0           0          0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.3077          
##                  95% CI : (0.0909, 0.6143)
##     No Information Rate : 0.2308          
##     P-Value [Acc > NIR] : 0.3515          
##                                           
##                   Kappa : 0.2095          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: AVE Class: HIGH Class: LOW Class: MEDIUM HIGH
## Sensitivity              0.0000      0.0000     1.0000            1.00000
## Specificity              1.0000      1.0000     0.9000            0.33333
## Pos Pred Value              NaN         NaN     0.7500            0.11111
## Neg Pred Value           0.7692      0.7692     1.0000            1.00000
## Prevalence               0.2308      0.2308     0.2308            0.07692
## Detection Rate           0.0000      0.0000     0.2308            0.07692
## Detection Prevalence     0.0000      0.0000     0.3077            0.69231
## Balanced Accuracy        0.5000      0.5000     0.9500            0.66667
##                      Class: MEDIUM LOW
## Sensitivity                     0.0000
## Specificity                     1.0000
## Pos Pred Value                     NaN
## Neg Pred Value                  0.7692
## Prevalence                      0.2308
## Detection Rate                  0.0000
## Detection Prevalence            0.0000
## Balanced Accuracy               0.5000
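# The headline numbers do not have to be read off the printout; caret stores
# them in the object's $overall vector (element names below are the ones
# confusionMatrix returns):
confusion_matrix$overall["Accuracy"]                            # 0.3077
confusion_matrix$overall[c("AccuracyLower", "AccuracyUpper")]   # the 95% CI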
# Correct-prediction counts: rows of the table are predictions, columns are
# the reference (actual) classes
ave_predicted_as_ave <- confusion_matrix$table["AVE", "AVE"]
high_predicted_as_high <- confusion_matrix$table["HIGH", "HIGH"]
low_predicted_as_low <- confusion_matrix$table["LOW", "LOW"]
medium_high_predicted_as_medium_high <- confusion_matrix$table["MEDIUM HIGH", "MEDIUM HIGH"]
medium_low_predicted_as_medium_low <- confusion_matrix$table["MEDIUM LOW", "MEDIUM LOW"]
#How many actual "AVE" values from the test data were predicted as "AVE"?
#=0

#How many actual "HIGH" values from the test data were predicted as "HIGH"?
#=0

#How many actual "LOW" values from the test data were predicted as "LOW"?
#=3

#How many actual "MEDIUM HIGH" values from the test data were predicted as "MEDIUM HIGH"?
#=1

#How many actual "MEDIUM LOW" values from the test data were predicted as "MEDIUM LOW"?
#=0
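# The five answers above are just the diagonal of the confusion matrix, so
# they can be read in a single call; a sketch using base R's diag()
# (per the table above this should give 0, 0, 3, 1, 0):
diag(confusion_matrix$table)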
#The classification went poorly: the model reached only 30.77% accuracy (barely above the 23.08% no-information rate), most likely because of the limited amount of data collected.
#To improve the results, I would suggest collecting more data, since the dataset currently has only 42 observations; with so few rows the tree cannot learn reliable splits, and the test estimates are too noisy to avoid false conclusions.
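# With only 42 observations, a single 70/30 split leaves a 13-row test set
# and a very noisy accuracy estimate. A sketch of 5-fold cross-validation
# with caret, as a way to use every row for both fitting and validation
# (method = "ctree" is caret's wrapper around party::ctree; the fold count
# and the cv_* variable names are assumptions, not part of the original analysis):
set.seed(1234)
cv_control <- trainControl(method = "cv", number = 5)
cv_model <- train(MVcat ~ HouseAge + SquareFeet,
                  data = data,
                  method = "ctree",
                  trControl = cv_control)
cv_model$results   # cross-validated accuracy/kappa per tuning value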