library(openair)
library(dplyr)
##
## 載入套件:'dplyr'
## 下列物件被遮斷自 'package:stats':
##
## filter, lag
## 下列物件被遮斷自 'package:base':
##
## intersect, setdiff, setequal, union
library(readxl)
library(C50)
air_county_data = read_xlsx("C:/Users/USER/Desktop/NTHU_Master_degree1/NTHU_Big_Data/空汙資料/清洗乾淨資料集/all_county_data_clean.xlsx")
unique(air_county_data$county)
## [1] "新北市" "基隆市" "臺南市" "屏東縣" "臺北市" "桃園市" "新竹縣" "新竹市"
## [9] "苗栗縣" "臺中市" "彰化縣" "南投縣" "雲林縣" "嘉義縣" "嘉義市" "高雄市"
## [17] "臺東縣" "花蓮縣" "宜蘭縣" "連江縣" "金門縣" "澎湖縣"
data.mining = air_county_data
data.mining$county = as.factor(data.mining$county)
data.mining$date = as.factor(data.mining$date)
data.mining$aqi_category = as.factor(data.mining$aqi_category)
str(data.mining)
## tibble [7,346 × 11] (S3: tbl_df/tbl/data.frame)
## $ county : Factor w/ 22 levels "宜蘭縣","花蓮縣",..: 12 12 12 12 12 12 12 12 12 12 ...
## $ date : Factor w/ 334 levels "2022-02-01","2022-02-02",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ avg_aqi : num [1:7346] 36.9 36.6 35 35.7 44.2 ...
## $ avg_so2 : num [1:7346] 0.872 0.751 0.554 0.644 0.739 ...
## $ avg_co : num [1:7346] 0.257 0.306 0.304 0.267 0.294 ...
## $ avg_o3 : num [1:7346] 32.2 40.1 37.1 39.1 42.6 ...
## $ avg_pm10 : num [1:7346] 14.58 8.34 9.69 11.87 23.77 ...
## $ avg_pm2.5 : num [1:7346] 9.28 4.14 4.21 6.6 13.94 ...
## $ avg_nox : num [1:7346] 9.76 7.35 8.45 7.71 8.27 ...
## $ avg_winddirec: num [1:7346] 139.4 115.1 134.1 111.1 84.9 ...
## $ aqi_category : Factor w/ 4 levels "G","O","R","Y": 1 1 1 1 1 1 1 1 1 1 ...
# data.mining$avg_o3
# SVM
library(e1071)
# 未除去時間######
set.seed(111111)
index <- sample(2,nrow(data.mining),replace = TRUE,prob=c(0.8,0.2))
traindata <- data.mining[index==1,]
testdata <- data.mining[index==2,]
library(ggplot2)
county_vector <- testdata$county
# 建立 SVM 模型
air_svm_model <- svm(county ~ ., data = traindata)
# 使用 SVM 模型進行預測
air_svm_model_pred_1 <- predict(air_svm_model, newdata = testdata[, -1])
# 創建混淆矩陣
air_table_1 <- table(pred = air_svm_model_pred_1, true = county_vector)
# 將混淆矩陣轉為資料框
confusion_matrix <- as.data.frame(as.table(air_table_1))
# 繪製熱度圖
ggplot(confusion_matrix, aes(x = true, y = pred, fill = Freq)) +
geom_tile() +
geom_text(aes(label = Freq), vjust = 1) +
scale_fill_gradient(low = "white", high = "blue") +
labs(title = "Full parameter Confusion Matrix Heatmap", x = "True", y = "Predicted")

# 安裝並載入 doParallel 套件
#library(doParallel)
# 設定並行運算的核心數
#cores <- 8 # 你的計算機核心數
#cl <- makeCluster(cores)
#registerDoParallel(cl)
# 找出最好參數
# tune cost and gamma in SVM(soft-margin)
#tune.model = tune(svm,
# county~.,
# data=traindata,
# kernel="radial", # RBF kernel function
# range=list(cost=10^(-1:2), gamma=c(.5,1,2))# 調參數的最主要一行
#)
#summary(tune.model)
#print(tune.model$best.parameters)
#windows()
#plot(tune.model)
## 再次進行運算
# 在 svm 函數中使用 parallel = TRUE 參數
svm_model_Full.new <- svm(county ~ ., data = traindata, kernel = "radial", cost = 10, gamma = 0.5, parallel = TRUE)
# 使用 SVM 模型進行預測
svm_pred_Full.new <- predict(svm_model_Full.new, newdata = testdata[, -1])
# 創建混淆矩陣
air_table_1 <- table(pred = svm_pred_Full.new, true = county_vector)
# 將混淆矩陣轉為資料框
confusion_matrix_df <- as.data.frame(as.table(air_table_1))
# 繪製熱度圖
ggplot(confusion_matrix_df, aes(x = true, y = pred, fill = Freq)) +
geom_tile() +
geom_text(aes(label = Freq), vjust = 1) +
scale_fill_gradient(low = "white", high = "blue") +
labs(title = "New Full parameter Confusion Matrix Heatmap", x = "True", y = "Predicted")

# 停止並行運算
# stopCluster(cl)
### 除去時間及風向####
# data.mining = air_county_data[,c(-2,-10)]
# data.mining$county = as.factor(data.mining$county)
# data.mining$aqi_category = as.factor(data.mining$aqi_category)
#
#
# index <- sample(2,nrow(data.mining),replace = TRUE,prob=c(0.8,0.2))
# traindata <- data.mining[index==1,]
# testdata <- data.mining[index==2,]
#
#
# library(ggplot2)
#
# # 假設 "county" 是資料框中的欄位名稱
# county_vector <- testdata$county # 使用 testdata 的 county
#
# # 建立 SVM 模型
# air_svm_model <- svm(county ~ ., data = traindata, kernel = "radial", cost = 10, gamma = 0.5, parallel = TRUE)
#
# # 使用 SVM 模型進行預測
# air_svm_model_pred_1 <- predict(air_svm_model, newdata = testdata[, -1])
#
#
# # 創建混淆矩陣
# confusion_matrix <- table(pred = air_svm_model_pred_1, true = county_vector)
#
# # 計算準確度
# accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
#
# # 計算精確度和召回率
# precision <- confusion_matrix[2, 2] / sum(confusion_matrix[, 2]) # 假設正例是第二類
# recall <- confusion_matrix[2, 2] / sum(confusion_matrix[2, ])
#
# # 計算 F1 分數
# f1_score <- 2 * (precision * recall) / (precision + recall)
#
# # 打印結果
# print(c(precision, recall, accuracy, f1_score))