This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the **Knit** button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(mlbench)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# 1. PREPARE DATASET ----
# Read the raw air-quality CSV; character columns are loaded as factors.
# NOTE(review): the path points at one user's Downloads folder — adjust it
# (or move the file into the project) before running on another machine.
AirQualityDataset <- read.csv(
  "~/Downloads/Air_Quality(in).csv",
  stringsAsFactors = TRUE
)
# Keep an untouched copy so the raw data can still be inspected after the
# cleaning steps overwrite AirQualityDataset.
AirQualityDataset_Not_Altered <- AirQualityDataset
# View() opens a GUI data viewer. It is only meaningful in an interactive
# session and can fail when the document is knitted non-interactively, so it
# is skipped in that case.
if (interactive()) {
  View(AirQualityDataset_Not_Altered)
}
# Print the first rows as a quick sanity check of the import.
head(AirQualityDataset)
## Unique.ID Indicator.ID Name Measure Measure.Info
## 1 336867 375 Nitrogen dioxide (NO2) Mean ppb
## 2 336741 375 Nitrogen dioxide (NO2) Mean ppb
## 3 550157 375 Nitrogen dioxide (NO2) Mean ppb
## 4 412802 375 Nitrogen dioxide (NO2) Mean ppb
## 5 412803 375 Nitrogen dioxide (NO2) Mean ppb
## 6 412676 375 Nitrogen dioxide (NO2) Mean ppb
## Geo.Type.Name Geo.Join.ID Geo.Place.Name
## 1 CD 407 Flushing and Whitestone (CD7)
## 2 CD 107 Upper West Side (CD7)
## 3 CD 414 Rockaway and Broad Channel (CD14)
## 4 CD 407 Flushing and Whitestone (CD7)
## 5 CD 407 Flushing and Whitestone (CD7)
## 6 CD 107 Upper West Side (CD7)
## Time.Period Start_Date Data.Value Message
## 1 Winter 2014-15 12/1/2014 23.97 NA
## 2 Winter 2014-15 12/1/2014 27.42 NA
## 3 Annual Average 2017 1/1/2017 12.55 NA
## 4 Winter 2015-16 12/1/2015 22.63 NA
## 5 Summer 2016 6/1/2016 14.00 NA
## 6 Winter 2015-16 12/1/2015 26.43 NA
#summary(AirQualityDataset)
# Print the structure of the raw data: one line per column with its type and
# first few values.
str(AirQualityDataset)
## 'data.frame': 18862 obs. of 12 variables:
## $ Unique.ID : int 336867 336741 550157 412802 412803 412676 412677 603044 412804 825832 ...
## $ Indicator.ID : int 375 375 375 375 375 375 375 375 375 375 ...
## $ Name : Factor w/ 18 levels "Annual vehicle miles traveled",..: 14 14 14 14 14 14 14 14 14 14 ...
## $ Measure : Factor w/ 8 levels "Annual average concentration",..: 6 6 6 6 6 6 6 6 6 6 ...
## $ Measure.Info : Factor w/ 8 levels "µg/m3","mcg/m3",..: 8 8 8 8 8 8 8 8 8 8 ...
## $ Geo.Type.Name : Factor w/ 5 levels "Borough","CD",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Geo.Join.ID : int 407 107 414 407 407 107 107 314 407 107 ...
## $ Geo.Place.Name: Factor w/ 114 levels "Bay Ridge and Dyker Heights (CD10)",..: 38 107 82 38 38 107 107 35 38 107 ...
## $ Time.Period : Factor w/ 57 levels "2005","2005-2007",..: 49 49 21 50 35 50 35 22 20 56 ...
## $ Start_Date : Factor w/ 46 levels "1/1/2005","1/1/2009",..: 22 22 8 23 39 23 39 9 31 29 ...
## $ Data.Value : num 24 27.4 12.6 22.6 14 ...
## $ Message : logi NA NA NA NA NA NA ...
# Remove the columns that are not used as predictors, selecting by name.
columns_to_drop <- c(
  "Unique.ID", "Indicator.ID", "Geo.Place.Name",
  "Geo.Join.ID", "Start_Date", "Message"
)
is_kept <- !(names(AirQualityDataset) %in% columns_to_drop)
AirQualityDataset <- AirQualityDataset[, is_kept]
# Count the missing values left in the remaining columns.
sum(is.na(AirQualityDataset))
## [1] 0
# Drop any row that still contains an NA.
AirQualityDataset <- na.omit(AirQualityDataset)
# Store every categorical column as a factor.
factor_columns <- c(
  "Name", "Measure", "Measure.Info", "Geo.Type.Name", "Time.Period"
)
for (factor_column in factor_columns) {
  AirQualityDataset[[factor_column]] <-
    as.factor(AirQualityDataset[[factor_column]])
}
# Print the structure of the cleaned data.
str(AirQualityDataset)
## 'data.frame': 18862 obs. of 6 variables:
## $ Name : Factor w/ 18 levels "Annual vehicle miles traveled",..: 14 14 14 14 14 14 14 14 14 14 ...
## $ Measure : Factor w/ 8 levels "Annual average concentration",..: 6 6 6 6 6 6 6 6 6 6 ...
## $ Measure.Info : Factor w/ 8 levels "µg/m3","mcg/m3",..: 8 8 8 8 8 8 8 8 8 8 ...
## $ Geo.Type.Name: Factor w/ 5 levels "Borough","CD",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Time.Period : Factor w/ 57 levels "2005","2005-2007",..: 49 49 21 50 35 50 35 22 20 56 ...
## $ Data.Value : num 24 27.4 12.6 22.6 14 ...
# List the distinct values of the Measure.Info (unit) column.
unique(AirQualityDataset$Measure.Info)
## [1] ppb mcg/m3 number
## [4] per 100,000 adults per square mile per 100,000 children
## [7] µg/m3 per 100,000
## 8 Levels: µg/m3 mcg/m3 number per 100,000 ... ppb
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Collapse alternative unit labels into the single "mcg/m3" level.
# Names are the labels to replace; values are the label they become.
# NOTE(review): "per 100,000" reads like a population rate rather than a
# concentration, so folding it into "mcg/m3" may be unintended — confirm.
unit_aliases <- c(
  "µg/m3" = "mcg/m3",
  "per 100,000" = "mcg/m3"
)
# `!!!` splices the lookup vector into recode() as old = new arguments.
AirQualityDataset <- mutate(
  AirQualityDataset,
  Measure.Info = recode(Measure.Info, !!!unit_aliases)
)
# Fix the RNG seed so the shuffle, and therefore the train/test split, is the
# same on every run.
set.seed(123)
# Shuffle the rows
shuffled_data <- AirQualityDataset[sample(nrow(AirQualityDataset)), ]
# Number of rows that go into the training set (70%)
split_index <- floor(0.7 * nrow(shuffled_data))
# Split into training and test sets with a logical mask. Unlike `1:n` and
# `(n + 1):nrow(...)`, the mask stays correct when split_index is 0 or equals
# the row count: one set is simply empty and no row lands in both sets.
is_train_row <- seq_len(nrow(shuffled_data)) <= split_index
AirQualityDataset_Train <- shuffled_data[is_train_row, ]
AirQualityDataset_Test <- shuffled_data[!is_train_row, ]
# Report the size of each partition.
cat("Training rows:", nrow(AirQualityDataset_Train), "\n")
## Training rows: 13203
cat("Testing rows:", nrow(AirQualityDataset_Test), "\n")
## Testing rows: 5659
# Print the structure of the training partition.
str(AirQualityDataset_Train)
## 'data.frame': 13203 obs. of 6 variables:
## $ Name : Factor w/ 18 levels "Annual vehicle miles traveled",..: 13 13 14 16 14 14 17 14 14 9 ...
## $ Measure : Factor w/ 8 levels "Annual average concentration",..: 6 6 6 1 6 6 6 6 6 8 ...
## $ Measure.Info : Factor w/ 6 levels "mcg/m3","number",..: 1 1 6 1 6 6 6 6 6 2 ...
## $ Geo.Type.Name: Factor w/ 5 levels "Borough","CD",..: 5 5 5 5 5 4 5 2 2 5 ...
## $ Time.Period : Factor w/ 57 levels "2005","2005-2007",..: 32 52 17 8 43 24 30 31 53 7 ...
## $ Data.Value : num 10.17 7.57 14.69 1.57 30.17 ...
# View() opens a GUI data viewer; skip it when the document is knitted
# non-interactively, where it can fail.
if (interactive()) {
  View(AirQualityDataset_Train)
}
# 2. SVM FOR REGRESSION MODEL ----
# Resampling scheme: 10-fold cross-validation repeated 3 times.
# Cheaper alternative: trainControl(method = "cv", number = 10)
# NOTE: this object shares its name with caret's trainControl() function.
# Later calls to trainControl(...) still resolve to the function, because R
# skips non-function objects when looking up a call.
trainControl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
# NOTE(review): caret documents svmRadial/svmLinear as kernlab-backed, so this
# library may be unnecessary — confirm before removing.
library(e1071)
# Use a smaller training set to keep model fitting tractable. The seed is set
# before sampling so the same rows are drawn on every run, and the sample size
# is capped at the number of available rows so sample() cannot error.
# Author's note: bigger difference between 5000-10000 than 10000-15000
# (presumably rows vs. fitting time or accuracy — confirm).
set.seed(123)
sample_size <- min(2500, nrow(AirQualityDataset_Train))
sample_train <- AirQualityDataset_Train[
  sample(nrow(AirQualityDataset_Train), sample_size),
]
# Fit both kernels on the same subsample. The seed is reset before each
# train() call so both models start from the same RNG state.
set.seed(123)
fit.svm <- train(
  Data.Value ~ .,
  data = sample_train,
  method = "svmRadial",
  trControl = trainControl
)
set.seed(123)
fit.svm1 <- train(
  Data.Value ~ .,
  data = sample_train,
  method = "svmLinear",
  trControl = trainControl
)
# Show the cross-validated results for the radial-kernel SVM.
print(fit.svm)
## Support Vector Machines with Radial Basis Function Kernel
##
## 2500 samples
## 5 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 2250, 2250, 2250, 2250, 2249, 2252, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 14.92878 0.5883150 5.558150
## 0.50 14.30274 0.6062455 5.278882
## 1.00 14.15867 0.6033759 5.179564
##
## Tuning parameter 'sigma' was held constant at a value of 0.006345187
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.006345187 and C = 1.
# Show the cross-validated results for the linear-kernel SVM.
print(fit.svm1)
## Support Vector Machines with Linear Kernel
##
## 2500 samples
## 5 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 2250, 2250, 2250, 2250, 2249, 2252, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 13.89192 0.6149383 5.529905
##
## Tuning parameter 'C' was held constant at a value of 1
# Visualization: SVM with radial kernel
# Predict the held-out test set with the radial-kernel model.
predictions <- predict(fit.svm, newdata = AirQualityDataset_Test)
# Actual vs Predicted Plot
plot(
  AirQualityDataset_Test$Data.Value, predictions,
  main = "Actual vs Predicted (SVM Radial)",
  xlab = "Actual Values",
  ylab = "Predicted Values",
  col = "blue", pch = 19
)
abline(0, 1, col = "red") # ideal line: predicted == actual
# Calculate residuals (actual minus predicted)
residuals <- AirQualityDataset_Test$Data.Value - predictions
# Residuals vs Predicted Plot
plot(
  predictions, residuals,
  main = "Residuals vs Predicted",
  xlab = "Predicted",
  ylab = "Residuals",
  col = "darkgreen", pch = 19
)
abline(h = 0, col = "red") # zero-residual reference line
# Distribution of the residuals
hist(
  residuals,
  main = "Histogram of Residuals",
  col = "purple",
  xlab = "Residuals"
)
# Visualization: SVM with linear kernel
# Predict the held-out test set with the linear-kernel model.
predictions1 <- predict(fit.svm1, newdata = AirQualityDataset_Test)
# Actual vs Predicted Plot
plot(
  AirQualityDataset_Test$Data.Value, predictions1,
  main = "Actual vs Predicted (SVM Linear)",
  xlab = "Actual Values",
  ylab = "Predicted Values",
  col = "blue", pch = 19
)
abline(0, 1, col = "red") # ideal line: predicted == actual
# Calculate residuals (actual minus predicted)
residuals1 <- AirQualityDataset_Test$Data.Value - predictions1
# Residuals vs Predicted Plot
plot(
  predictions1, residuals1,
  main = "Residuals vs Predicted",
  xlab = "Predicted",
  ylab = "Residuals",
  col = "darkgreen", pch = 19
)
abline(h = 0, col = "red") # zero-residual reference line
# Distribution of the residuals
hist(
  residuals1,
  main = "Histogram of Residuals",
  col = "purple",
  xlab = "Residuals"
)
You can also embed plots, for example:
Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.