Data source 1: https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data
Data source 2: https://ourworldindata.org/grapher/share-deaths-heart-disease?tab=chart&country=~MYS

1. Four ways to get an initial understanding of the data

This is a study of differences in heart disease mortality across countries. When I receive a dataset, I get to know it from the following four aspects (a short sketch of these checks follows the list).
First, I look for dirty data.
Second, I calculate the mean, median and mode of the data.
Third, I check whether there are missing values in the categorical variables.
Fourth, I check the data type and structure of each column.
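As a minimal sketch of these four checks, assuming a hypothetical data frame df with a numeric column x and a categorical column g (not one of the datasets used below):

# 1. Look for dirty data, e.g. placeholder values such as "?"
sum(df == "?", na.rm = TRUE)
# 2. Mean, median and mode of a numeric column (the mode is taken as the most frequent value)
mean(df$x, na.rm = TRUE)
median(df$x, na.rm = TRUE)
names(sort(table(df$x), decreasing = TRUE))[1]
# 3. Missing values in a categorical column
sum(is.na(df$g))
# 4. Data type and structure of every column
str(df)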


2. Four ways of subsetting / choosing rows or columns

In terms of extracting rows/columns, I mainly use the following means (a small sketch of all four follows the list).
First, I select a single element by its coordinates, for example: df1[2, 3]
Second, I select by row and column names, for example: df1["a", "weight"]
Third, I extract several elements at once through an index matrix, with one row per element and one column per dimension, for example: df1[cbind(c(1, 2, 1), 3:1)]
Fourth, I select entire rows or columns by leaving one dimension empty, for example: df1[2, ] fetches a row and df1[, 3] fetches a column
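A minimal sketch of these four subsetting approaches, using a small hypothetical data frame df1 with row names "a", "b", "c" and the columns weight, height and age (illustrative data only):

# Hypothetical example data frame
df1 <- data.frame(weight = c(60, 72, 81), height = c(1.60, 1.80, 1.75), age = c(30, 41, 25),
                  row.names = c("a", "b", "c"))
# 1. A single element by its coordinates (row 2, column 3)
df1[2, 3]
# 2. A single element by row and column name
df1["a", "weight"]
# 3. Several elements at once through a two-column index matrix (row index, column index)
df1[cbind(c(1, 2, 1), 3:1)]
# 4. Entire rows or columns by leaving one dimension empty
df1[2, ]   # the second row
df1[, 3]   # the third column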

# Import the dataset about death from heart disease between 1990 and 2017
df_country <- read.csv("C:/Users/AndrewSzl/Desktop/UM/WQD7001 PRINCIPLES OF DATA SCIENCE/DataSet_Group_Assignment/Descriptive modelling/share-deaths-heart-disease.csv")
# Import the Cleveland dataset
colNames <- c("age","sex","cp","trestbps","chol","fbs","restecg","thalach","exang","oldpeak","slope","ca","thal","num")
df_clev <- read.csv("https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data", header = FALSE, col.names = colNames)
# Have a look at the summary information of df_country
summary(df_country)
# Have a look at the structure
str(df_country)
# Frequency tables for the columns Entity and Code
freq_table_Entity <- table(df_country$Entity)
freq_table_Code <- table(df_country$Code)
# To see whether there is any subcategory whose count doesn't equal 28
freq_table_Entity[freq_table_Entity != 28]
freq_table_Code[freq_table_Code != 28]
# Load the library dplyr
library(dplyr)
# Keep only the rows whose Entity and Code are not empty strings
df_country <- df_country %>%
  filter(Entity != "" & Code != "")
df_country$Entity <- as.factor(df_country$Entity)
df_country$Code <- as.factor(df_country$Code)
# Have a look at the first six rows
head(df_clev)
summary(df_clev)
# Have a look at its structure
str(df_clev)
# Have a look at the frequency table for each categorical variable
table(df_clev$ca)
table(df_clev$thal)
# Replace "?" with NA
df_clev[df_clev == "?"] <- NA
# Proportion of missing values per column
colSums(is.na(df_clev)) / nrow(df_clev)
# Remove the rows with missing values
df_clev <- df_clev[complete.cases(df_clev), ]
# Convert variable classes
df_clev$sex <- factor(df_clev$sex, labels = c('Female','Male'))
df_clev$cp <- factor(df_clev$cp, labels = c('typical angina','atypical angina','non-anginal pain','asymptomatic'))
df_clev$trestbps <- as.integer(df_clev$trestbps)
df_clev$chol <- as.integer(df_clev$chol)
df_clev$fbs <- factor(df_clev$fbs, labels = c('<= 120','> 120'))
df_clev$restecg <- factor(df_clev$restecg, labels = c('normal','ST-T wave abnormality','LVH'))
df_clev$thalach <- as.integer(df_clev$thalach)
df_clev$exang <- factor(df_clev$exang, labels = c('No','Yes'))
df_clev$oldpeak <- as.numeric(df_clev$oldpeak)
df_clev$slope <- factor(df_clev$slope, labels = c('upsloping','flat','downsloping'))
df_clev$ca <- factor(df_clev$ca, labels = c("0 major vessels","1 major vessel","2 major vessels","3 major vessels"))
df_clev$thal <- factor(df_clev$thal, labels = c('normal','fixed defect','reversable defect'))
df_clev$num <- ifelse(df_clev$num == 0, 0, 1)
df_clev$num <- factor(df_clev$num, labels = c('healthy','heart disease'))
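# Confirm that no missing values remain after the conversions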
colSums(is.na(df_clev))

3. Ways to preprocess data (cleaning, etc.)

The following methods are commonly used for data cleaning (a small sketch follows the list).
1. Handling incomplete data (missing values)
In many cases, missing values must be filled in manually (that is, cleaned by hand). Some missing values can also be derived from the same or other data sources, or cleaned up by replacing them with the mean, maximum, minimum, or a more complex probability estimate.
2. Detecting and handling erroneous values
Statistical analysis can identify possible erroneous values or outliers, for example deviation analysis or finding values that do not conform to a distribution or regression equation. Data values can also be checked against a simple rule base (common-sense rules, business-specific rules, etc.), or against constraints between attributes and external data.
3. Detecting and eliminating duplicate records
Records with the same attribute values in the database are regarded as duplicate records. Duplicates are detected by checking whether the attribute values of two records are equal, and equal records are merged into one record (merge/purge). Merge/purge is the basic method of de-duplication.
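A minimal sketch of these three cleaning steps in R, using a hypothetical data frame df with a numeric column x (illustrative data only, not part of the datasets used in this report):

# Hypothetical data with a missing value, an outlier and a duplicate record
df <- data.frame(x = c(1, 2, NA, 100, 2), g = c("a", "b", "b", "c", "b"))
# 1. Incomplete data: fill missing values with the column mean
df$x[is.na(df$x)] <- mean(df$x, na.rm = TRUE)
# 2. Erroneous values / outliers: flag values outside 1.5 * IQR of the quartiles
q <- quantile(df$x, c(0.25, 0.75))
df[df$x < q[1] - 1.5 * IQR(df$x) | df$x > q[2] + 1.5 * IQR(df$x), ]
# 3. Duplicate records: keep only the first of each set of identical rows (merge/purge)
df <- df[!duplicated(df), ]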

# Have a look at the summary information of df_country
summary(df_country)
# Load the library dplyr
library(dplyr)
# Mean death percent of each country
df_country_mean_death_rate <- df_country %>%
  group_by(Entity) %>%
  summarise(mean_death_rate = mean(death_percent)) %>%
  arrange(desc(mean_death_rate)) %>%
  ungroup()
# Have a look at the highest and lowest average death rate of heart disease by country
df_country_mean_death_rate[c(1, nrow(df_country_mean_death_rate)), ]
# Mean death rate of all countries for each year
mean_death_percent_every_year <- df_country %>%
  group_by(Year) %>%
  summarise(mean_death_rate_per_year = mean(death_percent)) %>%
  arrange(desc(mean_death_rate_per_year)) %>%
  ungroup()
# Have a look at the highest and lowest mean death rate of heart disease by year
mean_death_percent_every_year[c(1, nrow(mean_death_percent_every_year)), ]
# Have a look at the summary information of df_clev
summary(df_clev)
# Load the library ggplot2 (dplyr is already loaded)
library(ggplot2)
# Trend of the death share over time for selected countries
df_country %>%
  filter(Entity %in% c("China", "Malaysia", "India")) %>%
  ggplot(aes(Year, death_percent, color = Entity)) + geom_line()
par(mfrow = c(2, 3))
with(df_clev, {
  boxplot(age, ylab = "age", main = "Boxplot of age")
  boxplot(trestbps, ylab = "trestbps", main = "Boxplot of trestbps")
  boxplot(chol, ylab = "chol", main = "Boxplot of chol")
  boxplot(thalach, ylab = "thalach", main = "Boxplot of thalach")
  boxplot(oldpeak, ylab = "oldpeak", main = "Boxplot of oldpeak")
})
par(mfrow = c(2, 3))
with(df_clev, {
  plot(sex, xlab = "sex", ylab = "frequency", main = "Barplot of sex")
  plot(fbs, xlab = "fbs", ylab = "frequency", main = "Barplot of fbs")
  plot(exang, xlab = "exang", ylab = "frequency", main = "Barplot of exang")
  plot(slope, xlab = "slope", ylab = "frequency", main = "Barplot of slope")
  plot(num, xlab = "num", ylab = "frequency", main = "Barplot of num")
})
# Barplot of cp
barplot(table(df_clev$cp), xlab = "cp", ylab = "frequency", main = "Barplot of cp")
# Barplot of restecg
barplot(table(df_clev$restecg), xlab = "restecg", ylab = "frequency", main = "Barplot of restecg")
# Barplot of ca
barplot(table(df_clev$ca), xlab = "ca", ylab = "frequency", main = "Barplot of ca")
# Barplot of thal
barplot(table(df_clev$thal), xlab = "thal", ylab = "frequency", main = "Barplot of thal")
# age by num
ggplot(df_clev, aes(num, age)) + geom_boxplot() + ggtitle("age by num") + theme(plot.title = element_text(hjust = 0.5))
# trestbps by num
ggplot(df_clev, aes(num, trestbps)) + geom_boxplot() + ggtitle("trestbps by num") + theme(plot.title = element_text(hjust = 0.5))
# chol by num
ggplot(df_clev, aes(num, chol)) + geom_boxplot() + ggtitle("chol by num") + theme(plot.title = element_text(hjust = 0.5))
# thalach by num
ggplot(df_clev, aes(num, thalach)) + geom_boxplot() + ggtitle("thalach by num") + theme(plot.title = element_text(hjust = 0.5))
# oldpeak by num
ggplot(df_clev, aes(num, oldpeak)) + geom_boxplot() + ggtitle("oldpeak by num") + theme(plot.title = element_text(hjust = 0.5))
# sex by num
ggplot(df_clev, aes(sex, fill = num)) + geom_bar(position = "fill") + ggtitle("sex by num") + theme(plot.title = element_text(hjust = 0.5))
# cp by num
ggplot(df_clev, aes(cp, fill = num)) + geom_bar(position = "dodge") + ggtitle("cp by num") + theme(plot.title = element_text(hjust = 0.5))
# Get the correlation matrix chart (chart.Correlation comes from the PerformanceAnalytics package)
library(PerformanceAnalytics)
df_clev %>%
  select(age, trestbps, chol, thalach, oldpeak) %>%
  chart.Correlation(histogram = TRUE, pch = 19)

4. Modeling

In the modeling step, we compare several models using 10-fold cross-validation and select the best one for df_clev. Finally, we test its performance on the testing dataset to see whether it is overfitting.


k-nearest neighbours, logistic regression and random forest models are built to analyse which model performs better.



# Load the caret package, which provides createDataPartition, trainControl, train and confusionMatrix
library(caret)
# Split the dataset into 80% and 20% as the training and testing datasets respectively
# Set the random seed to ensure we get the same training and testing datasets every time we run the code
split <- 0.8
set.seed(998)
train_index <- createDataPartition(df_clev$num, p = split, list = FALSE)
data_train <- df_clev[train_index, ]
data_test <- df_clev[-train_index, ]
# 10-fold cross-validation
train_control <- trainControl(method = "cv", number = 10)
# knn
# Set the same random seed for each algorithm to ensure the same data splits for 10-fold cross-validation
set.seed(2)
model_knn <- train(num ~ ., data = data_train, trControl = train_control, method = "knn", metric = "Accuracy")
print(model_knn)
# Logistic regression
set.seed(2)
model_glm <- train(num ~ ., data = data_train, trControl = train_control, method = "glm", family = 'binomial', metric = "Accuracy")
print(model_glm)
# Random forest
set.seed(2)
model_rf <- train(num ~ ., data = data_train, trControl = train_control, method = "rf", metric = "Accuracy")
print(model_rf)
# Make predictions on the testing dataset
x_test <- data_test[, 1:(ncol(data_test) - 1)]
y_test <- data_test[, ncol(data_test)]
prediction <- predict(model_glm, x_test)
# Have a look at the confusion matrix
confusionMatrix(prediction, y_test)