Data source 1: https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data
Data source 2: https://ourworldindata.org/grapher/share-deaths-heart-disease?tab=chart&country=~MYS
This is a study of differences in stroke probability across different countries. When I receive a dataset, I get to know it from the following four aspects.
First, I look for dirty data.
Second, I will calculate the average, median and mode of the data.
Third, I will check whether there are missing values in the classification variables.
Fourth, I will check the data structure of each column
In terms of extracting rows/columns, I will do it mainly by the following means.
First, I'm going to select the point by choosing the coordinates, like df1[2, 3]
Second, I will select by row and column name, for example: df1["a", "weight"]
Third, I'm going to extract multiple points through an index matrix, with one column of row indices and one column of column indices, such as df1[cbind(c(1, 2, 1), 3:1)]
Fourth, I select entire rows and columns by specifying one less dimension (this is a way of choosing more rows), for example: df1[2,] fetches a row, df1[, 3] fetches a column
# ---- Data import and cleaning ----

# Attach dplyr up front: the row filter on df_country below uses %>% and
# filter().
library(dplyr)

# Import the dataset about death from heart disease between 1990 and 2017.
# NOTE(review): this is an absolute, machine-specific path; adjust as needed.
# NOTE(review): later chunks reference a `death_percent` column; no rename is
# done here, so confirm the CSV actually provides a column with that name.
df_country <- read.csv(
  "C:/Users/AndrewSzl/Desktop/UM/WQD7001 PRINCIPLES OF DATA SCIENCE/DataSet_Group_Assignment/Descriptive modelling/share-deaths-heart-disease.csv"
)

# Import the Cleveland dataset.
# The raw file has no header row. Without header = FALSE, read.csv() treats
# the first patient record as a header and silently drops it.
colNames <- c(
  "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
  "thalach", "exang", "oldpeak", "slope", "ca", "thal", "num"
)
df_clev <- read.csv(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data",
  header = FALSE,
  col.names = colNames
)

# Have a look at the summary information of df_country
summary(df_country)

# Have a look at the structure
str(df_country)

# Frequency tables for column Entity and Code
freq_table_Entity <- table(df_country$Entity)
freq_table_Code <- table(df_country$Code)

# Show any category whose count is not 28 (one row per year, 1990-2017)
freq_table_Entity[freq_table_Entity != 28]
freq_table_Code[freq_table_Code != 28]

# Keep only the rows where both Entity and Code are non-empty
df_country <- df_country %>%
  filter(Entity != "" & Code != "")
df_country$Entity <- as.factor(df_country$Entity)
df_country$Code <- as.factor(df_country$Code)

# Have a look at the first six rows
head(df_clev)
summary(df_clev)

# Have a look at its structure
str(df_clev)

# Frequency tables for the categorical variables that contain "?" entries
table(df_clev$ca)
table(df_clev$thal)

# Assign "?" to NA.
# The original had a stray "/ nrow(df_clev)" after this assignment, which
# caused the parse error; it has been removed.
df_clev[df_clev == "?"] <- NA

# Drop the rows that contain missing values
df_clev <- df_clev[complete.cases(df_clev), ]

# Convert variable classes
df_clev$sex <- factor(df_clev$sex, labels = c("Female", "Male"))
df_clev$cp <- factor(
  df_clev$cp,
  labels = c("typical angina", "atypical angina", "non-anginal pain", "asymptomatic")
)
df_clev$trestbps <- as.integer(df_clev$trestbps)
df_clev$chol <- as.integer(df_clev$chol)
df_clev$fbs <- factor(df_clev$fbs, labels = c("<= 120", "> 120"))
df_clev$restecg <- factor(
  df_clev$restecg,
  labels = c("normal", "ST-T wave abnormality", "LVH")
)
df_clev$thalach <- as.integer(df_clev$thalach)
df_clev$exang <- factor(df_clev$exang, labels = c("No", "Yes"))
df_clev$oldpeak <- as.numeric(df_clev$oldpeak)
df_clev$slope <- factor(
  df_clev$slope,
  labels = c("upsloping", "flat", "downsloping")
)
df_clev$ca <- factor(
  df_clev$ca,
  labels = c("0 major vessels", "1 major vessels", "2 major vessels", "3 major vessels")
)
df_clev$thal <- factor(
  df_clev$thal,
  labels = c("normal", "fixed defect", "reversable defect")
)

# Collapse the outcome to two classes: 0 stays 0, every other value becomes 1
df_clev$num <- ifelse(df_clev$num == 0, 0, 1)
df_clev$num <- factor(df_clev$num, labels = c("healthy", "heart disease"))

# Count the remaining missing values per column
colSums(is.na(df_clev))
## Error: <text>:33:1: 意外的'/' ## 32: df_clev[df_clev == "?"] <- NA ## 33: / ## ^
The following methods are commonly used for data cleaning
1. Solutions to incomplete data (missing values)
In most cases, missing values must be filled in manually (that is, cleaned manually). Of course, some missing values can be derived from this or other data sources, which can be cleaned up by replacing the missing values with average, maximum, minimum, or more complex probability estimates.
2. Error value detection and solutions
Use statistical analysis to identify possible error values or outliers, such as bias analysis, identifying values that do not conform to distribution or regression equations, or check data values with a simple rule base (common sense rules, business specific rules, etc.), or use constraints between different attributes, external data to detect and clean up data.
3. Repeated record detection and elimination methods
Records with the same attribute values in the database are regarded as duplicate records. The equality of records is detected by judging whether the attribute values of the records are equal. Equal records are merged into one record (merge/clear). Merge/clear is the basic method of deduplication.
# Summary statistics for every column of df_country
summary(df_country)
## Error in summary(df_country): 找不到对象'df_country'
# Attach dplyr
library(dplyr)
## Error in library(dplyr): 不存在叫'dplyr'这个名字的程辑包
# Mean death percent of each country, sorted from highest to lowest
df_country_mean_death_rate <- df_country %>%
  group_by(Entity) %>%
  summarise(mean_death_rate = mean(death_percent)) %>%
  arrange(desc(mean_death_rate)) %>%
  ungroup()
## Error in df_country %>% group_by(Entity) %>% summarise(mean_death_rate = mean(death_percent)) %>% : 没有"%>%"这个函数
# Highest and lowest average death rate of heart disease by country:
# the table is sorted in descending order, so these are the first and last rows
df_country_mean_death_rate[
  c(1, nrow(df_country_mean_death_rate)),
]
## Error in eval(expr, envir, enclos): 找不到对象'df_country_mean_death_rate'
# Mean death rate of all countries for each year, sorted from highest to lowest
mean_death_percent_every_year <- df_country %>%
  group_by(Year) %>%
  summarise(mean_death_rate_per_year = mean(death_percent)) %>%
  arrange(desc(mean_death_rate_per_year)) %>%
  ungroup()
## Error in df_country %>% group_by(Year) %>% summarise(mean_death_rate_per_year = mean(death_percent)) %>% : 没有"%>%"这个函数
# Years with the highest and lowest mean death rate of heart disease:
# the table is sorted in descending order, so these are the first and last rows
mean_death_percent_every_year[
  c(1, nrow(mean_death_percent_every_year)),
]
## Error in eval(expr, envir, enclos): 找不到对象'mean_death_percent_every_year'
# Summary statistics for every column of df_clev
summary(df_clev)
## Error in summary(df_clev): 找不到对象'df_clev'
# Attach ggplot2
library(ggplot2)
## Error in library(ggplot2): 不存在叫'ggplot2'这个名字的程辑包
# Attach dplyr
library(dplyr)
## Error in library(dplyr): 不存在叫'dplyr'这个名字的程辑包
# Line chart of death_percent over Year for China, Malaysia and India,
# one coloured line per country
df_country %>%
  filter(Entity %in% c("China", "Malaysia", "India")) %>%
  ggplot(aes(x = Year, y = death_percent, color = Entity)) +
  geom_line()
## Error in df_country %>% filter(Entity %in% c("China", "Malaysia", "India")) %>% : 没有"%>%"这个函数
# Boxplots of the five numeric variables of df_clev on a 2 x 3 panel grid
par(mfrow = c(2, 3))
with(df_clev, {
  boxplot(age, ylab = "Age", main = "Boxplot of age")
  boxplot(trestbps, ylab = "trestbps", main = "Boxplot of trestbps")
  boxplot(chol, ylab = "chol", main = "Boxplot of chol")
  # Fixed: this panel plots thalach but was labelled "chol" (copy-paste slip)
  boxplot(thalach, ylab = "thalach", main = "Boxplot of thalach")
  boxplot(oldpeak, ylab = "oldpeak", main = "Boxplot of oldpeak")
})
## Error in with(df_clev, {: 找不到对象'df_clev'
# Barplots of five categorical variables of df_clev on a 2 x 3 panel grid.
# plot() is called on columns that were converted to factors earlier in the
# script.
par(mfrow = c(2, 3))
with(df_clev, {
  plot(sex, xlab = "sex", ylab = "frequency", main = "Barplot of sex")
  # Fixed: removed the stray trailing comma that passed an empty argument
  plot(fbs, xlab = "fbs", ylab = "frequency", main = "Barplot of fbs")
  # Fixed: axis label read "exangs"; the variable is exang
  plot(exang, xlab = "exang", ylab = "frequency", main = "Barplot of exang")
  plot(slope, xlab = "slope", ylab = "frequency", main = "Barplot of slope")
  plot(num, xlab = "num", ylab = "frequency", main = "Barplot of num")
})
## Error in with(df_clev, {: 找不到对象'df_clev'
# Barplot of cp: bar heights are the counts of each cp level
barplot(
  height = table(df_clev$cp),
  main = "Barplot of cp",
  xlab = "cp",
  ylab = "frequency"
)
## Error in table(df_clev$cp): 找不到对象'df_clev'
# Barplot of restecg: bar heights are the counts of each restecg level
barplot(
  height = table(df_clev$restecg),
  main = "Barplot of restecg",
  xlab = "restecg",
  ylab = "frequency"
)
## Error in table(df_clev$restecg): 找不到对象'df_clev'
# Barplot of ca: bar heights are the counts of each ca level
barplot(
  height = table(df_clev$ca),
  main = "Barplot of ca",
  xlab = "ca",
  ylab = "frequency"
)
## Error in table(df_clev$ca): 找不到对象'df_clev'
# Barplot of thal: bar heights are the counts of each thal level
barplot(
  height = table(df_clev$thal),
  main = "Barplot of thal",
  xlab = "thal",
  ylab = "frequency"
)
## Error in table(df_clev$thal): 找不到对象'df_clev'
# age by num: one boxplot of age per outcome class, with a centred title
ggplot(df_clev, aes(x = num, y = age)) +
  geom_boxplot() +
  ggtitle("age by num") +
  theme(plot.title = element_text(hjust = 0.5))
## Error in ggplot(df_clev, aes(num, age)): 没有"ggplot"这个函数
# trestbps by num: one boxplot of trestbps per outcome class, centred title
ggplot(df_clev, aes(x = num, y = trestbps)) +
  geom_boxplot() +
  ggtitle("trestbps by num") +
  theme(plot.title = element_text(hjust = 0.5))
## Error in ggplot(df_clev, aes(num, trestbps)): 没有"ggplot"这个函数
# chol by num: one boxplot of chol per outcome class, centred title
ggplot(df_clev, aes(x = num, y = chol)) +
  geom_boxplot() +
  ggtitle("chol by num") +
  theme(plot.title = element_text(hjust = 0.5))
## Error in ggplot(df_clev, aes(num, chol)): 没有"ggplot"这个函数
# thalach by num: one boxplot of thalach per outcome class, centred title
ggplot(df_clev, aes(x = num, y = thalach)) +
  geom_boxplot() +
  ggtitle("thalach by num") +
  theme(plot.title = element_text(hjust = 0.5))
## Error in ggplot(df_clev, aes(num, thalach)): 没有"ggplot"这个函数
# oldpeak by num: one boxplot of oldpeak per outcome class, centred title
ggplot(df_clev, aes(x = num, y = oldpeak)) +
  geom_boxplot() +
  ggtitle("oldpeak by num") +
  theme(plot.title = element_text(hjust = 0.5))
## Error in ggplot(df_clev, aes(num, oldpeak)): 没有"ggplot"这个函数
# sex by num: bars filled by outcome class and stacked to proportions
# (position = "fill"), with a centred title
ggplot(df_clev, aes(x = sex, fill = num)) +
  geom_bar(position = "fill") +
  ggtitle("sex by num") +
  theme(plot.title = element_text(hjust = 0.5))
## Error in ggplot(df_clev, aes(sex, fill = num)): 没有"ggplot"这个函数
# cp by num: side-by-side bars (position = "dodge") per outcome class,
# with a centred title
ggplot(df_clev, aes(x = cp, fill = num)) +
  geom_bar(position = "dodge") +
  ggtitle("cp by num") +
  theme(plot.title = element_text(hjust = 0.5))
## Error in ggplot(df_clev, aes(cp, fill = num)): 没有"ggplot"这个函数
# Get the correlation matrix chart for the numeric variables.
# chart.Correlation() comes from the PerformanceAnalytics package, which this
# script never attaches, so the call is namespace-qualified here.
df_clev_numeric <- df_clev %>%
  select(age, trestbps, chol, thalach, oldpeak)
PerformanceAnalytics::chart.Correlation(
  df_clev_numeric,
  histogram = TRUE,
  pch = 19
)
## Error in df_clev %>% select(age, trestbps, chol, thalach, oldpeak) %>% : 没有"%>%"这个函数
In Modeling, we will compare several models using 10-fold cross-validation and select the best one for df_clev. Finally, we will test its performance on the testing dataset to see whether it is overfitting.
Regression and Random Forest models are made to analyze which model has better effect
# 4. Modeling
# We compare several models using 10-fold cross-validation and select the best
# one for df_clev. Finally we test its performance on the testing dataset to
# see whether it is overfitting.
# Regression and Random Forest models are built to analyse which performs
# better.

# createDataPartition(), trainControl(), train() and confusionMatrix() are
# caret functions; the package was never attached, so attach it here.
library(caret)

# Split the dataset into 80% training and 20% testing data.
# Set the random seed so that the same training and testing datasets are
# produced every time the code is run.
split <- 0.8
set.seed(998)
# list = FALSE (spelled out rather than the reassignable shorthand F) so the
# result can be used directly as a row index.
train_index <- createDataPartition(df_clev$num, p = split, list = FALSE)
## Error in createDataPartition(df_clev$num, p = split, list = F): 没有"createDataPartition"这个函数
# Training set: the rows of df_clev selected by train_index
data_train <- df_clev[train_index, ]
## Error in eval(expr, envir, enclos): 找不到对象'df_clev'
# Testing set: every row of df_clev that is not in train_index
data_test <- df_clev[-train_index, ]
## Error in eval(expr, envir, enclos): 找不到对象'df_clev'
# Resampling scheme shared by all models: 10-fold cross-validation
train_control <- trainControl(
  method = "cv",
  number = 10
)
## Error in trainControl(method = "cv", number = 10): 没有"trainControl"这个函数
# k-nearest neighbours
# The same seed is set before each algorithm so that every model sees the
# same data split for 10-fold cross-validation.
set.seed(2)
model_knn <- train(
  num ~ .,
  data = data_train,
  method = "knn",
  metric = "Accuracy",
  trControl = train_control
)
## Error in train(num ~ ., data = data_train, trControl = train_control, : 没有"train"这个函数
# Print the fitted k-NN model object
print(model_knn)
## Error in print(model_knn): 找不到对象'model_knn'
# Logistic regression (glm with a binomial family), same seed as the other
# models
set.seed(2)
model_glm <- train(
  num ~ .,
  data = data_train,
  method = "glm",
  family = "binomial",
  metric = "Accuracy",
  trControl = train_control
)
## Error in train(num ~ ., data = data_train, trControl = train_control, : 没有"train"这个函数
# Print the fitted logistic-regression model object
print(model_glm)
## Error in print(model_glm): 找不到对象'model_glm'
# Random forest, same seed as the other models
set.seed(2)
model_rf <- train(
  num ~ .,
  data = data_train,
  method = "rf",
  metric = "Accuracy",
  trControl = train_control
)
## Error in train(num ~ ., data = data_train, trControl = train_control, : 没有"train"这个函数
# Print the fitted random-forest model object
print(model_rf)
## Error in print(model_rf): 找不到对象'model_rf'
# Make predictions
# Predictors are every column except the last one.
# The original index `1:length(data_train) - 1` parses as
# `(1:length(data_train)) - 1`, i.e. 0:(n - 1), and only worked because R
# ignores index 0. seq_len(ncol(...) - 1) states the intent directly.
x_test <- data_test[, seq_len(ncol(data_train) - 1)]
## Error in eval(expr, envir, enclos): 找不到对象'data_test'
# Observed outcome for the testing rows: the last column of the data frame
outcome_col <- ncol(data_train)
y_test <- data_test[, outcome_col]
## Error in eval(expr, envir, enclos): 找不到对象'data_test'
# Predict the outcome of the testing rows with the logistic-regression model
prediction <- predict(model_glm, newdata = x_test)
## Error in predict(model_glm, x_test): 找不到对象'model_glm'
# Have a look at the confusion matrix of predicted versus observed classes
confusionMatrix(data = prediction, reference = y_test)
## Error in confusionMatrix(prediction, y_test): 没有"confusionMatrix"这个函数