# Load the necessary packages required to reproduce the report. For example:
library(kableExtra)
library(magrittr)
library(dplyr)
library(readr)
library(readxl)
| Student name | Student number | Percentage of contribution |
|---|---|---|
| Hrishika Shrestha | s4192201 | 100 |
The dataset used here is taken from Kaggle, where it is titled “Stroke Prediction Dataset”. The link to the dataset: https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset
The dataset contains 11 clinical features that are used for predicting stroke. It contains the following variables:
gender: “Male”, “Female” or “Other”
ever_married: “No” or “Yes”
work_type: “children”, “Govt_job”, “Never_worked”,
“Private” or “Self-employed”
Residence_type:
“Rural” or “Urban”
smoking_status: “formerly
smoked”, “never smoked”, “smokes” or “Unknown”
id: unique identifier
age: Age
of the patient
avg_glucose_level: average glucose
level in blood
bmi: body mass index
hypertension: 0 if the patient doesn’t have
hypertension, 1 if the patient has hypertension
heart_disease: 0 if the patient doesn’t have any heart
diseases, 1 if the patient has a heart disease
stroke: 1 if the patient had a stroke or 0 if
not
# Read the dataset.
# The CSV is looked up in the working directory first so the report can be
# reproduced on any machine; the original download location is kept as a
# fallback so the existing setup keeps working unchanged.
stroke_csv_candidates <- c(
  "healthcare-dataset-stroke-data.csv",
  "C:\\Users\\Predator\\Downloads\\healthcare-dataset-stroke-data.csv"
)
stroke_csv_path <- stroke_csv_candidates[file.exists(stroke_csv_candidates)][1]
# Fail early with an actionable message instead of read.csv()'s generic
# "cannot open file" error when the data has not been downloaded yet.
if (is.na(stroke_csv_path)) {
  stop(
    "Could not find healthcare-dataset-stroke-data.csv. Download it from ",
    "https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset ",
    "and place it in the working directory.",
    call. = FALSE
  )
}
stroke_data <- read.csv(stroke_csv_path)
# First 10 rows
head(stroke_data, 10)
First, the dimensions of the dataset are checked to find the total number of rows and columns present in the dataset used.
# Report the size of the dataset: number of rows first, then number of columns
dim_result <- c(nrow(stroke_data), ncol(stroke_data))
cat(
  "Dimensions of the data frame: ",
  dim_result[[1]], "rows and",
  dim_result[[2]], "columns.\n"
)
Dimensions of the data frame: 5110 rows and 12 columns.
By using colnames(), we extracted the names of the
columns of our dataset.
# Announce the listing, then collect the column names of the data frame
print("Column names in the data frame: ")
col_names <- names(stroke_data)
[1] "Column names in the data frame: "
# Display the column names stored in col_names
print(col_names)
[1] "id" "gender" "age" "hypertension" "heart_disease" "ever_married" "work_type"
[8] "Residence_type" "avg_glucose_level" "bmi" "smoking_status" "stroke"
By using glimpse(), the data type of each variable
present in the dataset is checked; this shows the data in a readable and
compact format. Similarly, the capture.output() function is
used to capture the output of glimpse(), and the writeLines()
function is used to write that output line by line.
# Heading for the data-type overview of the data frame
writeLines("Structure of the data frame:")
Structure of the data frame:
# Capture the glimpse() listing as a character vector (one element per
# printed line), then write those lines back out
output <- utils::capture.output(dplyr::glimpse(stroke_data))
writeLines(text = output)
Rows: 5,110
Columns: 12
$ id <int> 9046, 51676, 31112, 60182, 1665, 56669, 53882, 10434, 27419, 60491, 12109, 12095, 12175, 8213, 5317, 58202, 56112, 34120, 274…
$ gender <chr> "Male", "Female", "Male", "Female", "Female", "Male", "Male", "Female", "Female", "Female", "Female", "Female", "Female", "Ma…
$ age <dbl> 67, 61, 80, 49, 79, 81, 74, 69, 59, 78, 81, 61, 54, 78, 79, 50, 64, 75, 60, 57, 71, 52, 79, 82, 71, 80, 65, 58, 69, 59, 57, 4…
$ hypertension <int> 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,…
$ heart_disease <int> 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,…
$ ever_married <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "No", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", …
$ work_type <chr> "Private", "Self-employed", "Private", "Private", "Self-employed", "Private", "Private", "Private", "Private", "Private", "Pr…
$ Residence_type <chr> "Urban", "Rural", "Rural", "Urban", "Rural", "Urban", "Rural", "Urban", "Rural", "Urban", "Rural", "Rural", "Urban", "Urban",…
$ avg_glucose_level <dbl> 228.69, 202.21, 105.92, 171.23, 174.12, 186.21, 70.09, 94.39, 76.15, 58.57, 80.43, 120.46, 104.51, 219.84, 214.09, 167.41, 19…
$ bmi <chr> "36.6", "N/A", "32.5", "34.4", "24", "29", "27.4", "22.8", "N/A", "24.2", "29.7", "36.8", "27.3", "N/A", "28.2", "30.9", "37.…
$ smoking_status <chr> "formerly smoked", "never smoked", "never smoked", "smokes", "never smoked", "formerly smoked", "never smoked", "never smoked…
$ stroke <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
# Announce the summary, then compute per-column summary statistics
print("Summary of the data:")
summary_result <- summary(object = stroke_data)
[1] "Summary of the data:"
# Print the per-column summary stored in summary_result
print(summary_result)
id gender age hypertension heart_disease ever_married work_type Residence_type
Min. : 67 Length:5110 Min. : 0.08 Min. :0.00000 Min. :0.00000 Length:5110 Length:5110 Length:5110
1st Qu.:17741 Class :character 1st Qu.:25.00 1st Qu.:0.00000 1st Qu.:0.00000 Class :character Class :character Class :character
Median :36932 Mode :character Median :45.00 Median :0.00000 Median :0.00000 Mode :character Mode :character Mode :character
Mean :36518 Mean :43.23 Mean :0.09746 Mean :0.05401
3rd Qu.:54682 3rd Qu.:61.00 3rd Qu.:0.00000 3rd Qu.:0.00000
Max. :72940 Max. :82.00 Max. :1.00000 Max. :1.00000
avg_glucose_level bmi smoking_status stroke
Min. : 55.12 Length:5110 Length:5110 Min. :0.00000
1st Qu.: 77.25 Class :character Class :character 1st Qu.:0.00000
Median : 91.89 Mode :character Mode :character Median :0.00000
Mean :106.15 Mean :0.04873
3rd Qu.:114.09 3rd Qu.:0.00000
Max. :271.74 Max. :1.00000
There are both categorical and numerical variables in the dataset. Because id is an identifier, it should be treated as a factor/character rather than as a number. While gender, ever_married, work_type, Residence_type, and smoking_status should be factors, age, avg_glucose_level, and bmi are numerical. heart_disease, stroke, and hypertension are binary (0/1), so they should be converted to factors with the labels “No” and “Yes”.
# Convert the identifier and the text-valued categorical columns to factors:
#   id             - identifier, not meant for numeric analysis
#   gender         - categorical
#   ever_married   - categorical (Yes/No)
#   work_type      - categorical (Govt_job, Private, etc.)
#   Residence_type - categorical (Rural/Urban)
categorical_cols <- c(
  "id", "gender", "ever_married", "work_type", "Residence_type"
)
stroke_data[categorical_cols] <- lapply(
  stroke_data[categorical_cols],
  as.factor
)
# Convert 'bmi' to numeric (continuous variable). The raw file marks missing
# BMI values with the literal string "N/A"; map that sentinel to NA first so
# the conversion is silent, and any coercion warning that still appears
# points to a genuinely malformed value rather than an expected missing one.
bmi_raw <- as.character(stroke_data$bmi)
# %in% never yields NA, so values that are already missing are left untouched.
bmi_raw[bmi_raw %in% "N/A"] <- NA_character_
stroke_data$bmi <- as.numeric(bmi_raw)
# 'smoking_status' is categorical (never smoked, formerly smoked, etc.)
stroke_data[["smoking_status"]] <- as.factor(stroke_data[["smoking_status"]])
# 'hypertension', 'heart_disease' and 'stroke' are 0/1 indicators; turn each
# one into a factor labelled No (0) / Yes (1)
for (flag_col in c("hypertension", "heart_disease", "stroke")) {
  stroke_data[[flag_col]] <- factor(
    as.numeric(as.character(stroke_data[[flag_col]])),
    levels = c(0, 1),
    labels = c("No", "Yes")
  )
}
# Inspect the structure of the dataset now that the types are converted
str(object = stroke_data)
'data.frame': 5110 obs. of 12 variables:
$ id : Factor w/ 5110 levels "67","77","84",..: 672 3611 2152 4227 115 3971 3767 749 1895 4244 ...
$ gender : Factor w/ 3 levels "Female","Male",..: 2 1 2 1 1 2 2 1 1 1 ...
$ age : num 67 61 80 49 79 81 74 69 59 78 ...
$ hypertension : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 2 1 1 1 ...
$ heart_disease : Factor w/ 2 levels "No","Yes": 2 1 2 1 1 1 2 1 1 1 ...
$ ever_married : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 1 2 2 ...
$ work_type : Factor w/ 5 levels "children","Govt_job",..: 4 5 4 4 5 4 4 4 4 4 ...
$ Residence_type : Factor w/ 2 levels "Rural","Urban": 2 1 1 2 1 2 1 2 1 2 ...
$ avg_glucose_level: num 229 202 106 171 174 ...
$ bmi : num 36.6 NA 32.5 34.4 24 29 27.4 22.8 NA 24.2 ...
$ smoking_status : Factor w/ 4 levels "formerly smoked",..: 1 2 2 3 2 1 2 2 4 4 ...
$ stroke : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
Using indexing, we retrieved the first ten rows and every column, then we transformed the subset into a matrix. All values were forced to character class in order to preserve consistency because data frames can have mixed types whereas matrices cannot.
# Subset your data and convert it to a matrix, provide R codes here.
# Keep rows 1 to 10 and every column
subset_df <- stroke_data[seq_len(10), , drop = FALSE]
# Turn the subset into a matrix
matrix_data <- as.matrix(x = subset_df)
# Inspect the structure of the resulting matrix
str(object = matrix_data)
chr [1:10, 1:12] "9046" "51676" "31112" "60182" "1665" "56669" "53882" "10434" "27419" "60491" "Male" "Female" "Male" "Female" "Female" "Male" ...
- attr(*, "dimnames")=List of 2
..$ : chr [1:10] "1" "2" "3" "4" ...
..$ : chr [1:12] "id" "gender" "age" "hypertension" ...
# Heading for the preview of the matrix rows
print("First few rows of the matrix:")
[1] "First few rows of the matrix:"
# Show the first three rows of the matrix, keeping it two-dimensional
print(matrix_data[seq_len(3), , drop = FALSE])
id gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status stroke
1 "9046" "Male" "67" "No" "Yes" "Yes" "Private" "Urban" "228.69" "36.6" "formerly smoked" "Yes"
2 "51676" "Female" "61" "No" "No" "Yes" "Self-employed" "Rural" "202.21" NA "never smoked" "Yes"
3 "31112" "Male" "80" "No" "Yes" "Yes" "Private" "Rural" "105.92" "32.5" "never smoked" "Yes"
Once the first ten rows are selected and converted to a matrix, the structure changes to character. A matrix can only hold one type of data, and because categorical variables are present, R automatically converts all variables to character.
We started by creating a tiny data frame with variables related to
the risk of stroke. While the ordinal variable risk_level
appropriately factorizes and orders stroke risk as Low, Medium, or High,
the integer variable years_smoking reflects the number of
years a patient has smoked. Next, we used cbind() to add a
third numerical variable, bmi. Three variables with the
proper kinds are included in the final data frame: an integer, an
ordered factor, and a numeric variable.
# Integer variable: number of years each patient has smoked.
# The `L` suffix stores the counts as true integers; plain literals such as
# `5` are doubles and would be reported as `num` by str().
years_smoking <- c(0L, 5L, 10L, 2L, 0L, 15L, 3L, 1L, 20L, 8L)
# Ordinal variable: stroke risk levels
risk_level <- c("Low", "Medium", "High", "Medium", "Low",
                "High", "Medium", "Low", "High", "Medium")
# Convert risk_level to an ordered factor (Low < Medium < High)
risk_level <- factor(risk_level,
                     levels = c("Low", "Medium", "High"),
                     ordered = TRUE)
# Combine into a data frame
df <- data.frame(years_smoking, risk_level)
# Check structure
str(df)
'data.frame': 10 obs. of 2 variables:
$ years_smoking: num 0 5 10 2 0 15 3 1 20 8
$ risk_level : Ord.factor w/ 3 levels "Low"<"Medium"<..: 1 2 3 2 1 3 2 1 3 2
# List the ordered categories of the risk_level column
levels(df[["risk_level"]])
[1] "Low" "Medium" "High"
# Numeric variable: ten BMI values, one per row of df
bmi <- c(22.5, 27.8, 31.2, 24.7, 29.1, 26.3, 28.4, 23.9, 32.0, 25.5)
# Append bmi to the data frame as a new column using cbind()
df <- cbind(df, bmi = bmi)
# Inspect the structure of the extended data frame
str(object = df)
'data.frame': 10 obs. of 3 variables:
$ years_smoking: num 0 5 10 2 0 15 3 1 20 8
$ risk_level : Ord.factor w/ 3 levels "Low"<"Medium"<..: 1 2 3 2 1 3 2 1 3 2
$ bmi : num 22.5 27.8 31.2 24.7 29.1 26.3 28.4 23.9 32 25.5