library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.5.1 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(naniar)
library(readr)
library(ggplot2)
library(broom)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(tidyr)
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
Problem 1. Please read carefully each statement below and provide your code (.rmd) into chunks.
bday <- "28-Mar"
name <- "Anna"
me <- c(bday, name)
cat(me)
## 28-Mar Anna
class(me)
## [1] "character"
#We get this error because the data type of object me is "character" and 2 is numeric. For the devision operator to work both data types should be numeric.
cars <- read_csv("https://jhudatascience.org/intro_to_r/data/kaggleCarAuction.csv")
## Rows: 72983 Columns: 34
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (24): PurchDate, Auction, Make, Model, Trim, SubModel, Color, Transmissi...
## dbl (10): RefId, IsBadBuy, VehYear, VehicleAge, VehOdo, BYRNO, VNZIP1, VehBC...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(cars)
## # A tibble: 6 × 34
## RefId IsBadBuy PurchDate Auction VehYear VehicleAge Make Model Trim SubModel
## <dbl> <dbl> <chr> <chr> <dbl> <dbl> <chr> <chr> <chr> <chr>
## 1 1 0 12/7/2009 ADESA 2006 3 MAZDA MAZD… i 4D SEDA…
## 2 2 0 12/7/2009 ADESA 2004 5 DODGE 1500… ST QUAD CA…
## 3 3 0 12/7/2009 ADESA 2005 4 DODGE STRA… SXT 4D SEDA…
## 4 4 0 12/7/2009 ADESA 2004 5 DODGE NEON SXT 4D SEDAN
## 5 5 0 12/7/2009 ADESA 2005 4 FORD FOCUS ZX3 2D COUP…
## 6 6 0 12/7/2009 ADESA 2004 5 MITS… GALA… ES 4D SEDA…
## # ℹ 24 more variables: Color <chr>, Transmission <chr>, WheelTypeID <chr>,
## # WheelType <chr>, VehOdo <dbl>, Nationality <chr>, Size <chr>,
## # TopThreeAmericanName <chr>, MMRAcquisitionAuctionAveragePrice <chr>,
## # MMRAcquisitionAuctionCleanPrice <chr>,
## # MMRAcquisitionRetailAveragePrice <chr>,
## # MMRAcquisitonRetailCleanPrice <chr>, MMRCurrentAuctionAveragePrice <chr>,
## # MMRCurrentAuctionCleanPrice <chr>, MMRCurrentRetailAveragePrice <chr>, …
pct_complete(cars)
## [1] 99.90485
glimpse(cars)
## Rows: 72,983
## Columns: 34
## $ RefId <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1…
## $ IsBadBuy <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ PurchDate <chr> "12/7/2009", "12/7/2009", "12/7/2009…
## $ Auction <chr> "ADESA", "ADESA", "ADESA", "ADESA", …
## $ VehYear <dbl> 2006, 2004, 2005, 2004, 2005, 2004, …
## $ VehicleAge <dbl> 3, 5, 4, 5, 4, 5, 5, 4, 2, 2, 4, 8, …
## $ Make <chr> "MAZDA", "DODGE", "DODGE", "DODGE", …
## $ Model <chr> "MAZDA3", "1500 RAM PICKUP 2WD", "ST…
## $ Trim <chr> "i", "ST", "SXT", "SXT", "ZX3", "ES"…
## $ SubModel <chr> "4D SEDAN I", "QUAD CAB 4.7L SLT", "…
## $ Color <chr> "RED", "WHITE", "MAROON", "SILVER", …
## $ Transmission <chr> "AUTO", "AUTO", "AUTO", "AUTO", "MAN…
## $ WheelTypeID <chr> "1", "1", "2", "1", "2", "2", "2", "…
## $ WheelType <chr> "Alloy", "Alloy", "Covers", "Alloy",…
## $ VehOdo <dbl> 89046, 93593, 73807, 65617, 69367, 8…
## $ Nationality <chr> "OTHER ASIAN", "AMERICAN", "AMERICAN…
## $ Size <chr> "MEDIUM", "LARGE TRUCK", "MEDIUM", "…
## $ TopThreeAmericanName <chr> "OTHER", "CHRYSLER", "CHRYSLER", "CH…
## $ MMRAcquisitionAuctionAveragePrice <chr> "8155", "6854", "3202", "1893", "391…
## $ MMRAcquisitionAuctionCleanPrice <chr> "9829", "8383", "4760", "2675", "505…
## $ MMRAcquisitionRetailAveragePrice <chr> "11636", "10897", "6943", "4658", "7…
## $ MMRAcquisitonRetailCleanPrice <chr> "13600", "12572", "8457", "5690", "8…
## $ MMRCurrentAuctionAveragePrice <chr> "7451", "7456", "4035", "1844", "324…
## $ MMRCurrentAuctionCleanPrice <chr> "8552", "9222", "5557", "2646", "438…
## $ MMRCurrentRetailAveragePrice <chr> "11597", "11374", "7146", "4375", "6…
## $ MMRCurrentRetailCleanPrice <chr> "12409", "12791", "8702", "5518", "7…
## $ PRIMEUNIT <chr> "NULL", "NULL", "NULL", "NULL", "NUL…
## $ AUCGUART <chr> "NULL", "NULL", "NULL", "NULL", "NUL…
## $ BYRNO <dbl> 21973, 19638, 19638, 19638, 19638, 1…
## $ VNZIP1 <dbl> 33619, 33619, 33619, 33619, 33619, 3…
## $ VNST <chr> "FL", "FL", "FL", "FL", "FL", "FL", …
## $ VehBCost <dbl> 7100, 7600, 4900, 4100, 4000, 5600, …
## $ IsOnlineSale <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ WarrantyCost <dbl> 1113, 1053, 1389, 630, 1020, 594, 53…
str(cars$RefId) #The class of the first column named RefId is numeric
## num [1:72983] 1 2 3 4 5 6 7 8 9 10 ...
str(cars$IsBadBuy) #The class of the second column named IsBadBuy is numeric
## num [1:72983] 0 0 0 0 0 0 0 0 0 0 ...
str(cars$PurchDate) #The class of the third column named PurchDate is character
## chr [1:72983] "12/7/2009" "12/7/2009" "12/7/2009" "12/7/2009" "12/7/2009" ...
cat("There are", nrow(cars), "cars in the dataset. ")
## There are 72983 cars in the dataset.
cat("There are", ncol(cars), "variables in the dataset")
## There are 34 variables in the dataset
cars <- cars %>% filter(VehBCost > 5000)
nrow(cars) #After filtering the data we have 59957 cars left with VehBCost being higher than $5000
## [1] 59957
cars <- cars %>% mutate(MonthlyPrice = VehBCost / 70)
head(cars)
## # A tibble: 6 × 35
## RefId IsBadBuy PurchDate Auction VehYear VehicleAge Make Model Trim SubModel
## <dbl> <dbl> <chr> <chr> <dbl> <dbl> <chr> <chr> <chr> <chr>
## 1 1 0 12/7/2009 ADESA 2006 3 MAZDA MAZD… i 4D SEDA…
## 2 2 0 12/7/2009 ADESA 2004 5 DODGE 1500… ST QUAD CA…
## 3 6 0 12/7/2009 ADESA 2004 5 MITS… GALA… ES 4D SEDA…
## 4 9 0 12/7/2009 ADESA 2007 2 KIA SPEC… EX 4D SEDA…
## 5 10 0 12/7/2009 ADESA 2007 2 FORD FIVE… SEL 4D SEDA…
## 6 11 0 12/14/20… ADESA 2005 4 GMC 1500… SLE REG CAB…
## # ℹ 25 more variables: Color <chr>, Transmission <chr>, WheelTypeID <chr>,
## # WheelType <chr>, VehOdo <dbl>, Nationality <chr>, Size <chr>,
## # TopThreeAmericanName <chr>, MMRAcquisitionAuctionAveragePrice <chr>,
## # MMRAcquisitionAuctionCleanPrice <chr>,
## # MMRAcquisitionRetailAveragePrice <chr>,
## # MMRAcquisitonRetailCleanPrice <chr>, MMRCurrentAuctionAveragePrice <chr>,
## # MMRCurrentAuctionCleanPrice <chr>, MMRCurrentRetailAveragePrice <chr>, …
range(cars$VehYear) #Range of the manufacture year of the vehicles is from 2001 to 2010
## [1] 2001 2010
before_2004 <- nrow(cars %>% filter(VehYear<2004))
prop <- before_2004*100/nrow(cars)
before_2004 #we have 11113 cars that were before 2004
## [1] 6132
format(prop, digits=3) #the proportion of the cars before 2004 is 15.2 percent
## [1] "10.2"
manufactors <- cars$Make %>% unique()
manufactors
## [1] "MAZDA" "DODGE" "MITSUBISHI" "KIA" "FORD"
## [6] "GMC" "NISSAN" "CHEVROLET" "CHRYSLER" "HYUNDAI"
## [11] "PONTIAC" "SATURN" "TOYOTA" "SUZUKI" "JEEP"
## [16] "HONDA" "OLDSMOBILE" "BUICK" "SCION" "MERCURY"
## [21] "VOLKSWAGEN" "ISUZU" "LINCOLN" "MINI" "SUBARU"
## [26] "CADILLAC" "VOLVO" "INFINITI" "LEXUS" "ACURA"
## [31] "TOYOTA SCION" "HUMMER"
length(manufactors) #There are 33 different manufacturers in the dataset
## [1] 32
models <- cars %>% pull(Model) %>% unique() %>% length()
models #There are 1063 different models of vehicles
## [1] 985
cars %>% group_by(Color) %>% summarize(mean_cost=mean(VehBCost)) %>% filter(mean_cost==max(mean_cost)) #Black color, cost was nearly $7145
## # A tibble: 1 × 2
## Color mean_cost
## <chr> <dbl>
## 1 GREY 7551.
cars %>% filter(Color=="RED") %>% nrow() #6257 RED
## [1] 5095
cars %>% filter(Color=="BLUE") %>% nrow() #10347 BLUE
## [1] 8682
veh_columns <- grep("^Veh", names(cars), value = TRUE)
colMeans(cars[, veh_columns])
## VehYear VehicleAge VehOdo VehBCost
## 2005.654252 3.896176 70336.967210 7264.971979
#cars %>% select(starts_with("Veh")) %>% colMeans()
Problem 2. Please read carefully the next steps to analyze the dataset “heart_midterm”. You can read the description of the variables of the dataset here: https://archive.ics.uci.edu/dataset/45/heart+disease
Load the dataset and describe what your data looks like.
heart_data <- read_csv("/Users/anna/Downloads/heart_midterm.csv")
## Rows: 1025 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (14): age, sex, cp, trestbps, chol, fbs, restecg, thalach, exang, oldpea...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
View(heart_data)
head(heart_data)
## # A tibble: 6 × 14
## age sex cp trestbps chol fbs restecg thalach exang oldpeak slope
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 52 1 0 125 212 0 1 168 0 1 2
## 2 53 1 0 140 203 1 0 155 1 3.1 0
## 3 70 1 0 145 174 0 1 125 1 2.6 0
## 4 61 1 0 148 203 0 1 161 0 0 2
## 5 62 0 0 138 294 1 1 106 0 1.9 1
## 6 58 0 0 100 248 0 0 122 0 1 1
## # ℹ 3 more variables: ca <dbl>, thal <dbl>, target <dbl>
ncol(heart_data)
## [1] 14
nrow(heart_data)
## [1] 1025
pct_complete(heart_data)
## [1] 99.84669
#We can see that our data has 14 columns describing different variables, 1025 rows which correspond to different patients. #The percentage of the complete data is 99.8 % , which is good, as we don’t have many missing values.
Perform at least three different data subsetting, cleaning, or manipulation methods that were described in this course on your data. Examples are: renaming the columns, recoding values, filtering the data etc. Please describe what you did to clean/subset/ manipulate your data and why.
summary(heart_data)
## age sex cp trestbps
## Min. :29.00 Min. :0.0000 Min. :0.0000 Min. : 94.0
## 1st Qu.:48.00 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:120.0
## Median :56.00 Median :1.0000 Median :1.0000 Median :130.0
## Mean :54.43 Mean :0.6956 Mean :0.9424 Mean :131.6
## 3rd Qu.:61.00 3rd Qu.:1.0000 3rd Qu.:2.0000 3rd Qu.:140.0
## Max. :77.00 Max. :1.0000 Max. :3.0000 Max. :200.0
## NA's :2
## chol fbs restecg thalach
## Min. :126.0 Min. :0.0000 Min. :0.0000 Min. : 71.0
## 1st Qu.:211.0 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:132.0
## Median :240.0 Median :0.0000 Median :1.0000 Median :152.0
## Mean :246.1 Mean :0.1493 Mean :0.5293 Mean :149.1
## 3rd Qu.:275.0 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:166.0
## Max. :564.0 Max. :1.0000 Max. :2.0000 Max. :202.0
## NA's :7 NA's :1 NA's :3
## exang oldpeak slope ca
## Min. :0.0000 Min. :0.000 Min. :0.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:1.000 1st Qu.:0.0000
## Median :0.0000 Median :0.800 Median :1.000 Median :0.0000
## Mean :0.3359 Mean :1.073 Mean :1.384 Mean :0.7541
## 3rd Qu.:1.0000 3rd Qu.:1.800 3rd Qu.:2.000 3rd Qu.:1.0000
## Max. :1.0000 Max. :6.200 Max. :2.000 Max. :4.0000
## NA's :1 NA's :4 NA's :2
## thal target
## Min. :0.000 Min. :0.0000
## 1st Qu.:2.000 1st Qu.:0.0000
## Median :2.000 Median :1.0000
## Mean :2.322 Mean :0.5132
## 3rd Qu.:3.000 3rd Qu.:1.0000
## Max. :3.000 Max. :1.0000
## NA's :2
heart_clean <- na.omit(heart_data)
head(heart_clean)
## # A tibble: 6 × 14
## age sex cp trestbps chol fbs restecg thalach exang oldpeak slope
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 52 1 0 125 212 0 1 168 0 1 2
## 2 53 1 0 140 203 1 0 155 1 3.1 0
## 3 70 1 0 145 174 0 1 125 1 2.6 0
## 4 61 1 0 148 203 0 1 161 0 0 2
## 5 62 0 0 138 294 1 1 106 0 1.9 1
## 6 58 0 0 100 248 0 0 122 0 1 1
## # ℹ 3 more variables: ca <dbl>, thal <dbl>, target <dbl>
#As we didn’t have many missing values, I have decided to omit the observations with missing values (from summary table you can see which columns had these values before), as this won’t affect the sample size much.
#Now let’s rename the column names in a way that it would make more sense and make them look better with clean_names() function.
heart_clean <- heart_clean %>% rename(cholesterol=chol, glucose=fbs, ST_depression=oldpeak, max_heart_rate=thalach, blood_pressure=trestbps, exercise_angina=exang)
heart_clean <- clean_names(heart_clean)
#Now let’s filter the data to have only diabetics here (patients with fasting blood sugar > 120 mg/dl)
heart_clean$glucose <- as.numeric(heart_clean$glucose)
heart_clean <- heart_clean %>% filter(glucose == 1)
head(heart_clean)
## # A tibble: 6 × 14
## age sex cp blood_pressure cholesterol glucose restecg max_heart_rate
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 53 1 0 140 203 1 0 155
## 2 62 0 0 138 294 1 1 106
## 3 58 1 2 140 211 1 0 165
## 4 58 0 1 136 319 1 0 152
## 5 56 1 2 130 256 1 0 142
## 6 66 0 0 178 228 1 1 165
## # ℹ 6 more variables: exercise_angina <dbl>, st_depression <dbl>, slope <dbl>,
## # ca <dbl>, thal <dbl>, target <dbl>
#let’s subset sex, blood pressure and age, to see what data we have:
heart_clean[, c("sex","age", "blood_pressure")]
## # A tibble: 149 × 3
## sex age blood_pressure
## <dbl> <dbl> <dbl>
## 1 1 53 140
## 2 0 62 138
## 3 1 58 140
## 4 0 58 136
## 5 1 56 130
## 6 0 66 178
## 7 1 60 117
## 8 0 58 150
## 9 1 53 130
## 10 1 56 125
## # ℹ 139 more rows
#Lastly let’s order our data by age.
heart_clean <- arrange(heart_clean, age)
head(heart_clean)
## # A tibble: 6 × 14
## age sex cp blood_pressure cholesterol glucose restecg max_heart_rate
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 42 1 2 120 240 1 1 194
## 2 42 1 2 120 240 1 1 194
## 3 42 1 2 120 240 1 1 194
## 4 43 1 0 132 247 1 0 143
## 5 43 0 0 132 341 1 0 136
## 6 43 1 0 132 247 1 0 143
## # ℹ 6 more variables: exercise_angina <dbl>, st_depression <dbl>, slope <dbl>,
## # ca <dbl>, thal <dbl>, target <dbl>
Make two different kinds of visualizations of your data using ggplot2.
heart_clean$sex <- as_factor(heart_clean$sex)
ggplot(heart_clean, aes(x = age, y = blood_pressure, color = sex)) +
geom_point() +
geom_line() +
geom_smooth() +
theme_minimal()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
#This is a visualisation of the relationship of age and blood pressure
in females and males.
ggplot(heart_clean, mapping = aes(x = age, y = cholesterol)) +
geom_point(size = 5, color = "red", alpha = 0.5) +
geom_line(size = 0.8, color = "brown", linetype = 3) +
labs(title = "Plot of cholesterol levels depending on age")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Perform a simple analysis of your data. This can involve summarizing the data to describe aspects about it (quartiles, means, range etc.).
summary(heart_clean)
## age sex cp blood_pressure cholesterol
## Min. :42.00 0: 41 Min. :0.000 Min. :101.0 Min. :126.0
## 1st Qu.:53.00 1:108 1st Qu.:0.000 1st Qu.:128.0 1st Qu.:212.0
## Median :57.00 Median :1.000 Median :132.0 Median :246.0
## Mean :57.14 Mean :1.148 Mean :139.3 Mean :249.5
## 3rd Qu.:62.00 3rd Qu.:2.000 3rd Qu.:150.0 3rd Qu.:283.0
## Max. :71.00 Max. :3.000 Max. :200.0 Max. :417.0
## glucose restecg max_heart_rate exercise_angina st_depression
## Min. :1 Min. :0.000 Min. : 90.0 Min. :0.0000 Min. :0.000
## 1st Qu.:1 1st Qu.:0.000 1st Qu.:136.0 1st Qu.:0.0000 1st Qu.:0.000
## Median :1 Median :0.000 Median :150.0 Median :0.0000 Median :1.000
## Mean :1 Mean :0.396 Mean :148.7 Mean :0.3893 Mean :1.094
## 3rd Qu.:1 3rd Qu.:1.000 3rd Qu.:163.0 3rd Qu.:1.0000 3rd Qu.:1.600
## Max. :1 Max. :1.000 Max. :194.0 Max. :1.0000 Max. :4.000
## slope ca thal target
## Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.0000
## 1st Qu.:1.000 1st Qu.:0.000 1st Qu.:2.000 1st Qu.:0.0000
## Median :1.000 Median :1.000 Median :2.000 Median :0.0000
## Mean :1.275 Mean :1.121 Mean :2.255 Mean :0.4631
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:3.000 3rd Qu.:1.0000
## Max. :2.000 Max. :4.000 Max. :3.000 Max. :1.0000
#Here we can see that the mean age of the participants was 57 (with the range of 42 to 71) #The mean blood pressure was 139 (stage 1 hypertension category according to common guideline) #The mean cholesterol level was 250 (is considered borderline high) #We have to keep in mind that we had filtered the data and all these patients have glucose=1 , which means their fasting blood sugar is above 120
Perform two statistical test for your data. Describe what analysis you performed and why. Provide some simple interpretation about what your analysis might indicate about your data.
#I have performed ANOVA test to see if blood pressure is statistically different in males and females. As I got a p-value which is lower than 0.05, we can say that there is a statistical difference between the mean blood pressure between males and females.
anova_result <- aov(blood_pressure ~ sex , data = heart_clean)
tidy(anova_result)
## # A tibble: 2 × 6
## term df sumsq meansq statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 sex 1 3115. 3115. 7.73 0.00613
## 2 Residuals 147 59199. 403. NA NA
diabetics <- subset(heart_data, fbs == 1) $ "chol"
non_diabetics <- subset(heart_data, fbs == 0) $ "chol"
t_test_result_chol <- t.test(diabetics, non_diabetics)
tidy_result_chol <- tidy(t_test_result_chol)
glimpse(tidy_result_chol)
## Rows: 1
## Columns: 10
## $ estimate <dbl> 4.45839
## $ estimate1 <dbl> 249.8609
## $ estimate2 <dbl> 245.4025
## $ statistic <dbl> 1.005289
## $ p.value <dbl> 0.3159131
## $ parameter <dbl> 210.2789
## $ conf.low <dbl> -4.284237
## $ conf.high <dbl> 13.20102
## $ method <chr> "Welch Two Sample t-test"
## $ alternative <chr> "two.sided"
p_chol <- tidy_result_chol $ p.value
p_chol
## [1] 0.3159131
#Here I have performed a t-test by subseting data into diabetics and non-diabetics ( blood sugar level >120, <120). As I got a p-value higher than 0.05, this means that there is no statistical difference in mean cholesterol levels in diabetics and non-diabetocs.