Find an interesting dataset and prepare a short report (in R Markdown) which will consist of:
Then, edit the theme and all scales of the graphs, and prepare publication-ready plots.
library(ggplot2)
library(dplyr)
##
## 載入套件:'dplyr'
## 下列物件被遮斷自 'package:stats':
##
## filter, lag
## 下列物件被遮斷自 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(RColorBrewer)
# Read the Framingham CSV into a data frame and preview its first six rows.
# NOTE(review): the path below is absolute and machine-specific; the report
# will only knit on a machine that has the file at this exact location.
hdiease <- read.csv(
  file = '/Users/jeank4723/Desktop/Advance VR/1/Data/framingham.csv'
)
head(hdiease, n = 6L)
## male age education currentSmoker cigsPerDay BPMeds prevalentStroke
## 1 1 39 4 0 0 0 0
## 2 0 46 2 0 0 0 0
## 3 1 48 1 1 20 0 0
## 4 0 61 3 1 30 0 0
## 5 0 46 3 1 23 0 0
## 6 0 43 2 0 0 0 0
## prevalentHyp diabetes totChol sysBP diaBP BMI heartRate glucose TenYearCHD
## 1 0 0 195 106.0 70 26.97 80 77 0
## 2 0 0 250 121.0 81 28.73 95 76 0
## 3 0 0 245 127.5 80 25.34 75 70 0
## 4 1 0 225 150.0 95 28.58 65 103 1
## 5 0 0 285 130.0 84 23.10 85 85 0
## 6 1 0 228 180.0 110 30.30 77 99 0
# Convert the categorical / 0-1 indicator columns to factors so that ggplot2
# treats them as discrete variables. `across()` replaces the superseded
# `mutate_at()`; `all_of()` makes the lookup fail loudly if a column is missing.
# NOTE(review): BPMeds also prints as 0/1 but is left as an integer here, as in
# the original code; confirm whether that is intended.
hdiease <- hdiease %>%
  mutate(
    across(
      all_of(c(
        "male", "education", "currentSmoker",
        "prevalentStroke", "prevalentHyp", "diabetes",
        "TenYearCHD"
      )),
      as.factor
    )
  )
# Check the resulting column types.
str(hdiease)
## 'data.frame': 4238 obs. of 16 variables:
## $ male : Factor w/ 2 levels "0","1": 2 1 2 1 1 1 1 1 2 2 ...
## $ age : int 39 46 48 61 46 43 63 45 52 43 ...
## $ education : Factor w/ 4 levels "1","2","3","4": 4 2 1 3 3 2 1 2 1 1 ...
## $ currentSmoker : Factor w/ 2 levels "0","1": 1 1 2 2 2 1 1 2 1 2 ...
## $ cigsPerDay : int 0 0 20 30 23 0 0 20 0 30 ...
## $ BPMeds : int 0 0 0 0 0 0 0 0 0 0 ...
## $ prevalentStroke: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ prevalentHyp : Factor w/ 2 levels "0","1": 1 1 1 2 1 2 1 1 2 2 ...
## $ diabetes : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ totChol : int 195 250 245 225 285 228 205 313 260 225 ...
## $ sysBP : num 106 121 128 150 130 ...
## $ diaBP : num 70 81 80 95 84 110 71 71 89 107 ...
## $ BMI : num 27 28.7 25.3 28.6 23.1 ...
## $ heartRate : int 80 95 75 65 85 77 60 79 76 93 ...
## $ glucose : int 77 76 70 103 85 99 85 78 79 88 ...
## $ TenYearCHD : Factor w/ 2 levels "0","1": 1 1 1 2 1 1 2 1 1 1 ...
# Bar chart of the number of patients at each age.
p1 <- ggplot(data = hdiease, aes(x = age))
p1 +
  # geom_bar() already counts rows per x value, so the explicit stat and the
  # deprecated `..count..` mapping are unnecessary and have been dropped.
  geom_bar(fill = "skyblue4") +
  labs(x = "Age", y = "Population", title = "Age of the patient") +
  theme_classic()
2. CurrentSmoker and Gender According to the plot, there are more female patients than male patients overall. Moreover, there are more male current smokers than female current smokers, whereas among the patients who are not current smokers there are fewer men than women. However, this plot does not show directly whether current smokers outnumber non-smokers overall. To compare the totals, we can change ‘position = “dodge”’ to ‘position = “stack”’.
# Grouped bar chart: number of patients by smoking status, split by gender.
p2 <- ggplot(data = hdiease, aes(x = currentSmoker, fill = male))
p2 +
  geom_bar(position = "dodge") +
  # `male` is a 0/1 factor (1 = male, per the column name); show readable
  # labels in the legend instead of the raw codes.
  scale_fill_discrete(labels = c("0" = "Female", "1" = "Male")) +
  labs(
    x = "Current Smoker",
    y = "Population",
    title = "Current Smokers",
    subtitle = "0 means not a current smoker; 1 means a current smoker",
    fill = "Gender"
  )
3. Education Level In this pie chart, we can see that the largest group of patients has Level 1 education. In addition, the second-largest group has Level 2 education. The scale surrounding the pie chart shows the cumulative number of people, divided by the education variable.
Note: add + scale_fill_brewer() to use a different color palette.
# Tabulate the number of patients per education level (counts land in `n`).
# Rows with a missing education value are kept as their own NA row.
hdiease_edunum <- count(hdiease, education)
str(hdiease_edunum)
## 'data.frame': 5 obs. of 2 variables:
## $ education: Factor w/ 4 levels "1","2","3","4": 1 2 3 4 NA
## $ n : int 1720 1253 687 473 105
# Pie chart of patients per education level: one stacked bar of the counts,
# wrapped into a circle by mapping the y axis to the polar angle.
p3 <- ggplot(data = hdiease_edunum, aes(x = "", y = n, fill = education))
# NOTE(review): no wesanderson function is called in this chunk; confirm
# whether the package is still needed.
library(wesanderson)
p3 +
  geom_col(width = 1, color = "white") +
  coord_polar(theta = "y", start = 0) +
  scale_fill_hue(c = 45, l = 80) +
  labs(title = "Patients' Education level")
4. Median BMI and Education
We can see that at least half of the patients have a BMI close to or above 25, which means that most of them are currently overweight.
# Median BMI for each education level; missing BMI values are ignored.
hdiease_edu <- summarise(
  group_by(hdiease, education),
  BMI = median(BMI, na.rm = TRUE)
)
# Bar chart of the median BMI for each education level.
p4 <- ggplot(data = hdiease_edu, aes(x = education, y = BMI, fill = education))
p4 +
  # The values are already summarised, so plot them as-is. The legend is
  # hidden because the fill colour repeats the x axis.
  geom_col(show.legend = FALSE) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  # The y label says "Median" to match the summary statistic actually computed
  # (it previously read "Average BMI", contradicting the title).
  labs(
    x = "Education",
    y = "Median BMI",
    title = "Median BMI group by Education"
  )