Attribute Information: 1. age 2. sex 3. chest pain type (4 values) 4. resting blood pressure 5. serum cholestoral in mg/dl 6. fasting blood sugar > 120 mg/dl 7. resting electrocardiographic results (values 0,1,2) 8. maximum heart rate achieved 9. exercise induced angina 10. oldpeak = ST depression induced by exercise relative to rest 11. the slope of the peak exercise ST segment 12. number of major vessels (0-3) colored by flourosopy 13. thal: 3 = normal; 6 = fixed defect; 7 = reversable defect
#Load all required packages
library(DT)
library(tidyr)
library(dplyr)
library(ggplot2)
#Read data from csv
heart_ds <- read.csv(file="Heart.csv", header=TRUE, sep=",")
#View sample data
head(heart_ds)
## ï..age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca
## 1 63 1 3 145 233 1 0 150 0 2.3 0 0
## 2 37 1 2 130 250 0 1 187 0 3.5 0 0
## 3 41 0 1 130 204 0 0 172 0 1.4 2 0
## 4 56 1 1 120 236 0 1 178 0 0.8 2 0
## 5 57 0 0 120 354 0 1 163 1 0.6 2 0
## 6 57 1 0 140 192 0 1 148 0 0.4 1 0
## thal target
## 1 1 1
## 2 2 1
## 3 2 1
## 4 2 1
## 5 2 1
## 6 1 1
#Examine the current columns
colnames(heart_ds)
## [1] "ï..age" "sex" "cp" "trestbps" "chol" "fbs"
## [7] "restecg" "thalach" "exang" "oldpeak" "slope" "ca"
## [13] "thal" "target"
#Rename columns
colnames(heart_ds) <- c("age","sex","chest_pain","bp","chol","sugar","cardio","hrt_rt","exer", "oldpeak","slope","ca","thal","target")
str(heart_ds)
## 'data.frame': 303 obs. of 14 variables:
## $ age : int 63 37 41 56 57 57 56 44 52 57 ...
## $ sex : int 1 1 0 1 0 1 0 1 1 1 ...
## $ chest_pain: int 3 2 1 1 0 0 1 1 2 2 ...
## $ bp : int 145 130 130 120 120 140 140 120 172 150 ...
## $ chol : int 233 250 204 236 354 192 294 263 199 168 ...
## $ sugar : int 1 0 0 0 0 0 0 0 1 0 ...
## $ cardio : int 0 1 0 1 1 1 0 1 1 1 ...
## $ hrt_rt : int 150 187 172 178 163 148 153 173 162 174 ...
## $ exer : int 0 0 0 0 1 0 0 0 0 0 ...
## $ oldpeak : num 2.3 3.5 1.4 0.8 0.6 0.4 1.3 0 0.5 1.6 ...
## $ slope : int 0 0 2 2 2 1 1 2 2 2 ...
## $ ca : int 0 0 0 0 0 0 0 0 0 0 ...
## $ thal : int 1 2 2 2 2 1 2 3 3 2 ...
## $ target : int 1 1 1 1 1 1 1 1 1 1 ...
#Summary
summary(heart_ds)
## age sex chest_pain bp
## Min. :29.00 Min. :0.0000 Min. :0.000 Min. : 94.0
## 1st Qu.:47.50 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:120.0
## Median :55.00 Median :1.0000 Median :1.000 Median :130.0
## Mean :54.37 Mean :0.6832 Mean :0.967 Mean :131.6
## 3rd Qu.:61.00 3rd Qu.:1.0000 3rd Qu.:2.000 3rd Qu.:140.0
## Max. :77.00 Max. :1.0000 Max. :3.000 Max. :200.0
## chol sugar cardio hrt_rt
## Min. :126.0 Min. :0.0000 Min. :0.0000 Min. : 71.0
## 1st Qu.:211.0 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:133.5
## Median :240.0 Median :0.0000 Median :1.0000 Median :153.0
## Mean :246.3 Mean :0.1485 Mean :0.5281 Mean :149.6
## 3rd Qu.:274.5 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:166.0
## Max. :564.0 Max. :1.0000 Max. :2.0000 Max. :202.0
## exer oldpeak slope ca
## Min. :0.0000 Min. :0.00 Min. :0.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00 1st Qu.:1.000 1st Qu.:0.0000
## Median :0.0000 Median :0.80 Median :1.000 Median :0.0000
## Mean :0.3267 Mean :1.04 Mean :1.399 Mean :0.7294
## 3rd Qu.:1.0000 3rd Qu.:1.60 3rd Qu.:2.000 3rd Qu.:1.0000
## Max. :1.0000 Max. :6.20 Max. :2.000 Max. :4.0000
## thal target
## Min. :0.000 Min. :0.0000
## 1st Qu.:2.000 1st Qu.:0.0000
## Median :2.000 Median :1.0000
## Mean :2.314 Mean :0.5446
## 3rd Qu.:3.000 3rd Qu.:1.0000
## Max. :3.000 Max. :1.0000
#View data
datatable(heart_ds)
#Remove unwanted data
new_cols <- c("age","sex","chest_pain","bp","chol","sugar","cardio","hrt_rt")
heart_ds_new <- heart_ds[new_cols]
#View new data
datatable(heart_ds_new)
#How many rows and columns
dim(heart_ds_new)
## [1] 303 8
#Create two new category for sex.
heart_ds_new <- heart_ds_new %>% mutate(M=if_else(sex==1, 1, 0))
heart_ds_new <- heart_ds_new %>% mutate(F=if_else(sex==0, 1, 0))
#Put bp in categories
heart_ds_new <- heart_ds_new %>% mutate(bp_cat=if_else(bp>=120, "high",
if_else(bp<120, "Normal", "")))
#Create a feature for chest pain
heart_ds_new <- heart_ds_new %>% mutate(has_chest_pain=if_else(chest_pain>2, 1, 0))
heart_ds_new <- gather(data=heart_ds_new, key=sex_col, value=sex_count, M, F)
datatable(heart_ds_new)
#Examine data for the entire population,
qplot(data=heart_ds_new, x=chol, y=hrt_rt, size=I(3), color=bp_cat, alpha=I(0.6), main="Cholesterol vs Heart Rate (All)")
#Lets look at population above 60
heart_ds_new_above60 <- filter(heart_ds_new, age > 60)
#60 and male
heart_ds_new_above60_m <- filter(heart_ds_new_above60, age > 60 & sex==1)
#60 and female
heart_ds_new_above60_f <- filter(heart_ds_new_above60, age > 60 & sex==1)
#Convert bp to percent
heart_ds_new_above60 <- mutate(heart_ds_new_above60, bp_pr = (bp/100))
#View above 60 data
datatable(heart_ds_new_above60)
#Examine data for the entire population,
qplot(data=heart_ds_new_above60, x=chol, y=hrt_rt, size=I(3), color=bp_cat, alpha=I(0.6), main="Cholesterol vs Heart Rate (Above 60)")
#Lets look at population below 60
heart_ds_new_below60 <- filter(heart_ds_new, age <= 60)
#Less than 60 and male
heart_ds_new_below60_m <- filter(heart_ds_new_below60, age <= 60 & sex==1)
#Less than 60 and female
heart_ds_new_above60_f <- filter(heart_ds_new_below60, age <= 60 & sex==1)
#Convert bp to percent
heart_ds_new_below60 <- mutate(heart_ds_new_below60, bp_pr = (bp/100))
#View above 60 data
datatable(heart_ds_new_below60)
#Examine data for the entire population,
qplot(data=heart_ds_new_below60, x=chol, y=hrt_rt, size=I(3), color=bp_cat, alpha=I(0.6), main="Cholesterol vs Heart Rate (Below 60)")
#Age and cholesterol
qplot(data=heart_ds_new, x=chol, y=age, size=I(3), color=bp_cat, alpha=I(0.6), main="Age vs Cholesterol (All)")
#Look at the mean bp
summarise(heart_ds_new, mean(bp, na.rm=TRUE))
## mean(bp, na.rm = TRUE)
## 1 131.6238
#What about sex and bp
summarise(group_by(heart_ds_new, sex), mean(bp, na.rm=TRUE))
## # A tibble: 2 x 2
## sex `mean(bp, na.rm = TRUE)`
## <int> <dbl>
## 1 0 133.
## 2 1 131.
#View random sample
datatable(sample_n(heart_ds_new, size=20))
#View 10% of the data
datatable(sample_frac(heart_ds_new, size=.1))
#How many male and female. Looks like we have uneven distribution.
count(heart_ds_new, sex)
## # A tibble: 2 x 2
## sex n
## <int> <int>
## 1 0 192
## 2 1 414
#sort
datatable(arrange(heart_ds_new, desc(age), sex))
#pipe operator for multiple func
heart_ds_new %>%
filter(sex==1) %>%
group_by(age) %>%
summary(mean(age, na.rm=TRUE))
## age sex chest_pain bp
## Min. :29.00 Min. :1 Min. :0.0000 Min. : 94.0
## 1st Qu.:47.00 1st Qu.:1 1st Qu.:0.0000 1st Qu.:120.0
## Median :54.00 Median :1 Median :0.0000 Median :130.0
## Mean :53.76 Mean :1 Mean :0.9324 Mean :130.9
## 3rd Qu.:59.75 3rd Qu.:1 3rd Qu.:2.0000 3rd Qu.:140.0
## Max. :77.00 Max. :1 Max. :3.0000 Max. :192.0
## chol sugar cardio hrt_rt
## Min. :126.0 Min. :0.0000 Min. :0.0000 Min. : 71
## 1st Qu.:208.0 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:132
## Median :235.0 Median :0.0000 Median :1.0000 Median :151
## Mean :239.3 Mean :0.1594 Mean :0.5072 Mean :149
## 3rd Qu.:268.5 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:168
## Max. :353.0 Max. :1.0000 Max. :2.0000 Max. :202
## bp_cat has_chest_pain sex_col sex_count
## Length:414 Min. :0.00000 Length:414 Min. :0.0
## Class :character 1st Qu.:0.00000 Class :character 1st Qu.:0.0
## Mode :character Median :0.00000 Mode :character Median :0.5
## Mean :0.09179 Mean :0.5
## 3rd Qu.:0.00000 3rd Qu.:1.0
## Max. :1.00000 Max. :1.0
library(tidyverse)
#Read data from csv
md_ds <- read.csv(file="national_marriage_divorce_rates_00-16.csv", header=FALSE, sep=",")
#View initial data
as.tibble(md_ds)
## # A tibble: 61 x 10
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
## <fct> <fct> <fct> <fct> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl>
## 1 Provisiona~ "" "" "" NA NA NA NA NA NA
## 2 "" "" "" "" NA NA NA NA NA NA
## 3 Year Marri~ Popu~ Rate pe~ NA NA NA NA NA NA
## 4 2016 2,245~ 323,~ 6.9 NA NA NA NA NA NA
## 5 2015 2,221~ 321,~ 6.9 NA NA NA NA NA NA
## 6 2014/1 2,140~ 308,~ 6.9 NA NA NA NA NA NA
## 7 2013/1 2,081~ 306,~ 6.8 NA NA NA NA NA NA
## 8 2012 2,131~ 313,~ 6.8 NA NA NA NA NA NA
## 9 2011 2,118~ 311,~ 6.8 NA NA NA NA NA NA
## 10 2010 2,096~ 308,~ 6.8 NA NA NA NA NA NA
## # ... with 51 more rows
#md_ds_tb <- as.tibble(md_ds)
colnames(md_ds)
## [1] "V1" "V2" "V3" "V4" "V5" "V6" "V7" "V8" "V9" "V10"
colnames(md_ds) <- c("years","marriage","population","population_rate","X.3","X.4","X.5","X.6","X.7","X.8")
#md_ds
md_ds <- md_ds[-c(33:61),]
md_ds <- md_ds[-c(21:32),]
md_ds <- md_ds[-c(1:3),]
#Remove unwanted data
md_ds_cols <- c("years","marriage","population","population_rate")
md_ds_new <- md_ds[md_ds_cols]
md_ds_new$years <- gsub("/\\d", "", md_ds_new$years)
#Reset the index
rownames(md_ds_new) <- 1:nrow(md_ds_new)
md_ds_new
## years marriage population population_rate
## 1 2016 2,245,404 323,127,513 6.9
## 2 2015 2,221,579 321,418,820 6.9
## 3 2014 2,140,272 308,759,713 6.9
## 4 2013 2,081,301 306,136,672 6.8
## 5 2012 2,131,000 313,914,040 6.8
## 6 2011 2,118,000 311,591,917 6.8
## 7 2010 2,096,000 308,745,538 6.8
## 8 2009 2,080,000 306,771,529 6.8
## 9 2008 2,157,000 304,093,966 7.1
## 10 2007 2,197,000 301,231,207 7.3
## 11 2006 2,193,000 294,077,247 7.5
## 12 2005 2,249,000 295,516,599 7.6
## 13 2004 2,279,000 292,805,298 7.8
## 14 2003 2,245,000 290,107,933 7.7
## 15 2002 2,290,000 287,625,193 8.0
## 16 2001 2,326,000 284,968,955 8.2
## 17 2000 2,315,000 281,421,906 8.2
#qplot(data=md_ds_new, x=population_rate, main="Population Rate")
#qplot(data=md_ds_new, x=marriage, y=population, size=I(3), color=I("Blue"), main="Marriage vs Population")
#qplot(data=md_ds_new, x=population_rate, y=population, size=I(3), color=I("Blue"), main="Marriage vs Population")
#qplot(data=md_ds_new, x=population_rate, y=marriage, size=I(3), color=I("Blue"), main="Marriage vs Population")
qplot(data=md_ds_new, x=years, y=population_rate, size=I(3), color=I("Red"), main="Population Rate vs Years")
qplot(data=md_ds_new, x=years, y=marriage, size=I(3), color=I("Blue"), main="Marriage vs Years")
qplot(data=md_ds_new, x=years, y=population, size=I(3), color=I("#CC0000"), main="Population vs Years")
library(sqldf)
it_revenue_ds <-sqldf(c("Drop table if exists revenue","CREATE table IT_revenue ( ID serial PRIMARY KEY, Year int (10) NOT NULL, Google int NULL, Facebook int NULL, Yahoo int NULL, Microsoft int NULL, AOL int NULL)",
"INSERT INTO IT_revenue (ID, Year, Google, Facebook, Yahoo, Microsoft, AOL) Values ('1','2009','0.36','0.56', '1.26', '0.37', '0.51'), ('2', '2010','0.86','1.21', '1.43', '0.51', '0.47'), ('3', '2011', '1.67', '1.73', '1.36', '0.6', '0.53'), ('4','2012','2.26','2.18', '1.35','0.9','0.7'), ('5','2013','2.99','3.17','1.27','0.79','0.73')", "Select Year, Google, Facebook, Yahoo, Microsoft, AOL from IT_revenue"))
#Original data
datatable(it_revenue_ds)
it_revenue_ds1 <- gather(it_revenue_ds,"Company","Revenue",2:6)
#Gathered data
datatable(it_revenue_ds1)
it_revenue_ds1_grp1 <- it_revenue_ds1 %>%
group_by(Company) %>%
summarize(total_revenue=sum(Revenue, na.rm=TRUE), count=n(), avg_revenue=mean(Revenue,na.rm=TRUE))
it_revenue_ds1_grp1
## # A tibble: 5 x 4
## Company total_revenue count avg_revenue
## <chr> <dbl> <int> <dbl>
## 1 AOL 2.94 5 0.588
## 2 Facebook 8.85 5 1.77
## 3 Google 8.14 5 1.63
## 4 Microsoft 3.17 5 0.634
## 5 Yahoo 6.67 5 1.33
qplot(data=it_revenue_ds1_grp1, x=Company, y=total_revenue, color=Company, size=I(10), main="Summary - Total Revenue by Company")
qplot(data=it_revenue_ds1_grp1, x=Company, y=avg_revenue, color=Company, size=I(10), main="Summary - Average Revenue by Company")
qplot(data=it_revenue_ds1, x=Revenue, y=Year, color=Company, size=I(3), main="Revenue by Company") + geom_line()
qplot(data=it_revenue_ds1, x=Company, y=Revenue, size=I(3), main="Revenue by Company")