The following notebook uses two data sets, one from Bihar, India and the other from the US. Each data set includes 6 variables : pesonid, female (boolean), adult(boolean), age, height_cm, and weight_kg are numeric. The notebook looks at the distribution of female adults in both locations for illustration purposes. For that, we use tidyverse for data manipulation and ggplot2 from exploring the data.
Load the Packages
library(ggplot2)
library(tidyverse)
require(cowplot)
Read Bihar, India Data
#load in the data
bihar_data <- read_csv("Data Sets/Week3/Bihar_sample_data.csv")
Parsed with column specification:
cols(
personid = col_double(),
female = col_double(),
adult = col_double(),
age = col_double(),
height_cm = col_double(),
weight_kg = col_double()
)
#have a look
head(bihar_data)
Keep Female Adults, Bihar
bihar_adult_females <- bihar_data[bihar_data$female == 1 & bihar_data$adult == 1, -which(names(bihar_data) %in% c("female", "adult"))]
# OR using dplyr
bihar_adult_females_2 <- bihar_data %>% filter(female == 1, adult == 1, height_cm > 120, height_cm < 200) %>% select(-c(personid, female, adult))
head(bihar_adult_females_2)
Ploting Histogram of Female Height in Bihar, India using ggplot2
#Include only female adults whose heights are between 120 cm and 200 cm
Warning messages:
1: In readChar(file, size, TRUE) : truncating string with embedded nuls
2: In readChar(file, size, TRUE) : truncating string with embedded nuls
bihar_adult_females_trunc <- filter(bihar_adult_females, height_cm > 120, height_cm < 200)
p_bihar <- ggplot(bihar_adult_females_trunc, aes(height_cm))
p_bihar + geom_histogram(fill="blue", color="darkblue", binwidth = 5) +
xlab("Height in centimeters, Bihar Females") +
ylab("Number of Female adults") +
ggtitle("Height of Adult Females in Bihar, India") +
theme(plot.title = element_text(hjust = 0.5))
ggsave("bihar_raw.pdf")
Saving 7.29 x 4.5 in image

Read US Data
us_data <- read_csv("Data Sets/Week3/US_sample_data.csv")
Missing column names filled in: 'X1' [1]Parsed with column specification:
cols(
X1 = col_double(),
seqn = col_double(),
female = col_double(),
adult = col_double(),
age = col_double(),
height_cm = col_double(),
weight_kg = col_double()
)
us_adult_females_trunc <- us_data %>% filter(female == 1, adult == 1, height_cm > 120, height_cm < 200) %>% select(-c(X1, seqn, female, adult))
head(us_adult_females_trunc)
Ploting Histogram of Female Height in the US using ggplot2
p_US <- ggplot(us_adult_females_trunc, aes(height_cm))
p_US + geom_histogram(fill="blue", color="darkblue", binwidth = 5) +
xlab("Height in centimeters, Bihar Females") +
ylab("Number of Adult Females") +
ggtitle("Height of Adult Females in the US") +
theme(plot.title = element_text(hjust = 0.5))

Kernel Density Estimation
p_US + geom_histogram(data=us_adult_females_trunc, aes(height_cm , ..density..), fill="blue" , color="darkred", breaks = seq(130, 200, 5)) +
geom_density(kernel="gaussian", aes(height_cm), bw = 5)

Bihar and US Histograms
p_bihar + geom_histogram(data=bihar_adult_females_trunc, aes(height_cm, ..density.. ),fill = "blue", color = "darkblue") +
geom_histogram(alpha = 0.5, data=us_adult_females_trunc, aes(height_cm , ..density..), fill = "red", color = "darkred") +
xlab("Height in centimeters") +
ggtitle("Distribution of Height for Female Adults in Bihar, India and the US") +
theme(plot.title = element_text(hjust = 0.5))

# more visible as points
ggplot(bihar_adult_females_trunc, aes(height_cm))+
geom_freqpoly(data=bihar_adult_females_trunc, aes(height_cm, ..density.. ), color="darkblue" )+
geom_freqpoly(data=us_adult_females_trunc, aes(height_cm , ..density..), color="darkred" )+
xlab("Height in centimeters")

# Kernel density
ggplot(bihar_adult_females_trunc, aes(height_cm))+
geom_density(data=bihar_adult_females_trunc, aes(height_cm), color="darkblue" )+
geom_density(data=us_adult_females_trunc, aes(height_cm), color="darkred" )+
xlab("Height in centimeters")

# Representing the CDF
p_bihar + stat_ecdf(data=bihar_adult_females_trunc, aes(height_cm), color="darkblue" )+
stat_ecdf(data=us_adult_females_trunc, aes(height_cm), color="darkred" )+
xlab("Height in centimeters") +
ylab("Probability")

bihar_adult_females_2$location <- 'Bihar'
us_adult_females_trunc$location <- 'US'
combdata <- rbind(bihar_adult_females_2, us_adult_females_trunc)
tail(combdata)
ggplot(combdata, aes(height_cm, fill = location)) +
geom_density(alpha = 0.2) +
xlab("Height in cm") +
ggtitle("Height of Female Adults") +
theme(plot.title = element_text(hjust = 0.5))

LS0tDQp0aXRsZTogIlByb2JhYmlsaXR5IERpc3RyaWJ1dGlvbiINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KLS0tDQoNClRoZSBmb2xsb3dpbmcgbm90ZWJvb2sgdXNlcyB0d28gZGF0YSBzZXRzLCBvbmUgZnJvbSBCaWhhciwgSW5kaWEgYW5kIHRoZSBvdGhlciBmcm9tIHRoZSBVUy4gRWFjaCBkYXRhIHNldCBpbmNsdWRlcyA2IHZhcmlhYmxlcyA6IHBlc29uaWQsIGZlbWFsZSAoYm9vbGVhbiksIGFkdWx0KGJvb2xlYW4pLCBhZ2UsIGhlaWdodF9jbSwgYW5kIHdlaWdodF9rZyBhcmUgbnVtZXJpYy4gVGhlIG5vdGVib29rIGxvb2tzIGF0IHRoZSBkaXN0cmlidXRpb24gb2YgZmVtYWxlIGFkdWx0cyBpbiBib3RoIGxvY2F0aW9ucyBmb3IgaWxsdXN0cmF0aW9uIHB1cnBvc2VzLiBGb3IgdGhhdCwgd2UgdXNlIHRpZHl2ZXJzZSBmb3IgZGF0YSBtYW5pcHVsYXRpb24gYW5kIGdncGxvdDIgZnJvbSBleHBsb3JpbmcgdGhlIGRhdGEuIA0KDQojIyMgTG9hZCB0aGUgUGFja2FnZXMNCmBgYHtyfQ0KbGlicmFyeShnZ3Bsb3QyKQ0KbGlicmFyeSh0aWR5dmVyc2UpDQpyZXF1aXJlKGNvd3Bsb3QpDQpgYGANCg0KIyMjIFJlYWQgQmloYXIsIEluZGlhIERhdGENCmBgYHtyIHNldHVwLCBpbmNsdWRlPUZBTFNFLCBlY2hvPUZBTFNFfQ0KcmVxdWlyZSgia25pdHIiKQ0Kb3B0c19rbml0JHNldChyb290LmRpciA9ICJDOi9Vc2Vycy9UYXJlay9Eb2N1bWVudHMvTUlUeCAxNC4zMTB4IC0gRGF0YSBBbmFseXNpcyBmb3IgU29jaWFsIFNjaWVudGlzdHMiKQ0KYGBgDQoNCmBgYHtyfQ0KI2xvYWQgaW4gdGhlIGRhdGEgDQpiaWhhcl9kYXRhIDwtIHJlYWRfY3N2KCJEYXRhIFNldHMvV2VlazMvQmloYXJfc2FtcGxlX2RhdGEuY3N2IikNCg0KI2hhdmUgYSBsb29rDQpoZWFkKGJpaGFyX2RhdGEpDQpgYGANCg0KIyMjIEtlZXAgRmVtYWxlIEFkdWx0cywgQmloYXINCmBgYHtyfQ0KYmloYXJfYWR1bHRfZmVtYWxlcyA8LSBiaWhhcl9kYXRhW2JpaGFyX2RhdGEkZmVtYWxlID09IDEgJiBiaWhhcl9kYXRhJGFkdWx0ID09IDEsIC13aGljaChuYW1lcyhiaWhhcl9kYXRhKSAlaW4lIGMoImZlbWFsZSIsICJhZHVsdCIpKV0NCg0KIyBPUiB1c2luZyBkcGx5cg0KYmloYXJfYWR1bHRfZmVtYWxlc18yIDwtIGJpaGFyX2RhdGEgJT4lIGZpbHRlcihmZW1hbGUgPT0gMSwgYWR1bHQgPT0gMSwgaGVpZ2h0X2NtID4gMTIwLCBoZWlnaHRfY20gPCAyMDApICU+JSBzZWxlY3QoLWMocGVyc29uaWQsIGZlbWFsZSwgYWR1bHQpKQ0KaGVhZChiaWhhcl9hZHVsdF9mZW1hbGVzXzIpDQpgYGANCiMjIyBQbG90aW5nIEhpc3RvZ3JhbSBvZiBGZW1hbGUgSGVpZ2h0IGluIEJpaGFyLCBJbmRpYSB1c2luZyBnZ3Bsb3QyDQpgYGB7cn0NCiNJbmNsdWRlIG9ubHkgZmVtYWxlIGFkdWx0cyB3aG9zZSBoZWlnaHRzIGFyZSBiZXR3ZWVuIDEyMCBjbSBhbmQgMjAwIGNtDQpiaWhhcl9hZHVsdF9mZW1hbGVzX3RydW5jIDwtIGZpbHRlcihiaWhhcl9hZHVsdF9mZW1hbGVzLCBoZWlnaHRfY20gPiAxMjAsIGhlaWdodF9jbSA8IDIwMCkNCg0KcF9iaWhhciA8LSBnZ3Bsb3QoYmloYXJfYWR1bHRfZmVtYWxlc190cnVuYywgYWVzKGhlaWdodF9jbSkpDQpwX2JpaGFyICsgZ2VvbV9oaXN0b2dyYW0oZmlsbD0iYmx1ZSIsIGNvbG9yPSJkYXJrYmx1ZSIsIGJpbndpZHRoID0gNSkgKw0KICAgICAgICAgIHhsYWIoIkhlaWdodCBpbiBjZW50aW1ldGVycywgQmloYXIgRmVtYWxlcyIpICsNCiAgICAgICAgICB5bGFiKCJOdW1iZXIgb2YgRmVtYWxlIGFkdWx0cyIpICsNCiAgICAgICAgICBnZ3RpdGxlKCJIZWlnaHQgb2YgQWR1bHQgRmVtYWxlcyBpbiBCaWhhciwgSW5kaWEiKSArDQogICAgICAgICAgdGhlbWUocGxvdC50aXRsZSA9IGVsZW1lbnRfdGV4dChoanVzdCA9IDAuNSkpDQpnZ3NhdmUoImJpaGFyX3Jhdy5wZGYiKQ0KYGBgDQojIyMgUmVhZCBVUyBEYXRhDQpgYGB7cn0NCnVzX2RhdGEgPC0gcmVhZF9jc3YoIkRhdGEgU2V0cy9XZWVrMy9VU19zYW1wbGVfZGF0YS5jc3YiKQ0KdXNfYWR1bHRfZmVtYWxlc190cnVuYyA8LSB1c19kYXRhICU+JSBmaWx0ZXIoZmVtYWxlID09IDEsIGFkdWx0ID09IDEsIGhlaWdodF9jbSA+IDEyMCwgaGVpZ2h0X2NtIDwgMjAwKSAlPiUgc2VsZWN0KC1jKFgxLCBzZXFuLCBmZW1hbGUsIGFkdWx0KSkNCmhlYWQodXNfYWR1bHRfZmVtYWxlc190cnVuYykNCmBgYA0KIyMjIFBsb3RpbmcgSGlzdG9ncmFtIG9mIEZlbWFsZSBIZWlnaHQgaW4gdGhlIFVTIHVzaW5nIGdncGxvdDINCmBgYHtyfQ0KcF9VUyA8LSBnZ3Bsb3QodXNfYWR1bHRfZmVtYWxlc190cnVuYywgYWVzKGhlaWdodF9jbSkpDQpwX1VTICsgZ2VvbV9oaXN0b2dyYW0oZmlsbD0iYmx1ZSIsIGNvbG9yPSJkYXJrYmx1ZSIsIGJpbndpZHRoID0gNSkgKw0KICAgICAgIHhsYWIoIkhlaWdodCBpbiBjZW50aW1ldGVycywgQmloYXIgRmVtYWxlcyIpICsNCiAgICAgICB5bGFiKCJOdW1iZXIgb2YgQWR1bHQgRmVtYWxlcyIpICsNCiAgICAgICBnZ3RpdGxlKCJIZWlnaHQgb2YgQWR1bHQgRmVtYWxlcyBpbiB0aGUgVVMiKSArDQogICAgICAgdGhlbWUocGxvdC50aXRsZSA9IGVsZW1lbnRfdGV4dChoanVzdCA9IDAuNSkpIA0KYGBgDQojIyMgS2VybmVsIERlbnNpdHkgRXN0aW1hdGlvbg0KYGBge3J9DQpwX1VTICsgZ2VvbV9oaXN0b2dyYW0oZGF0YT11c19hZHVsdF9mZW1hbGVzX3RydW5jLCBhZXMoaGVpZ2h0X2NtICwgLi5kZW5zaXR5Li4pLCBmaWxsPSJibHVlIiAsIGNvbG9yPSJkYXJrcmVkIiwgYnJlYWtzID0gc2VxKDEzMCwgMjAwLCA1KSkgKw0KICAgICAgIGdlb21fZGVuc2l0eShrZXJuZWw9ImdhdXNzaWFuIiwgYWVzKGhlaWdodF9jbSksIGJ3ID0gNSkNCmBgYA0KIyMjIEJpaGFyIGFuZCBVUyBIaXN0b2dyYW1zDQpgYGB7cn0NCnBfYmloYXIgKyBnZW9tX2hpc3RvZ3JhbShkYXRhPWJpaGFyX2FkdWx0X2ZlbWFsZXNfdHJ1bmMsIGFlcyhoZWlnaHRfY20sIC4uZGVuc2l0eS4uICksZmlsbCA9ICJibHVlIiwgY29sb3IgPSAiZGFya2JsdWUiKSArDQogICAgICAgICAgZ2VvbV9oaXN0b2dyYW0oYWxwaGEgPSAwLjUsIGRhdGE9dXNfYWR1bHRfZmVtYWxlc190cnVuYywgYWVzKGhlaWdodF9jbSAsIC4uZGVuc2l0eS4uKSwgZmlsbCA9ICJyZWQiLCBjb2xvciA9ICJkYXJrcmVkIikgKw0KICAgICAgICAgIHhsYWIoIkhlaWdodCBpbiBjZW50aW1ldGVycyIpICsNCiAgICAgICAgICBnZ3RpdGxlKCJEaXN0cmlidXRpb24gb2YgSGVpZ2h0IGZvciBGZW1hbGUgQWR1bHRzIGluIEJpaGFyLCBJbmRpYSBhbmQgdGhlIFVTIikgKw0KICAgICAgICAgIHRoZW1lKHBsb3QudGl0bGUgPSBlbGVtZW50X3RleHQoaGp1c3QgPSAwLjUpKSANCmBgYA0KYGBge3J9DQojIG1vcmUgdmlzaWJsZSBhcyBwb2ludHMgDQpwX2JpaGFyICsgZ2VvbV9mcmVxcG9seShkYXRhPWJpaGFyX2FkdWx0X2ZlbWFsZXNfdHJ1bmMsIGFlcyhoZWlnaHRfY20sIC4uZGVuc2l0eS4uICksIGNvbG9yPSJkYXJrYmx1ZSIgKSsNCiAgICAgICAgICBnZW9tX2ZyZXFwb2x5KGRhdGE9dXNfYWR1bHRfZmVtYWxlc190cnVuYywgYWVzKGhlaWdodF9jbSAsIC4uZGVuc2l0eS4uKSwgIGNvbG9yPSJkYXJrcmVkIiApKw0KICAgICAgICAgIHhsYWIoIkhlaWdodCBpbiBjZW50aW1ldGVycyIpDQpgYGANCmBgYHtyfQ0KIyBLZXJuZWwgZGVuc2l0eQ0KcF9iaWhhciArIGdlb21fZGVuc2l0eShkYXRhPWJpaGFyX2FkdWx0X2ZlbWFsZXNfdHJ1bmMsIGFlcyhoZWlnaHRfY20pLCBjb2xvcj0iZGFya2JsdWUiICkrDQogICAgICAgICAgZ2VvbV9kZW5zaXR5KGRhdGE9dXNfYWR1bHRfZmVtYWxlc190cnVuYywgYWVzKGhlaWdodF9jbSksICBjb2xvcj0iZGFya3JlZCIgKSsNCiAgICAgICAgICB4bGFiKCJIZWlnaHQgaW4gY2VudGltZXRlcnMiKQ0KYGBgDQpgYGB7cn0NCiMgUmVwcmVzZW50aW5nIHRoZSBDREYNCg0KcF9iaWhhciArIHN0YXRfZWNkZihkYXRhPWJpaGFyX2FkdWx0X2ZlbWFsZXNfdHJ1bmMsIGFlcyhoZWlnaHRfY20pLCBjb2xvcj0iZGFya2JsdWUiICkrDQogICAgICAgICAgc3RhdF9lY2RmKGRhdGE9dXNfYWR1bHRfZmVtYWxlc190cnVuYywgYWVzKGhlaWdodF9jbSksICBjb2xvcj0iZGFya3JlZCIgKSsNCiAgICAgICAgICB4bGFiKCJIZWlnaHQgaW4gY2VudGltZXRlcnMiKSArIA0KICAgICAgICAgIHlsYWIoIlByb2JhYmlsaXR5IikNCmBgYA0KYGBge3J9DQpiaWhhcl9hZHVsdF9mZW1hbGVzXzIkbG9jYXRpb24gPC0gJ0JpaGFyJw0KdXNfYWR1bHRfZmVtYWxlc190cnVuYyRsb2NhdGlvbiA8LSAnVVMnDQoNCmNvbWJkYXRhIDwtIHJiaW5kKGJpaGFyX2FkdWx0X2ZlbWFsZXNfMiwgdXNfYWR1bHRfZmVtYWxlc190cnVuYykNCnRhaWwoY29tYmRhdGEpDQpgYGANCmBgYHtyfQ0KZ2dwbG90KGNvbWJkYXRhLCBhZXMoaGVpZ2h0X2NtLCBmaWxsID0gbG9jYXRpb24pKSArIA0KICBnZW9tX2RlbnNpdHkoYWxwaGEgPSAwLjIpICsgDQogIHhsYWIoIkhlaWdodCBpbiBjbSIpICsNCiAgZ2d0aXRsZSgiSGVpZ2h0IG9mIEZlbWFsZSBBZHVsdHMiKSArDQogIHRoZW1lKHBsb3QudGl0bGUgPSBlbGVtZW50X3RleHQoaGp1c3QgPSAwLjUpKQ0KYGBg