The following notebook uses two data sets, one from Bihar, India and the other from the US. Each data set includes 6 variables : pesonid, female (boolean), adult(boolean), age, height_cm, and weight_kg are numeric. The notebook looks at the distribution of female adults in both locations for illustration purposes. For that, we use tidyverse for data manipulation and ggplot2 from exploring the data.

Load the Packages

library(ggplot2)
library(tidyverse)
require(cowplot)

Read Bihar, India Data

#load in the data 
bihar_data <- read_csv("Data Sets/Week3/Bihar_sample_data.csv")
Parsed with column specification:
cols(
  personid = col_double(),
  female = col_double(),
  adult = col_double(),
  age = col_double(),
  height_cm = col_double(),
  weight_kg = col_double()
)
#have a look
head(bihar_data)

Keep Female Adults, Bihar

bihar_adult_females <- bihar_data[bihar_data$female == 1 & bihar_data$adult == 1, -which(names(bihar_data) %in% c("female", "adult"))]

# OR using dplyr
bihar_adult_females_2 <- bihar_data %>% filter(female == 1, adult == 1, height_cm > 120, height_cm < 200) %>% select(-c(personid, female, adult))
head(bihar_adult_females_2)

Ploting Histogram of Female Height in Bihar, India using ggplot2

#Include only female adults whose heights are between 120 cm and 200 cm
Warning messages:
1: In readChar(file, size, TRUE) : truncating string with embedded nuls
2: In readChar(file, size, TRUE) : truncating string with embedded nuls
bihar_adult_females_trunc <- filter(bihar_adult_females, height_cm > 120, height_cm < 200)
p_bihar <- ggplot(bihar_adult_females_trunc, aes(height_cm))
p_bihar + geom_histogram(fill="blue", color="darkblue", binwidth = 5) +
          xlab("Height in centimeters, Bihar Females") +
          ylab("Number of Female adults") +
          ggtitle("Height of Adult Females in Bihar, India") +
          theme(plot.title = element_text(hjust = 0.5))
ggsave("bihar_raw.pdf")
Saving 7.29 x 4.5 in image

Read US Data

us_data <- read_csv("Data Sets/Week3/US_sample_data.csv")
Missing column names filled in: 'X1' [1]Parsed with column specification:
cols(
  X1 = col_double(),
  seqn = col_double(),
  female = col_double(),
  adult = col_double(),
  age = col_double(),
  height_cm = col_double(),
  weight_kg = col_double()
)
us_adult_females_trunc <- us_data %>% filter(female == 1, adult == 1, height_cm > 120, height_cm < 200) %>% select(-c(X1, seqn, female, adult))
head(us_adult_females_trunc)

Ploting Histogram of Female Height in the US using ggplot2

p_US <- ggplot(us_adult_females_trunc, aes(height_cm))
p_US + geom_histogram(fill="blue", color="darkblue", binwidth = 5) +
       xlab("Height in centimeters, Bihar Females") +
       ylab("Number of Adult Females") +
       ggtitle("Height of Adult Females in the US") +
       theme(plot.title = element_text(hjust = 0.5)) 

Kernel Density Estimation

p_US + geom_histogram(data=us_adult_females_trunc, aes(height_cm , ..density..), fill="blue" , color="darkred", breaks = seq(130, 200, 5)) +
       geom_density(kernel="gaussian", aes(height_cm), bw = 5)

Bihar and US Histograms

p_bihar + geom_histogram(data=bihar_adult_females_trunc, aes(height_cm, ..density.. ),fill = "blue", color = "darkblue") +
          geom_histogram(alpha = 0.5, data=us_adult_females_trunc, aes(height_cm , ..density..), fill = "red", color = "darkred") +
          xlab("Height in centimeters") +
          ggtitle("Distribution of Height for Female Adults in Bihar, India and the US") +
          theme(plot.title = element_text(hjust = 0.5)) 

# more visible as points 

ggplot(bihar_adult_females_trunc, aes(height_cm))+
  geom_freqpoly(data=bihar_adult_females_trunc, aes(height_cm, ..density.. ), color="darkblue" )+
  geom_freqpoly(data=us_adult_females_trunc, aes(height_cm , ..density..),  color="darkred" )+
  xlab("Height in centimeters")

# Kernel density

ggplot(bihar_adult_females_trunc, aes(height_cm))+
  geom_density(data=bihar_adult_females_trunc, aes(height_cm), color="darkblue" )+
  geom_density(data=us_adult_females_trunc, aes(height_cm),  color="darkred" )+
  xlab("Height in centimeters")

# Representing the CDF

p_bihar + stat_ecdf(data=bihar_adult_females_trunc, aes(height_cm), color="darkblue" )+
          stat_ecdf(data=us_adult_females_trunc, aes(height_cm),  color="darkred" )+
          xlab("Height in centimeters") + 
          ylab("Probability")

bihar_adult_females_2$location <- 'Bihar'
us_adult_females_trunc$location <- 'US'

combdata <- rbind(bihar_adult_females_2, us_adult_females_trunc)
tail(combdata)
ggplot(combdata, aes(height_cm, fill = location)) + 
  geom_density(alpha = 0.2) + 
  xlab("Height in cm") +
  ggtitle("Height of Female Adults") +
  theme(plot.title = element_text(hjust = 0.5))

LS0tDQp0aXRsZTogIlByb2JhYmlsaXR5IERpc3RyaWJ1dGlvbiINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KLS0tDQoNClRoZSBmb2xsb3dpbmcgbm90ZWJvb2sgdXNlcyB0d28gZGF0YSBzZXRzLCBvbmUgZnJvbSBCaWhhciwgSW5kaWEgYW5kIHRoZSBvdGhlciBmcm9tIHRoZSBVUy4gRWFjaCBkYXRhIHNldCBpbmNsdWRlcyA2IHZhcmlhYmxlcyA6IHBlc29uaWQsIGZlbWFsZSAoYm9vbGVhbiksIGFkdWx0KGJvb2xlYW4pLCBhZ2UsIGhlaWdodF9jbSwgYW5kIHdlaWdodF9rZyBhcmUgbnVtZXJpYy4gVGhlIG5vdGVib29rIGxvb2tzIGF0IHRoZSBkaXN0cmlidXRpb24gb2YgZmVtYWxlIGFkdWx0cyBpbiBib3RoIGxvY2F0aW9ucyBmb3IgaWxsdXN0cmF0aW9uIHB1cnBvc2VzLiBGb3IgdGhhdCwgd2UgdXNlIHRpZHl2ZXJzZSBmb3IgZGF0YSBtYW5pcHVsYXRpb24gYW5kIGdncGxvdDIgZnJvbSBleHBsb3JpbmcgdGhlIGRhdGEuIA0KDQojIyMgTG9hZCB0aGUgUGFja2FnZXMNCmBgYHtyfQ0KbGlicmFyeShnZ3Bsb3QyKQ0KbGlicmFyeSh0aWR5dmVyc2UpDQpyZXF1aXJlKGNvd3Bsb3QpDQpgYGANCg0KIyMjIFJlYWQgQmloYXIsIEluZGlhIERhdGENCmBgYHtyIHNldHVwLCBpbmNsdWRlPUZBTFNFLCBlY2hvPUZBTFNFfQ0KcmVxdWlyZSgia25pdHIiKQ0Kb3B0c19rbml0JHNldChyb290LmRpciA9ICJDOi9Vc2Vycy9UYXJlay9Eb2N1bWVudHMvTUlUeCAxNC4zMTB4IC0gRGF0YSBBbmFseXNpcyBmb3IgU29jaWFsIFNjaWVudGlzdHMiKQ0KYGBgDQoNCmBgYHtyfQ0KI2xvYWQgaW4gdGhlIGRhdGEgDQpiaWhhcl9kYXRhIDwtIHJlYWRfY3N2KCJEYXRhIFNldHMvV2VlazMvQmloYXJfc2FtcGxlX2RhdGEuY3N2IikNCg0KI2hhdmUgYSBsb29rDQpoZWFkKGJpaGFyX2RhdGEpDQpgYGANCg0KIyMjIEtlZXAgRmVtYWxlIEFkdWx0cywgQmloYXINCmBgYHtyfQ0KYmloYXJfYWR1bHRfZmVtYWxlcyA8LSBiaWhhcl9kYXRhW2JpaGFyX2RhdGEkZmVtYWxlID09IDEgJiBiaWhhcl9kYXRhJGFkdWx0ID09IDEsIC13aGljaChuYW1lcyhiaWhhcl9kYXRhKSAlaW4lIGMoImZlbWFsZSIsICJhZHVsdCIpKV0NCg0KIyBPUiB1c2luZyBkcGx5cg0KYmloYXJfYWR1bHRfZmVtYWxlc18yIDwtIGJpaGFyX2RhdGEgJT4lIGZpbHRlcihmZW1hbGUgPT0gMSwgYWR1bHQgPT0gMSwgaGVpZ2h0X2NtID4gMTIwLCBoZWlnaHRfY20gPCAyMDApICU+JSBzZWxlY3QoLWMocGVyc29uaWQsIGZlbWFsZSwgYWR1bHQpKQ0KaGVhZChiaWhhcl9hZHVsdF9mZW1hbGVzXzIpDQpgYGANCiMjIyBQbG90aW5nIEhpc3RvZ3JhbSBvZiBGZW1hbGUgSGVpZ2h0IGluIEJpaGFyLCBJbmRpYSB1c2luZyBnZ3Bsb3QyDQpgYGB7cn0NCiNJbmNsdWRlIG9ubHkgZmVtYWxlIGFkdWx0cyB3aG9zZSBoZWlnaHRzIGFyZSBiZXR3ZWVuIDEyMCBjbSBhbmQgMjAwIGNtDQpiaWhhcl9hZHVsdF9mZW1hbGVzX3RydW5jIDwtIGZpbHRlcihiaWhhcl9hZHVsdF9mZW1hbGVzLCBoZWlnaHRfY20gPiAxMjAsIGhlaWdodF9jbSA8IDIwMCkNCg0KcF9iaWhhciA8LSBnZ3Bsb3QoYmloYXJfYWR1bHRfZmVtYWxlc190cnVuYywgYWVzKGhlaWdodF9jbSkpDQpwX2JpaGFyICsgZ2VvbV9oaXN0b2dyYW0oZmlsbD0iYmx1ZSIsIGNvbG9yPSJkYXJrYmx1ZSIsIGJpbndpZHRoID0gNSkgKw0KICAgICAgICAgIHhsYWIoIkhlaWdodCBpbiBjZW50aW1ldGVycywgQmloYXIgRmVtYWxlcyIpICsNCiAgICAgICAgICB5bGFiKCJOdW1iZXIgb2YgRmVtYWxlIGFkdWx0cyIpICsNCiAgICAgICAgICBnZ3RpdGxlKCJIZWlnaHQgb2YgQWR1bHQgRmVtYWxlcyBpbiBCaWhhciwgSW5kaWEiKSArDQogICAgICAgICAgdGhlbWUocGxvdC50aXRsZSA9IGVsZW1lbnRfdGV4dChoanVzdCA9IDAuNSkpDQpnZ3NhdmUoImJpaGFyX3Jhdy5wZGYiKQ0KYGBgDQojIyMgUmVhZCBVUyBEYXRhDQpgYGB7cn0NCnVzX2RhdGEgPC0gcmVhZF9jc3YoIkRhdGEgU2V0cy9XZWVrMy9VU19zYW1wbGVfZGF0YS5jc3YiKQ0KdXNfYWR1bHRfZmVtYWxlc190cnVuYyA8LSB1c19kYXRhICU+JSBmaWx0ZXIoZmVtYWxlID09IDEsIGFkdWx0ID09IDEsIGhlaWdodF9jbSA+IDEyMCwgaGVpZ2h0X2NtIDwgMjAwKSAlPiUgc2VsZWN0KC1jKFgxLCBzZXFuLCBmZW1hbGUsIGFkdWx0KSkNCmhlYWQodXNfYWR1bHRfZmVtYWxlc190cnVuYykNCmBgYA0KIyMjIFBsb3RpbmcgSGlzdG9ncmFtIG9mIEZlbWFsZSBIZWlnaHQgaW4gdGhlIFVTIHVzaW5nIGdncGxvdDINCmBgYHtyfQ0KcF9VUyA8LSBnZ3Bsb3QodXNfYWR1bHRfZmVtYWxlc190cnVuYywgYWVzKGhlaWdodF9jbSkpDQpwX1VTICsgZ2VvbV9oaXN0b2dyYW0oZmlsbD0iYmx1ZSIsIGNvbG9yPSJkYXJrYmx1ZSIsIGJpbndpZHRoID0gNSkgKw0KICAgICAgIHhsYWIoIkhlaWdodCBpbiBjZW50aW1ldGVycywgQmloYXIgRmVtYWxlcyIpICsNCiAgICAgICB5bGFiKCJOdW1iZXIgb2YgQWR1bHQgRmVtYWxlcyIpICsNCiAgICAgICBnZ3RpdGxlKCJIZWlnaHQgb2YgQWR1bHQgRmVtYWxlcyBpbiB0aGUgVVMiKSArDQogICAgICAgdGhlbWUocGxvdC50aXRsZSA9IGVsZW1lbnRfdGV4dChoanVzdCA9IDAuNSkpIA0KYGBgDQojIyMgS2VybmVsIERlbnNpdHkgRXN0aW1hdGlvbg0KYGBge3J9DQpwX1VTICsgZ2VvbV9oaXN0b2dyYW0oZGF0YT11c19hZHVsdF9mZW1hbGVzX3RydW5jLCBhZXMoaGVpZ2h0X2NtICwgLi5kZW5zaXR5Li4pLCBmaWxsPSJibHVlIiAsIGNvbG9yPSJkYXJrcmVkIiwgYnJlYWtzID0gc2VxKDEzMCwgMjAwLCA1KSkgKw0KICAgICAgIGdlb21fZGVuc2l0eShrZXJuZWw9ImdhdXNzaWFuIiwgYWVzKGhlaWdodF9jbSksIGJ3ID0gNSkNCmBgYA0KIyMjIEJpaGFyIGFuZCBVUyBIaXN0b2dyYW1zDQpgYGB7cn0NCnBfYmloYXIgKyBnZW9tX2hpc3RvZ3JhbShkYXRhPWJpaGFyX2FkdWx0X2ZlbWFsZXNfdHJ1bmMsIGFlcyhoZWlnaHRfY20sIC4uZGVuc2l0eS4uICksZmlsbCA9ICJibHVlIiwgY29sb3IgPSAiZGFya2JsdWUiKSArDQogICAgICAgICAgZ2VvbV9oaXN0b2dyYW0oYWxwaGEgPSAwLjUsIGRhdGE9dXNfYWR1bHRfZmVtYWxlc190cnVuYywgYWVzKGhlaWdodF9jbSAsIC4uZGVuc2l0eS4uKSwgZmlsbCA9ICJyZWQiLCBjb2xvciA9ICJkYXJrcmVkIikgKw0KICAgICAgICAgIHhsYWIoIkhlaWdodCBpbiBjZW50aW1ldGVycyIpICsNCiAgICAgICAgICBnZ3RpdGxlKCJEaXN0cmlidXRpb24gb2YgSGVpZ2h0IGZvciBGZW1hbGUgQWR1bHRzIGluIEJpaGFyLCBJbmRpYSBhbmQgdGhlIFVTIikgKw0KICAgICAgICAgIHRoZW1lKHBsb3QudGl0bGUgPSBlbGVtZW50X3RleHQoaGp1c3QgPSAwLjUpKSANCmBgYA0KYGBge3J9DQojIG1vcmUgdmlzaWJsZSBhcyBwb2ludHMgDQpwX2JpaGFyICsgZ2VvbV9mcmVxcG9seShkYXRhPWJpaGFyX2FkdWx0X2ZlbWFsZXNfdHJ1bmMsIGFlcyhoZWlnaHRfY20sIC4uZGVuc2l0eS4uICksIGNvbG9yPSJkYXJrYmx1ZSIgKSsNCiAgICAgICAgICBnZW9tX2ZyZXFwb2x5KGRhdGE9dXNfYWR1bHRfZmVtYWxlc190cnVuYywgYWVzKGhlaWdodF9jbSAsIC4uZGVuc2l0eS4uKSwgIGNvbG9yPSJkYXJrcmVkIiApKw0KICAgICAgICAgIHhsYWIoIkhlaWdodCBpbiBjZW50aW1ldGVycyIpDQpgYGANCmBgYHtyfQ0KIyBLZXJuZWwgZGVuc2l0eQ0KcF9iaWhhciArIGdlb21fZGVuc2l0eShkYXRhPWJpaGFyX2FkdWx0X2ZlbWFsZXNfdHJ1bmMsIGFlcyhoZWlnaHRfY20pLCBjb2xvcj0iZGFya2JsdWUiICkrDQogICAgICAgICAgZ2VvbV9kZW5zaXR5KGRhdGE9dXNfYWR1bHRfZmVtYWxlc190cnVuYywgYWVzKGhlaWdodF9jbSksICBjb2xvcj0iZGFya3JlZCIgKSsNCiAgICAgICAgICB4bGFiKCJIZWlnaHQgaW4gY2VudGltZXRlcnMiKQ0KYGBgDQpgYGB7cn0NCiMgUmVwcmVzZW50aW5nIHRoZSBDREYNCg0KcF9iaWhhciArIHN0YXRfZWNkZihkYXRhPWJpaGFyX2FkdWx0X2ZlbWFsZXNfdHJ1bmMsIGFlcyhoZWlnaHRfY20pLCBjb2xvcj0iZGFya2JsdWUiICkrDQogICAgICAgICAgc3RhdF9lY2RmKGRhdGE9dXNfYWR1bHRfZmVtYWxlc190cnVuYywgYWVzKGhlaWdodF9jbSksICBjb2xvcj0iZGFya3JlZCIgKSsNCiAgICAgICAgICB4bGFiKCJIZWlnaHQgaW4gY2VudGltZXRlcnMiKSArIA0KICAgICAgICAgIHlsYWIoIlByb2JhYmlsaXR5IikNCmBgYA0KYGBge3J9DQpiaWhhcl9hZHVsdF9mZW1hbGVzXzIkbG9jYXRpb24gPC0gJ0JpaGFyJw0KdXNfYWR1bHRfZmVtYWxlc190cnVuYyRsb2NhdGlvbiA8LSAnVVMnDQoNCmNvbWJkYXRhIDwtIHJiaW5kKGJpaGFyX2FkdWx0X2ZlbWFsZXNfMiwgdXNfYWR1bHRfZmVtYWxlc190cnVuYykNCnRhaWwoY29tYmRhdGEpDQpgYGANCmBgYHtyfQ0KZ2dwbG90KGNvbWJkYXRhLCBhZXMoaGVpZ2h0X2NtLCBmaWxsID0gbG9jYXRpb24pKSArIA0KICBnZW9tX2RlbnNpdHkoYWxwaGEgPSAwLjIpICsgDQogIHhsYWIoIkhlaWdodCBpbiBjbSIpICsNCiAgZ2d0aXRsZSgiSGVpZ2h0IG9mIEZlbWFsZSBBZHVsdHMiKSArDQogIHRoZW1lKHBsb3QudGl0bGUgPSBlbGVtZW50X3RleHQoaGp1c3QgPSAwLjUpKQ0KYGBg