set the library with the packages we use

library(ggplot2)
library(tidyverse)
require(cowplot)#makes the grids

Setting the Correct Directory

setwd("C:/Users/nirma/Documents/EDX courses/MicroMaster MIT/14.310x-Data Analysis for Social Scientists/Programs/")

Loading Required Data

bihar_data <- read_csv("Bihar_sample_data.csv")

Checkin the Data

print(bihar_data)

# A tibble: 39,553 x 6
   personid female adult   age height_cm weight_kg
      <dbl>  <dbl> <dbl> <dbl>     <dbl>     <dbl>
 1 11010101      0     1    70      164.      48.9
 2 11010102      0     1    32      157.      44  
 3 11010103      1     1    28      150.      37.7
 4 11010104      0     0    12      146.      30.7
 5 11010105      1     0    11      135.      30.2
 6 11010201      0     1    38      174.      67.7
 7 11010202      1     1    30      140.      57.3
 8 11010203      0     0    15      163.      59.3
 9 11010204      0     0    10      149.      40.7
10 11010205      1     0    16      153.      43.9
# ... with 39,543 more rows

Subsetting Data, Keeping Females Only Using filter Command

bihar_adult_females <-filter(bihar_data, adult==1,female==1) 
print(bihar_adult_females)#checking if that worked

# A tibble: 11,664 x 6
   personid female adult   age height_cm weight_kg
      <dbl>  <dbl> <dbl> <dbl>     <dbl>     <dbl>
 1 11010103      1     1    28      150.      37.7
 2 11010202      1     1    30      140.      57.3
 3 11010207      1     1    35      148.      38.9
 4 11010302      1     1    48      145.      35.7
 5 11010303      1     1    22       NA       NA  
 6 11010306      1     1    18       NA       NA  
 7 11010308      1     1    28      145.      42.4
 8 11010402      1     1    58      156.      51.1
 9 11010404      1     1    36      156.      50.7
10 11010407      1     1    55      156.      47.2
# ... with 11,654 more rows

Creating a Default Histogram Using ggplot

ggplot(bihar_adult_females, aes(height_cm))+
  geom_histogram()

Removing Outliers

Because some people look like they are very small

bihar_adult_females_trunc <-filter(bihar_adult_females, height_cm>120, height_cm<200)

Better Histogram: nicer label, and some color

ggplot(bihar_adult_females_trunc, aes(height_cm))+
  geom_histogram(fill="blue", color="darkblue")+
  xlab("Height in centimeters, Bihar Females")

playing with the Bins

Bin Width = 5

bihar1 <- ggplot(bihar_adult_females_trunc, aes(height_cm))+
  geom_histogram(fill="blue", color="darkblue", binwidth = 5)+
  xlab("bin width=5")+
  ylab("")
bihar1

Bin Width = 10

bihar2 <- ggplot(bihar_adult_females_trunc, aes(height_cm))+
  geom_histogram(fill="blue", color="darkblue", binwidth = 10)+
  xlab("bin width=10")+
  ylab("")
#ggsave("output/bihar_bin10.pdf") if we want to save the plot in our local drive
bihar2

Bin Width = 20

bihar3 <- ggplot(bihar_adult_females_trunc, aes(height_cm))+
  geom_histogram(fill="blue", color="darkblue", binwidth = 20)+
  xlab("bin width=20")+
  ylab("")
bihar3

Bin Width = 50

bihar4 <- ggplot(bihar_adult_females_trunc, aes(height_cm))+
  geom_histogram(fill="blue", color="darkblue", binwidth = 50)+
  xlab("bin width=50")+
  ylab("")
bihar4

Putting all these Plot Together for Better Comparison

plot_grid(bihar1, bihar2, bihar3, bihar4, 
          labels="Female Height in Bihar", 
          hjust=-1, vjust=0.2)#the function cowplot is used here

Loading A Second Data

Source: US Data from National Health and Nutrition Examination Survey

us_data <- read_csv("US_sample_data.csv")
str(us_data)

tibble [9,813 x 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ X1       : num [1:9813] 1 2 3 4 5 6 7 8 9 10 ...
 $ seqn     : num [1:9813] 73557 73558 73559 73560 73561 ...
 $ female   : num [1:9813] 0 0 0 0 1 0 0 1 1 0 ...
 $ adult    : num [1:9813] 1 1 1 0 1 1 0 1 1 1 ...
 $ age      : num [1:9813] 69 54 72 9 73 56 0 61 56 65 ...
 $ height_cm: num [1:9813] 171 177 175 137 162 ...
 $ weight_kg: num [1:9813] 78.3 89.5 88.9 32.2 52 105 7.4 93.4 61.8 65.3 ...
 - attr(*, "spec")=
  .. cols(
  ..   X1 = col_double(),
  ..   seqn = col_double(),
  ..   female = col_double(),
  ..   adult = col_double(),
  ..   age = col_double(),
  ..   height_cm = col_double(),
  ..   weight_kg = col_double()
  .. )

Subsetting the Data for Adult Females Only

us_adult_females_trunc <- filter(us_data,female==1 , adult==1, height_cm>120 , height_cm<200)
str(us_adult_females_trunc)

tibble [2,969 x 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ X1       : num [1:2969] 5 8 9 11 16 22 24 38 39 46 ...
 $ seqn     : num [1:2969] 73561 73564 73566 73568 73574 ...
 $ female   : num [1:2969] 1 1 1 1 1 1 1 1 1 1 ...
 $ adult    : num [1:2969] 1 1 1 1 1 1 1 1 1 1 ...
 $ age      : num [1:2969] 73 61 56 26 33 38 23 57 50 69 ...
 $ height_cm: num [1:2969] 162 162 153 152 158 ...
 $ weight_kg: num [1:2969] 52 93.4 61.8 47.1 56.8 ...
 - attr(*, "spec")=
  .. cols(
  ..   X1 = col_double(),
  ..   seqn = col_double(),
  ..   female = col_double(),
  ..   adult = col_double(),
  ..   age = col_double(),
  ..   height_cm = col_double(),
  ..   weight_kg = col_double()
  .. )

Plotting the US women in One Go

ggplot(us_adult_females_trunc, aes(height_cm))+
  geom_histogram(fill="red", color="darkred")+
  xlab("Height in centimeters, US females")

Kernel Density Estimation: An Alternative to Histogram

ggplot(us_adult_females_trunc, aes(height_cm))+
  geom_histogram(data=us_adult_females_trunc, aes(height_cm , ..density..), fill="white" , color="darkred")+
  geom_density(kernel="gaussian", aes(height_cm))

Playing with the Bandwidth

As we played around the Bin Width in Histogram, we can play around Bandwidth in Kernal Density Estimation, for example:

Bandwidth = 1

US1 <- ggplot(us_adult_females_trunc, aes(height_cm))+
  geom_histogram(data=us_adult_females_trunc, aes(height_cm , ..density..), fill="white" , color="darkred")+
  geom_density(kernel="gaussian", aes(height_cm), bw=1)+
  xlab("bw=1")+
  ylab("")

Bandwidth = 5

US2 <- ggplot(us_adult_females_trunc, aes(height_cm))+
  geom_histogram(data=us_adult_females_trunc, aes(height_cm , ..density..), fill="white" , color="darkred")+
  geom_density(kernel="gaussian", aes(height_cm), bw=5)+
  xlab("bw=5")+
  ylab("")

Bandwidth = 10

US3 <- ggplot(us_adult_females_trunc, aes(height_cm))+
  geom_histogram(data=us_adult_females_trunc, aes(height_cm , ..density..), fill="white" , color="darkred")+
  geom_density(kernel="gaussian", aes(height_cm), bw=10)+
  xlab("bw=10")+
  ylab("")

Bandwidth = 20

US4 <- ggplot(us_adult_females_trunc, aes(height_cm))+
  geom_histogram(data=us_adult_females_trunc, aes(height_cm , ..density..), fill="white" , color="darkred")+
  geom_density(kernel="gaussian", aes(height_cm), bw=20)+
  xlab("bw=20")+
  ylab("")

Putting all these Kernel Density Plots together for Better Comparison

plot_grid(US1, US2, US3, US4, 
          labels="Female Height in the US", 
          hjust=-1, vjust=0.2)

Combining the Two Histograms

ggplot()+
  geom_histogram(data=bihar_adult_females_trunc, aes(height_cm),fill="blue", color="darkblue" )+
  geom_histogram(data=us_adult_females_trunc, aes(height_cm), fill="red", color="darkred" )

Looking at the histograms, one can mistakenly deduce that Women in India are comparatively taller than women in the US. Thus, this comparative histogram is deceiving. It is not good to use count on the Y-axis. We can rather use density, which makes more sense. Let’s try:

Adding Density to the Existing Histogram

ggplot()+
  geom_histogram(data=bihar_adult_females_trunc, aes(height_cm, ..density.. ),fill="blue", color="darkblue")+
  geom_histogram(data=us_adult_females_trunc, aes(height_cm , ..density..), fill="red", color="darkred" )+
  xlab("Height in centimeters")

These histograms make a little more sense. However, there is some overlapping going on. We are not actually able to see what’s going on on the right hand side of the Bihar plot. So is may be wise to use something that shows the frequency points rather than the bars themselves. Let’s give a try:

more visible as points

ggplot()+
  geom_freqpoly(data=bihar_adult_females_trunc, aes(height_cm, ..density.. ), color="darkblue" )+
  geom_freqpoly(data=us_adult_females_trunc, aes(height_cm , ..density..),  color="darkred" )+
  xlab("Height in centimeters")

Amazing! Everything is visible, and we can have a whole some picture of height distribution of women in Bihar and the US, and how they compare.

Let’s Try: Kernel Density

ggplot()+
  geom_density(data=bihar_adult_females_trunc, aes(height_cm), color="darkblue" )+
  geom_density(data=us_adult_females_trunc, aes(height_cm),  color="darkred" )+
  xlab("Height in centimeters")

We were seeing pointy tips and kinks in the previous plots, however, the tips in this plot seem smoother. It looks definitely better but it is always a trade-off between information in the data and the smoothness of the plots.

Representing the CDF

ggplot()+
  stat_ecdf(data=bihar_adult_females_trunc, aes(height_cm), color="darkblue" )+
  stat_ecdf(data=us_adult_females_trunc, aes(height_cm),  color="darkred" )+
  xlab("Height in centimeters")

The function ‘stat_ecdf’ helps us calculate Cumulative Distribution Function. It looks like the CDF is much better option in comparison. The CDF itself is the integral of the density.

Professor Duflo at MIT thinks that to some extent representing a distribution with a PDF versus a CDF is a matter of (A) choice and (B)what do we want to study.

With the PDF version of the plot we see better the fatness or symmetry, etc. In addition, the shape of one particular distribution is much more visible with a PDF than with a CDF. So, while I am visualizing the data data, we start with the PDF first. Then, when we want to compare to a distribution, then, personally, the professor prefers the CDF.

Explanotry Data Analysis

Nirmal Ghimire

2/27/2021