The assignment tasks students with selecting a messy dataset posted by a peer and cleaning it for analysis, both as set forth in the peer’s post and beyond. For our dataset, we selected Maria’s dataset from UNICEF, and I took up her question regarding newborn, infant, and young child nutrition, as well as analyses on water / hygiene and social protection. Prior to importing the data, I performed several operations on the dataset to facilitate importing and analysis: changing all “-” entries to “NA”; converting percentages to decimals; stripping out the footnotes at the bottom; dropping tabs that are ancillary to the analysis at hand so that the data could be saved as .csv; and dropping extraneous columns and rows.
# Load the manually pre-cleaned UNICEF nutrition CSV from GitHub and preview
# the first rows to confirm that the import worked.
nutrition <- read.csv(
  "https://raw.githubusercontent.com/evanmclaughlin/ECM607/master/UNICEF_Table-7-Nutrition-EN.csv"
)
head(nutrition)
## Country Low.birthweight Unweighed.at.birth
## 1 Afghanistan NA 0.8626837
## 2 Albania 0.04587830 0.1316733
## 3 Algeria 0.07252492 0.1142705
## 4 Andorra 0.07445406 0.1430000
## 5 Angola 0.15256593 0.4480227
## 6 Anguilla NA NA
## Early.initiation.of.breastfeeding Exclusive.breastfeeding...6.months.
## 1 0.6280000 0.575
## 2 0.5652210 0.36538513
## 3 0.3571079 0.25391212
## 4 NA <NA>
## 5 0.4831644 0.37381931
## 6 NA <NA>
## Introduction.to.solid..semi.solid.or.soft.foods Breastfeeding...All
## 1 0.6099433 0.7380154
## 2 0.8850852 0.4320668
## 3 0.7717722 0.3554286
## 4 NA NA
## 5 0.7876945 0.6656777
## 6 NA NA
## Breastfeeding...Poorest.20. Breastfeeding...Richest.20.
## 1 0.8009494 0.6956284
## 2 0.3817219 0.3657669
## 3 0.3531280 0.3357238
## 4 NA NA
## 5 0.7436154 0.5259578
## 6 NA NA
## Minimum.diet.diversity..6.23.months. Minimum.meal.frequency..6.23.months.
## 1 0.2205860 0.5121235
## 2 0.5249851 0.5140581
## 3 NA 0.5204865
## 4 NA NA
## 5 0.2906923 0.3276945
## 6 NA NA
## Minimum.acceptable.diet..6.23.months.
## 1 0.1549437
## 2 0.2924022
## 3 NA
## 4 NA
## 5 0.1326344
## 6 NA
## Zero.vegetable.or.fruit.consumption..6.23.months. Region X X.1
## 1 0.5857060 NA NA
## 2 0.2573565 NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 0.3635851 NA NA
## 6 NA LATAM - Caribbean NA NA
## X.2 X.3 X.4 X.5 X.6 X.7 X.8
## 1 NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA
## 4 NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA
## 6 NA NA NA NA NA NA NA
# Give the country column and the variables analysed below short snake_case
# names in one vectorized assignment; positions refer to the imported CSV
# layout, and every other column keeps its auto-generated name.
names(nutrition)[c(1, 2, 4, 5, 7, 13, 14)] <- c(
  "countries",
  "low_birthweight",
  "early_breast",
  "exclusive_breast",
  "breast_all",
  "zero_veg",
  "region"
)
# head(nutrition)
# Build the working subset from the renamed columns only (same columns and
# order as positions 1, 2, 4, 5, 7, 13 and 14), then preview it.
nut_ext <- dplyr::select(
  nutrition,
  countries, low_birthweight, early_breast, exclusive_breast,
  breast_all, zero_veg, region
)
head(nut_ext)
## countries low_birthweight early_breast exclusive_breast breast_all
## 1 Afghanistan NA 0.6280000 0.575 0.7380154
## 2 Albania 0.04587830 0.5652210 0.36538513 0.4320668
## 3 Algeria 0.07252492 0.3571079 0.25391212 0.3554286
## 4 Andorra 0.07445406 NA <NA> NA
## 5 Angola 0.15256593 0.4831644 0.37381931 0.6656777
## 6 Anguilla NA NA <NA> NA
## zero_veg region
## 1 0.5857060
## 2 0.2573565
## 3 NA
## 4 NA
## 5 0.3635851
## 6 NA LATAM - Caribbean
## Restrict the data to the desired LATAM - Caribbean subset and print it.
la_nut <- dplyr::filter(nut_ext, region == "LATAM - Caribbean")
la_nut
## countries low_birthweight early_breast
## 1 Anguilla NA NA
## 2 Antigua and Barbuda 0.09054119 NA
## 3 Argentina 0.07346758 0.5274035
## 4 Bahamas 0.13135983 NA
## 5 Barbados NA 0.4029047
## 6 Belize 0.08598801 0.6825412
## 7 Bolivia (Plurinational State of) 0.07222335 0.5500000
## 8 Brazil 0.08384270 0.4290000
## 9 British Virgin Islands NA NA
## 10 Chile 0.06246671 NA
## 11 Colombia 0.09955593 0.7200000
## 12 Costa Rica 0.07477285 0.5962632
## 13 Cuba 0.05263557 0.4787731
## 14 Dominica NA NA
## 15 Dominican Republic 0.11294179 0.3807571
## 16 Ecuador 0.11180641 0.5460000
## 17 El Salvador 0.10295944 0.4200486
## 18 Grenada NA NA
## 19 Guatemala 0.10957713 0.6313348
## 20 Haiti NA 0.4735259
## 21 Honduras 0.10897210 0.6379734
## 22 Jamaica 0.14582915 0.6471896
## 23 Mexico 0.07868743 0.5103613
## 24 Montserrat NA NA
## 25 Nicaragua 0.10669083 0.5440000
## 26 Panama 0.10087104 0.4697620
## 27 Paraguay 0.08090015 0.4954847
## 28 Peru 0.09403629 0.4970000
## 29 Saint Kitts and Nevis NA NA
## 30 Saint Lucia NA 0.4958285
## 31 Saint Vincent and the Grenadines NA NA
## 32 Suriname 0.14658528 0.4468282
## 33 Trinidad and Tobago 0.12392281 0.4600000
## 34 Turks and Caicos Islands NA NA
## 35 Uruguay 0.07616777 0.7651041
## 36 Venezuela (Bolivarian Republic of) 0.09104774 NA
## exclusive_breast breast_all zero_veg region
## 1 <NA> NA NA LATAM - Caribbean
## 2 <NA> NA NA LATAM - Caribbean
## 3 0.31951948 0.3910114 NA LATAM - Caribbean
## 4 <NA> NA NA LATAM - Caribbean
## 5 0.19696945 0.4108116 NA LATAM - Caribbean
## 6 0.33164608 0.4713115 0.30324936 LATAM - Caribbean
## 7 0.583 0.5530000 0.19808907 LATAM - Caribbean
## 8 0.386 NA NA LATAM - Caribbean
## 9 <NA> NA NA LATAM - Caribbean
## 10 <NA> NA NA LATAM - Caribbean
## 11 0.361 0.4475464 NA LATAM - Caribbean
## 12 0.32540791 0.3977990 NA LATAM - Caribbean
## 13 0.32832527 0.3068782 0.26882854 LATAM - Caribbean
## 14 <NA> NA NA LATAM - Caribbean
## 15 0.04564405 0.1994798 0.34918953 LATAM - Caribbean
## 16 0.396 NA NA LATAM - Caribbean
## 17 0.46734215 0.6678928 0.15860823 LATAM - Caribbean
## 18 <NA> NA NA LATAM - Caribbean
## 19 0.53236492 0.7204063 0.26755610 LATAM - Caribbean
## 20 0.39875298 0.5246042 0.54659348 LATAM - Caribbean
## 21 0.30749973 0.5891186 0.36059898 LATAM - Caribbean
## 22 0.23779669 0.3766995 NA LATAM - Caribbean
## 23 0.30139965 0.3607925 0.18309301 LATAM - Caribbean
## 24 <NA> NA NA LATAM - Caribbean
## 25 0.317 0.5222334 NA LATAM - Caribbean
## 26 0.21459486 0.4125676 NA LATAM - Caribbean
## 27 0.29593622 0.3283283 0.16459743 LATAM - Caribbean
## 28 0.664 0.6458324 0.07263592 LATAM - Caribbean
## 29 <NA> NA NA LATAM - Caribbean
## 30 0.034969528 0.2869475 NA LATAM - Caribbean
## 31 <NA> NA NA LATAM - Caribbean
## 32 0.027725089 0.1739862 NA LATAM - Caribbean
## 33 0.21 0.3400000 NA LATAM - Caribbean
## 34 <NA> NA NA LATAM - Caribbean
## 35 <NA> NA NA LATAM - Caribbean
## 36 <NA> NA NA LATAM - Caribbean
## We can start by looking for a relationship between low birthweight, the
## likelihood of being exclusively breastfed as an infant, and nutritional
## habits as the child gets older, measured here as the likelihood of zero
## vegetable or fruit consumption.

# exclusive_breast is imported as text (it prints with <NA> and an uneven
# number of digits), so convert it to numeric before plotting or modelling.
# Going through as.character() keeps the conversion correct if the column
# happens to be a factor rather than a character vector.
la_nut$exclusive_breast <- as.numeric(as.character(la_nut$exclusive_breast))

# Let's strip out the NAs first. Note that na.omit() drops a country when ANY
# column is NA, including columns not used in the plot or the model below.
la_nut <- na.omit(la_nut)

# Scatter plot of the two variables; built with ggplot() because qplot() is
# deprecated in current ggplot2 releases.
ggplot2::ggplot(
  la_nut,
  ggplot2::aes(x = exclusive_breast, y = zero_veg)
) +
  ggplot2::geom_point() +
  ggplot2::labs(
    title = " Exclusive Infant Breast-feeding v Zero Vegetables Consumed for Older Children ",
    x = "Breast-Feeding",
    y = "No Vegetables"
  )

# Simple linear regression of exclusive breastfeeding on the zero-vegetable
# share; printing the fit shows the call and the estimated coefficients.
nut_score <- lm(exclusive_breast ~ zero_veg, data = la_nut)
nut_score
##
## Call:
## lm(formula = exclusive_breast ~ zero_veg, data = la_nut)
##
## Coefficients:
## (Intercept) zero_veg
## 0.681 -1.269