install.packages("ggplot2")
install.packages("dplyr")
install.packages("tidyverse") # includes ggplot2 and dplyr
install.packages("RefManageR")
install.packages("HistData")
install.packages("ggthemes") # ggplot themes
install.packages("ggrepel")

Libraries

library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.1
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(dslabs)
## Warning: package 'dslabs' was built under R version 3.5.1
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.1
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 3.5.1
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 3.5.1

Data Type Exercises #1

To see which variables are available in the heights dataset, we use the names function.

data(heights)
names (heights)
## [1] "sex"    "height"

To find the total number of observations of a variable, in this case height:

data("heights")
x <- heights$height
length (x)
## [1] 1050

To find the total number of UNIQUE observations of a variable, again height:

data("heights")
x <- heights$height
length(unique(x))
## [1] 139

Now let's obtain the frequency of each observed value with the table function, and count how many values appear only once:

data("heights")
x <- heights$height
tab <- table (x)
sum (tab==1)
## [1] 63

Let's obtain the proportion of Female and Male observations in the sex variable:

data("heights")
x <- heights$sex
prop.table(table(x))
## x
##    Female      Male 
## 0.2266667 0.7733333

A plot can give us a visualization and summarize the data better:

data("heights")
x <- heights$height
hist(x, col = "green", xlab = "Altura", ylab = "Total observaciones")

Let's compute the mean and standard deviation of the height variable for males:

data("heights")
x <- heights$height[heights$sex=="Male"]
desviacion<- sd (x)
media<- mean(x)
c(Promedio=media, DesvStand = desviacion)
##  Promedio DesvStand 
## 69.314755  3.611024
data("heights")
x <- heights$height[heights$sex=="Male"]
mean(x > 69 & x <= 72) # exact proportion of male heights above 69 and up to 72 inches
## [1] 0.3337438

Suppose you only have avg and stdev below, but no access to x, can you approximate the proportion of the data that is between 69 and 72 inches?

data("heights")
x <- heights$height[heights$sex=="Male"]
avg <- mean(x)
stdev <- sd (x)
pnorm(72, avg, stdev) - pnorm (69, avg, stdev)
## [1] 0.3061779

Use normal approximation to estimate the proportion of heights between 79 and 81 inches and save it in an object called approx. Report how many times bigger the actual proportion is compared to the approximation.

data("heights")
x <- heights$height[heights$sex=="Male"]
avg <- mean(x)
stdev <- sd (x)
exact <- mean (x>79 & x<= 81)
approx <- pnorm(81, avg, stdev) - pnorm (79, avg, stdev)
exact/approx
## [1] 1.614261

First, we will estimate the proportion of adult men that are 7 feet tall or taller.

Assume that the heights of adult men in the world are normally distributed with an average of 69 inches and a standard deviation of 3 inches.

1- pnorm (7*12, 69, 3)
## [1] 2.866516e-07

Exercise 5. Estimating the number of seven footers Now we have an approximation for the proportion, call it p, of men that are 7 feet tall or taller.

We know that there are about 1 billion men between the ages of 18 and 40 in the world, the age range for the NBA.

Can we use the normal distribution to estimate how many of these 1 billion men are at least seven feet tall?

Use your answer to the previous exercise to estimate the proportion of men that are seven feet tall or taller in the world and store that value as p. Then round the number of 18-40 year old men who are seven feet tall or taller to the nearest integer. (Do not store this value in an object.)

p <- 1 - pnorm(7*12, 69, 3)
round(p * 10^9)
## [1] 287

Exercise 6. How many seven footers are in the NBA? There are about 10 National Basketball Association (NBA) players that are 7 feet tall or higher. Use your answer to exercise 4 to estimate the proportion of men that are seven feet tall or taller in the world and store that value as p. Use your answer to the previous exercise (exercise 5) to round the number of 18-40 year old men who are seven feet tall or taller to the nearest integer and store that value as N. Then calculate the proportion of the world’s 18 to 40 year old seven footers that are in the NBA. (Do not store this value in an object.)

p <- 1 - pnorm(7*12, 69, 3)
N <- round(p * 10^9)
10/N
## [1] 0.03484321

Exercise 7. Lebron James’ height In the previous exercise we estimated the proportion of seven footers in the NBA using this simple code. Repeat the calculations performed in the previous question for Lebron James’ height: 6 feet 8 inches. There are about 150 players, instead of 10, that are at least that tall in the NBA.

p <- 1 - pnorm(6*12 + 8, 69, 3) # 6 feet 8 inches = 80 inches
N <- round(p * 10^9)
150/N

Exercise 1. Vector lengths When analyzing data it’s often important to know the number of measurements you have for each category.

Define a variable male that contains the male heights. Define a variable female that contains the female heights. Report the length of each variable.

library(dslabs)
data(heights)
male <- heights$height[heights$sex=="Male"]
female <- heights$height[heights$sex=="Female"]

length(male)
## [1] 812
length(female)
## [1] 238

Exercise 2. Percentiles Suppose we can’t make a plot and want to compare the distributions side by side. If the number of data points is large, listing all the numbers is impractical. A more practical approach is to look at the percentiles. We can obtain percentiles using the quantile function like this:
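A minimal sketch of such a call (the original snippet was not included; heights$height is the full height vector):

quantile(heights$height, seq(0.1, 0.9, 0.2)) # 10th, 30th, 50th, 70th and 90th percentiles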

Create two five-element vectors showing the 10th, 30th, 50th, 70th, and 90th percentiles for the heights of each sex; call these vectors female_percentiles and male_percentiles. Then create a data frame called df with these two vectors as columns. The column names should be female and male and should appear in that order. As an example, consider that if you want a data frame to have column names names and grades, in that order, you do it like this:
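For instance, a sketch with made-up vectors (the original snippet was not included):

names <- c("Ana", "Luis")
grades <- c(90, 85)
df <- data.frame(names = names, grades = grades) # columns appear as names, grades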

library(dslabs)
data(heights)
male <- heights$height[heights$sex=="Male"]
female <- heights$height[heights$sex=="Female"]

female_percentiles <- quantile(female, seq(0.1, 0.9, 0.2))
male_percentiles <- quantile(male, seq(0.1, 0.9, 0.2))

df <- data.frame(female = female_percentiles, male = male_percentiles)
df
##       female     male
## 10% 61.00000 65.00000
## 30% 63.00000 68.00000
## 50% 64.98031 69.00000
## 70% 66.46417 71.00000
## 90% 69.00000 73.22751

Exercise 3. Error impact on average In the previous exercises we saw that the mean and median are very similar and so are the standard deviation and MAD. This is expected since the data is approximated by a normal distribution, which has this property.

Now suppose Galton made a mistake when entering the first value, forgetting to use the decimal point. You can imitate this error by typing:

library(HistData)
## Warning: package 'HistData' was built under R version 3.5.1
data(Galton)
x <- Galton$child
x_with_error <- x
x_with_error[1] <- x_with_error[1]*10
mean(x_with_error)- mean(x)
## [1] 0.5983836

Exercise 4. Error impact on SD In the previous exercise we saw how a simple mistake can result in the average of our data increasing more than half a foot, which is a large difference in practical terms. Now let’s explore the effect this outlier has on the standard deviation.

Report how many inches the SD grows after this mistake. Specifically, report the difference between the SD of the data with the mistake x_with_error and the data without the mistake x

x_with_error <- x
x_with_error[1] <- x_with_error[1]*10
sd(x_with_error)- sd(x)
## [1] 15.6746

Exercise 5. Error impact on median In the previous exercises we saw how one mistake can have a substantial effect on the average and the standard deviation.

Now we are going to see how the median and MAD are much more resistant to outliers. For this reason we say that they are robust summaries.

Report how many inches the median grows after the mistake. Specifically, report the difference between the median of the data with the mistake x_with_error and the data without the mistake x.

x_with_error <- x
x_with_error[1] <- x_with_error[1]*10
median(x_with_error)- median(x)
## [1] 0

Exercise 6. Error impact on MAD We saw that the median barely changes. Now let’s see how the MAD is affected.

Report how many inches the MAD grows after the mistake. Specifically, report the difference between the MAD of the data with the mistake x_with_error and the data without the mistake x.

x_with_error <- x
x_with_error[1] <- x_with_error[1]*10
mad(x_with_error)- mad(x)
## [1] 0

Exercise 8. Using EDA to explore changes We have seen how the average can be affected by outliers. But how large can this effect get? This of course depends on the size of the outlier and the size of the dataset.

To see how outliers can affect the average of a dataset, let’s write a simple function that takes the size of the outlier as input and returns the average.

Write a function called error_avg that takes a value k and returns the average of the vector x after the first entry changed to k. Show the results for k=10000 and k=-10000.

x <- Galton$child

error_avg <- function(k){
  x[1] <- k
  mean(x)
}

error_avg(10000)
## [1] 78.79784
error_avg(-10000)
## [1] 57.24612
data("murders")
names(murders)
## [1] "state"      "abb"        "region"     "population" "total"

SECTION 2: GGPLOT2

murders %>%
  ggplot()+
  geom_abline()+
  geom_point(aes(x=population/10^6, y= total, col=region), size = 2)+
  geom_text(aes(population/10^6, total, label=abb), nudge_x = 0.05, size =2)+
  scale_x_log10()+ #scale_x_continuous(trans = "log10")
  scale_y_log10()+ #scale_y_continuous(trans = "log10")
 
  xlab("Populations in Millions") +
  ylab("Total number of murders") +
  ggtitle("US Gun murders in US 2010")+
  theme_economist()

Exercise 1. ggplot2 basics Start by loading the dplyr and ggplot2 libraries as well as the murders data.

What is the class of the object p?

data("murders")
data("heights")
p<- ggplot(murders)
class(p)
## [1] "gg"     "ggplot"

Using pipes, another way to assign a dataset to ggplot2:

murders %>%
  ggplot() 

Exercise 2. Printing Remember that to print an object you can use the command print or simply type the object name. For example, instead of calling print explicitly, you can just type the name:

data("murders")
p <- ggplot(murders)
ggplot()
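Typing the object name renders it just like an explicit print call (a small sketch, assuming p was defined as above):

print(p) # explicit print
p        # typing the name produces the same blank plot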

ANSWER: A blank slate plot

Exercise 3. Pipes Now we are going to review the use of pipes by seeing how they can be used with ggplot

Using the pipe %>%, create an object p associated with the heights dataset instead of with the murders dataset as in previous exercises.

p <- heights %>% 
  ggplot()

Exercise 4. Layers Now we are going to add layers and the corresponding aesthetic mappings. For the murders data, we plotted total murders versus population sizes in the videos.

Explore the murders data frame to remind yourself of the names for the two variables (total murders and population size) we want to plot and select the correct answer.

data("murders")
names(murders)
## [1] "state"      "abb"        "region"     "population" "total"

ANSWER: Population and Total

Exercise 5. geom_point 1 To create a scatter plot, we add a layer with the function geom_point. The aesthetic mappings require us to define the x-axis and y-axis variables respectively.

Remake the plot but flip the axes so that total is on the x-axis and population is on the y-axis.

murders %>% ggplot(aes(x =total , y = population )) +
  geom_point()

Exercise 6. geom_point 2 Note that if we don’t use argument names, we can obtain the same plot by making sure we enter the variable names in the desired order:

Remake the plot but flip the axes so that total is on the x-axis and population is on the y-axis.

murders %>% ggplot(aes(total, population)) +
  geom_point()

Exercise 7. geom_point text If instead of points we want to add text, we can use the geom_text() or geom_label() geometries. However, note that the following code:

murders %>% ggplot(aes(population, total)) + geom_label()

will give us the error message: ANSWER: We need to map a character to each point through the label argument in aes.

Exercise 8. geom_point text You can also add labels to the points on a plot. Rewrite the code from the previous exercise to add the state abbreviation as the label through aes.

murders %>% ggplot(aes(population, total)) +
  geom_point()+
  geom_label(aes(population, total, label=abb))

Exercise 9. geom_point colors Now let’s change the color of the labels to blue. How can we do this?

ANSWER: By using the color argument in geom_label because we want all colors to be blue so we do not need to map colors

Exercise 10. geom_point colors 2 Now let’s go ahead and make the labels blue. We previously wrote this code to add labels to our plot:

murders %>% ggplot(aes(population, total)) +
  geom_point()+
  geom_label(aes(population, total, label=abb),color="blue")

Exercise 11. geom_labels by region Now suppose we want to use color to represent the different regions. So the states from the West will be one color, states from the Northeast another, and so on. In this case, which of the following is most appropriate:

ANSWER: Mapping the colors through the color argument of aes because each label needs a different color

Exercise 12. geom_label colors We previously used this code to make a plot using the state abbreviations as labels: We are now going to add color to represent the region.

murders %>% ggplot(aes(population, total, label = abb, color=region)) +
geom_label(aes(population, total, label=abb))

Exercise 13. Log-scale Now we are going to change the axes to log scales to account for the fact that the population distribution is skewed. Let’s start by defining an object p that holds the plot we have made up to now:

Change both axes to be in the log scale. Make sure you do not redefine p - just add the appropriate layers.

p <- murders %>% ggplot(aes(population, total, label = abb, color = region)) +
  geom_label()
p + scale_x_log10() +
  scale_y_log10()

Exercise 14. Titles In the previous exercises we created a plot using the following code:

data(murders)
p<- murders %>% ggplot(aes(population, total, label = abb, color = region)) +
  geom_label()
p + scale_x_log10() + scale_y_log10()+
  ggtitle("Gun murder data")

Exercise 15. Histograms We are going to shift our focus from the murders dataset to explore the heights dataset.

We use the geom_histogram function to make a histogram of the heights in the heights data frame. When reading the documentation for this function we see that it requires just one mapping, the values to be used for the histogram.

What is the variable containing the heights in inches in the heights data frame?

heights %>%
  names()
## [1] "sex"    "height"

ANSWER: HEIGHT

Exercise 16. A second example We are now going to make a histogram of the heights so we will load the heights dataset. The following code has been pre-run for you to load the heights dataset:

data(heights)
p<- heights %>% 
  ggplot(aes(height))

Exercise 17. Histograms 2 Now we are ready to add a layer to actually make the histogram.

data(heights)
p<- heights %>% 
  ggplot(aes(height))
p +   geom_histogram(aes(height))+
  labs(x= "Height", y="densidad", title="Histograma de heights", subtitle= "Estudiantes en USA", caption= "Autor:RicardoC")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Exercise 18. Histogram binwidth Note that when we run the code from the previous exercise we get the following warning: Use the binwidth argument to change the histogram made in the previous exercise to use bins of size 1 inch.

data(heights)
p<- heights %>% 
  ggplot(aes(height))
p + geom_histogram(binwidth = 1, fill = "red", color = "black")

Exercise 19. Smooth density plot Now instead of a histogram we are going to make a smooth density plot. In this case, we will not make an object p. Instead we will render the plot using a single line of code. In the previous exercise, we could have created a histogram using one line of code like this:
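That one-line histogram would look something like this (a sketch, since the original line was not shown):

heights %>% ggplot(aes(height)) + geom_histogram()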

Now instead of geom_histogram we will use geom_density to create a smooth density plot.

heights %>% 
  ggplot(aes(height)) +
  geom_density()

Exercise 20. Two smooth density plots Now we are going to make density plots for males and females separately. We can do this using the group argument within the aes mapping. Because each point will be assigned to a different density depending on a variable from the dataset, we need to map within aes.

heights %>% 
  ggplot(aes(height, group = sex)) +
  geom_density()

Exercise 21. Two smooth density plots 2 In the previous exercise we made the two density plots, one for each sex, using:

We can also assign groups through the color or fill argument. For example, if you type color = sex ggplot knows you want a different color for each sex. So two densities must be drawn. You can therefore skip the group = sex mapping. Using color has the added benefit that it uses color to distinguish the groups.

heights %>% 
  ggplot(aes(height, color = sex)) +
  geom_density()

Exercise 22. Two smooth density plots 3 We can also assign groups using the fill argument. When using the geom_density geometry, color creates a colored line for the smooth density plot while fill colors in the area under the curve.

However, here the second density is drawn over the other. We can change this by using something called alpha blending.

heights %>% 
  ggplot(aes(height, fill = sex)) +
  geom_density(alpha = 0.2)

Section 3 Overview

heights %>%
filter(sex=="Male") %>%
  summarise(DevSt=sd(height), Media=mean(height))
## Warning: package 'bindrcpp' was built under R version 3.5.1
##      DevSt    Media
## 1 3.611024 69.31475
heights %>%
filter(sex=="Male") %>%
  summarise(Media=mean(height), Minimo=min(height), Maximo=max(height))
##      Media Minimo   Maximo
## 1 69.31475     50 82.67717
murders%>%
  summarise(rate = sum(total)/sum(population)*100000)
##       rate
## 1 3.034555
heights%>%
  group_by(sex)%>%
  summarise(Media=mean(height), Median=median(height), Minimo=min(height), Maximo=max(height))
## # A tibble: 2 x 5
##   sex    Media Median Minimo Maximo
##   <fct>  <dbl>  <dbl>  <dbl>  <dbl>
## 1 Female  64.9   65.0     51   79  
## 2 Male    69.3   69       50   82.7
murders%>%
  group_by(region)%>%
   summarise(sum(population), sum(total), length(state), rate=sum(total)/length(state) )
## # A tibble: 4 x 5
##   region        `sum(population)` `sum(total)` `length(state)`  rate
##   <fct>                     <dbl>        <dbl>           <int> <dbl>
## 1 Northeast              55317240         1469               9  163.
## 2 South                 115674434         4195              17  247.
## 3 North Central          66927001         1828              12  152.
## 4 West                   71945553         1911              13  147
murders%>%
  arrange(desc(total))%>%
  top_n(10)%>%
  group_by(state)%>%
  summarise(sum(total))
## Selecting by total
## # A tibble: 10 x 2
##    state        `sum(total)`
##    <chr>               <dbl>
##  1 California           1257
##  2 Florida               669
##  3 Georgia               376
##  4 Illinois              364
##  5 Louisiana             351
##  6 Michigan              413
##  7 Missouri              321
##  8 New York              517
##  9 Pennsylvania          457
## 10 Texas                 805
heights%>%
  ggplot(aes(height, fill=sex))+
  geom_density(alpha=0.25)+
  labs(x="Alturas Estudiantes", y="Densidad", title = "Histograma de las Alturas", caption = "Autor:RicardoC", subtitle="Maculino y Femenino")

heights %>% 
  ggplot(aes(height, fill = sex)) +
  geom_density(alpha = 0.2)+
  labs(x="Hola", y="chao", caption="autor:RicardoC", subtitle="Masculino", title="titulo2")

gapminder%>%
  head()
##               country year infant_mortality life_expectancy fertility
## 1             Albania 1960           115.40           62.87      6.19
## 2             Algeria 1960           148.20           47.50      7.65
## 3              Angola 1960           208.00           35.98      7.32
## 4 Antigua and Barbuda 1960               NA           62.97      4.43
## 5           Argentina 1960            59.87           65.39      3.11
## 6             Armenia 1960               NA           66.86      4.55
##   population          gdp continent          region
## 1    1636054           NA    Europe Southern Europe
## 2   11124892  13828152297    Africa Northern Africa
## 3    5270844           NA    Africa   Middle Africa
## 4      54681           NA  Americas       Caribbean
## 5   20619075 108322326649  Americas   South America
## 6    1867396           NA      Asia    Western Asia

Practice Exercise. National Center for Health Statistics

install.packages("NHANES")

library(NHANES)
## Warning: package 'NHANES' was built under R version 3.5.1
NHANES%>%
  head()
## # A tibble: 6 x 76
##      ID SurveyYr Gender   Age AgeDecade AgeMonths Race1 Race3 Education
##   <int> <fct>    <fct>  <int> <fct>         <int> <fct> <fct> <fct>    
## 1 51624 2009_10  male      34 " 30-39"        409 White <NA>  High Sch~
## 2 51624 2009_10  male      34 " 30-39"        409 White <NA>  High Sch~
## 3 51624 2009_10  male      34 " 30-39"        409 White <NA>  High Sch~
## 4 51625 2009_10  male       4 " 0-9"           49 Other <NA>  <NA>     
## 5 51630 2009_10  female    49 " 40-49"        596 White <NA>  Some Col~
## 6 51638 2009_10  male       9 " 0-9"          115 White <NA>  <NA>     
## # ... with 67 more variables: MaritalStatus <fct>, HHIncome <fct>,
## #   HHIncomeMid <int>, Poverty <dbl>, HomeRooms <int>, HomeOwn <fct>,
## #   Work <fct>, Weight <dbl>, Length <dbl>, HeadCirc <dbl>, Height <dbl>,
## #   BMI <dbl>, BMICatUnder20yrs <fct>, BMI_WHO <fct>, Pulse <int>,
## #   BPSysAve <int>, BPDiaAve <int>, BPSys1 <int>, BPDia1 <int>,
## #   BPSys2 <int>, BPDia2 <int>, BPSys3 <int>, BPDia3 <int>,
## #   Testosterone <dbl>, DirectChol <dbl>, TotChol <dbl>, UrineVol1 <int>,
## #   UrineFlow1 <dbl>, UrineVol2 <int>, UrineFlow2 <dbl>, Diabetes <fct>,
## #   DiabetesAge <int>, HealthGen <fct>, DaysPhysHlthBad <int>,
## #   DaysMentHlthBad <int>, LittleInterest <fct>, Depressed <fct>,
## #   nPregnancies <int>, nBabies <int>, Age1stBaby <int>,
## #   SleepHrsNight <int>, SleepTrouble <fct>, PhysActive <fct>,
## #   PhysActiveDays <int>, TVHrsDay <fct>, CompHrsDay <fct>,
## #   TVHrsDayChild <int>, CompHrsDayChild <int>, Alcohol12PlusYr <fct>,
## #   AlcoholDay <int>, AlcoholYear <int>, SmokeNow <fct>, Smoke100 <fct>,
## #   Smoke100n <fct>, SmokeAge <int>, Marijuana <fct>, AgeFirstMarij <int>,
## #   RegularMarij <fct>, AgeRegMarij <int>, HardDrugs <fct>, SexEver <fct>,
## #   SexAge <int>, SexNumPartnLife <int>, SexNumPartYear <int>,
## #   SameSex <fct>, SexOrientation <fct>, PregnantNow <fct>

Exercise 1. Blood pressure 1 Let’s explore the NHANES data. We will be exploring blood pressure in this dataset.

First let’s select a group to set the standard. We will use 20-29 year old females. Note that the category is coded with 20-29, with a space in front of the 20! The AgeDecade is a categorical variable with these ages.

To know if someone is female, you can look at the Gender variable.

NHANES%>%
  filter(AgeDecade==" 20-29" & Gender=="female") # NOTE: the category includes a space before the 20
## # A tibble: 681 x 76
##       ID SurveyYr Gender   Age AgeDecade AgeMonths Race1 Race3 Education
##    <int> <fct>    <fct>  <int> <fct>         <int> <fct> <fct> <fct>    
##  1 51710 2009_10  female    26 " 20-29"        319 White <NA>  College ~
##  2 51731 2009_10  female    28 " 20-29"        346 Black <NA>  High Sch~
##  3 51741 2009_10  female    21 " 20-29"        253 Black <NA>  Some Col~
##  4 51741 2009_10  female    21 " 20-29"        253 Black <NA>  Some Col~
##  5 51760 2009_10  female    27 " 20-29"        334 Hisp~ <NA>  9 - 11th~
##  6 51764 2009_10  female    29 " 20-29"        357 White <NA>  College ~
##  7 51764 2009_10  female    29 " 20-29"        357 White <NA>  College ~
##  8 51764 2009_10  female    29 " 20-29"        357 White <NA>  College ~
##  9 51774 2009_10  female    26 " 20-29"        312 White <NA>  8th Grade
## 10 51774 2009_10  female    26 " 20-29"        312 White <NA>  8th Grade
## # ... with 671 more rows, and 67 more variables: MaritalStatus <fct>,
## #   HHIncome <fct>, HHIncomeMid <int>, Poverty <dbl>, HomeRooms <int>,
## #   HomeOwn <fct>, Work <fct>, Weight <dbl>, Length <dbl>, HeadCirc <dbl>,
## #   Height <dbl>, BMI <dbl>, BMICatUnder20yrs <fct>, BMI_WHO <fct>,
## #   Pulse <int>, BPSysAve <int>, BPDiaAve <int>, BPSys1 <int>,
## #   BPDia1 <int>, BPSys2 <int>, BPDia2 <int>, BPSys3 <int>, BPDia3 <int>,
## #   Testosterone <dbl>, DirectChol <dbl>, TotChol <dbl>, UrineVol1 <int>,
## #   UrineFlow1 <dbl>, UrineVol2 <int>, UrineFlow2 <dbl>, Diabetes <fct>,
## #   DiabetesAge <int>, HealthGen <fct>, DaysPhysHlthBad <int>,
## #   DaysMentHlthBad <int>, LittleInterest <fct>, Depressed <fct>,
## #   nPregnancies <int>, nBabies <int>, Age1stBaby <int>,
## #   SleepHrsNight <int>, SleepTrouble <fct>, PhysActive <fct>,
## #   PhysActiveDays <int>, TVHrsDay <fct>, CompHrsDay <fct>,
## #   TVHrsDayChild <int>, CompHrsDayChild <int>, Alcohol12PlusYr <fct>,
## #   AlcoholDay <int>, AlcoholYear <int>, SmokeNow <fct>, Smoke100 <fct>,
## #   Smoke100n <fct>, SmokeAge <int>, Marijuana <fct>, AgeFirstMarij <int>,
## #   RegularMarij <fct>, AgeRegMarij <int>, HardDrugs <fct>, SexEver <fct>,
## #   SexAge <int>, SexNumPartnLife <int>, SexNumPartYear <int>,
## #   SameSex <fct>, SexOrientation <fct>, PregnantNow <fct>

Exercise 2. Blood pressure 2 Now we will compute the average and standard deviation for the subgroup we defined in the previous exercise (20-29 year old females), which we will use as a reference for what is typical.

You will determine the average and standard deviation of systolic blood pressure, which are stored in the BPSysAve variable in the NHANES dataset.

NHANES%>%
  filter(AgeDecade==" 20-29" & Gender=="female")%>%
  summarize(average = mean(BPSysAve, na.rm = TRUE), standard_deviation = sd(BPSysAve, na.rm=TRUE)) # na.rm = TRUE ignores missing values
## # A tibble: 1 x 2
##   average standard_deviation
##     <dbl>              <dbl>
## 1    108.               10.1

Exercise 3. Summarizing averages Now we will repeat the exercise and generate only the average blood pressure for 20-29 year old females. For this exercise, you should review how to use the placeholder . in dplyr.

Modify the line of sample code to assign the average to a numeric variable called ref_avg.

ref_avg <- NHANES %>%
  filter(AgeDecade==" 20-29" & Gender=="female")%>%
  summarize(average = mean(BPSysAve, na.rm = TRUE))%>%
  .$average

I DID NOT UNDERSTAND EXERCISE 3
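One way to read it (my own note, not from the course): summarize returns a one-row data frame, and the .$average step extracts that column as a plain number, so ref_avg ends up numeric rather than a data frame. The dplyr function pull does the same thing; a sketch:

ref_avg <- NHANES %>%
  filter(AgeDecade == " 20-29" & Gender == "female") %>%
  summarize(average = mean(BPSysAve, na.rm = TRUE)) %>%
  pull(average) # equivalent to .$average
class(ref_avg) # "numeric"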

Exercise 4. Min and max Let’s continue practicing by calculating two other data summaries: the minimum and the maximum.

Again we will do it for the BPSysAve variable and the group of 20-29 year old females.

Report the min and max values for the same group as in the previous exercises. Use filter and summarize connected by the pipe %>% again. The functions min and max can be used to get the values you want. Within summarize, save the min and max of systolic blood pressure as min and max.

NHANES%>%
  filter(AgeDecade==" 20-29" & Gender=="female")%>%
  summarize(min = min(BPSysAve, na.rm = TRUE), max = max(BPSysAve, na.rm=TRUE))
## # A tibble: 1 x 2
##     min   max
##   <dbl> <dbl>
## 1    84   179

Exercise 5. group_by Now let’s practice using the group_by function.

What we are about to do is a very common operation in data science: you will split a data table into groups and then compute summary statistics for each group.

We will compute the average and standard deviation of systolic blood pressure for females for each age group separately. Remember that the age groups are contained in AgeDecade

NHANES%>%
  filter(Gender=="female")%>%
  group_by(AgeDecade)%>%
  summarize(average = mean(BPSysAve, na.rm = TRUE), standard_deviation = sd(BPSysAve, na.rm=TRUE))
## # A tibble: 9 x 3
##   AgeDecade average standard_deviation
##   <fct>       <dbl>              <dbl>
## 1 " 0-9"      100.0               9.07
## 2 " 10-19"    104.                9.46
## 3 " 20-29"    108.               10.1 
## 4 " 30-39"    111.               12.3 
## 5 " 40-49"    115.               14.5 
## 6 " 50-59"    122.               16.2 
## 7 " 60-69"    127.               17.1 
## 8 " 70+"      134.               19.8 
## 9 <NA>        142.               22.9

Exercise 6. group_by example 2 Now let’s practice using group_by some more. We are going to repeat the previous exercise of calculating the average and standard deviation of systolic blood pressure, but for males instead of females.

This time we will not provide much sample code. You are on your own!

Calculate the average and standard deviation of systolic blood pressure for males for each age group separately using the same methods as in the previous exercise.

NHANES%>%
  filter(Gender=="male")%>%
  group_by(AgeDecade)%>%
  summarize(average = mean(BPSysAve, na.rm = TRUE), standard_deviation = sd(BPSysAve, na.rm=TRUE))
## # A tibble: 9 x 3
##   AgeDecade average standard_deviation
##   <fct>       <dbl>              <dbl>
## 1 " 0-9"       97.4               8.32
## 2 " 10-19"    110.               11.2 
## 3 " 20-29"    118.               11.3 
## 4 " 30-39"    119.               12.3 
## 5 " 40-49"    121.               14.0 
## 6 " 50-59"    126.               17.8 
## 7 " 60-69"    127.               17.5 
## 8 " 70+"      130.               18.7 
## 9 <NA>        136.               23.5

Exercise 7. group_by example 3 We can actually combine both of these summaries into a single line of code. This is because group_by permits us to group by more than one variable.

We can use group_by(AgeDecade, Gender) to group by both age decades and gender.

NHANES%>%
  group_by(AgeDecade, Gender)%>%
  summarize(average = mean(BPSysAve, na.rm = TRUE), standard_deviation = sd(BPSysAve, na.rm=TRUE))
## # A tibble: 18 x 4
## # Groups:   AgeDecade [?]
##    AgeDecade Gender average standard_deviation
##    <fct>     <fct>    <dbl>              <dbl>
##  1 " 0-9"    female   100.0               9.07
##  2 " 0-9"    male      97.4               8.32
##  3 " 10-19"  female   104.                9.46
##  4 " 10-19"  male     110.               11.2 
##  5 " 20-29"  female   108.               10.1 
##  6 " 20-29"  male     118.               11.3 
##  7 " 30-39"  female   111.               12.3 
##  8 " 30-39"  male     119.               12.3 
##  9 " 40-49"  female   115.               14.5 
## 10 " 40-49"  male     121.               14.0 
## 11 " 50-59"  female   122.               16.2 
## 12 " 50-59"  male     126.               17.8 
## 13 " 60-69"  female   127.               17.1 
## 14 " 60-69"  male     127.               17.5 
## 15 " 70+"    female   134.               19.8 
## 16 " 70+"    male     130.               18.7 
## 17 <NA>      female   142.               22.9 
## 18 <NA>      male     136.               23.5

Exercise 8. Arrange Now we are going to explore differences in systolic blood pressure across races, as reported in the Race1 variable.

We will learn to use the arrange function to order the outcome according to one variable.

Note that this function can be used to order any table by a given outcome. Here is an example that arranges by systolic blood pressure.

NHANES %>% arrange(BPSysAve)

If we want it in descending order we can use the desc function like this:

NHANES %>% arrange(desc(BPSysAve))

NHANES%>%
 filter(Gender=="male" & AgeDecade==" 40-49")%>%
  group_by(Race1)%>%
  summarize(average = mean(BPSysAve, na.rm = TRUE), standard_deviation = sd(BPSysAve, na.rm=TRUE))%>%
  arrange(average)
## # A tibble: 5 x 3
##   Race1    average standard_deviation
##   <fct>      <dbl>              <dbl>
## 1 White       120.               13.4
## 2 Other       120.               16.2
## 3 Hispanic    122.               11.1
## 4 Mexican     122.               13.9
## 5 Black       126.               17.1

GAPMINDER

install.packages("dslabs")

library(dslabs)
gapminder%>%
  names()
## [1] "country"          "year"             "infant_mortality"
## [4] "life_expectancy"  "fertility"        "population"      
## [7] "gdp"              "continent"        "region"
gapminder%>%
  head(20)
##                country year infant_mortality life_expectancy fertility
## 1              Albania 1960           115.40           62.87      6.19
## 2              Algeria 1960           148.20           47.50      7.65
## 3               Angola 1960           208.00           35.98      7.32
## 4  Antigua and Barbuda 1960               NA           62.97      4.43
## 5            Argentina 1960            59.87           65.39      3.11
## 6              Armenia 1960               NA           66.86      4.55
## 7                Aruba 1960               NA           65.66      4.82
## 8            Australia 1960            20.30           70.87      3.45
## 9              Austria 1960            37.30           68.75      2.70
## 10          Azerbaijan 1960               NA           61.33      5.57
## 11             Bahamas 1960            51.00           62.00      4.50
## 12             Bahrain 1960           134.50           51.64      7.09
## 13          Bangladesh 1960           176.30           46.20      6.73
## 14            Barbados 1960            69.50           61.80      4.33
## 15             Belarus 1960               NA           71.59      2.74
## 16             Belgium 1960            29.50           69.59      2.60
## 17              Belize 1960               NA           60.08      6.50
## 18               Benin 1960           186.90           38.29      6.28
## 19              Bhutan 1960           175.00           35.94      6.67
## 20             Bolivia 1960           173.40           43.77      6.70
##    population          gdp continent                    region
## 1     1636054           NA    Europe           Southern Europe
## 2    11124892  13828152297    Africa           Northern Africa
## 3     5270844           NA    Africa             Middle Africa
## 4       54681           NA  Americas                 Caribbean
## 5    20619075 108322326649  Americas             South America
## 6     1867396           NA      Asia              Western Asia
## 7       54208           NA  Americas                 Caribbean
## 8    10292328  96677859364   Oceania Australia and New Zealand
## 9     7065525  52392699681    Europe            Western Europe
## 10    3897889           NA      Asia              Western Asia
## 11     109526   1306269490  Americas                 Caribbean
## 12     162501           NA      Asia              Western Asia
## 13   48200702  12767231590      Asia             Southern Asia
## 14     230934    784120376  Americas                 Caribbean
## 15    8190027           NA    Europe            Eastern Europe
## 16    9140563  68236665814    Europe            Western Europe
## 17      92068     86532304  Americas           Central America
## 18    2431620    621797131    Africa            Western Africa
## 19     224108           NA      Asia             Southern Asia
## 20    3693451   3001815692  Americas             South America
gapminder%>%
  filter(year==1962)%>%
  ggplot(aes(fertility, life_expectancy, color=continent))+
  geom_point()+
  theme_fivethirtyeight()+
  labs(x="FERTILIDAD", y="EXPECTATIVA_de_VIDA", title="GAPMINDER DATA POR CONTINENTE", caption="Autor:RicardoC", subtitle="Año 1962")

periodo <- c(1962, 2012)
continentes <- c("Asia", "Europe")
gapminder%>%
    filter(year%in%periodo, continent%in%continentes)%>%
  ggplot(aes(fertility, life_expectancy, color=continent))+
  geom_point()+
  facet_wrap(.~ year)+
  labs(x="FERTILIDAD", y="EXPECTATIVA_de_VIDA", title="GAPMINDER DATA POR CONTINENTE", caption="Autor:RicardoC", subtitle="Año 1962")

countries <- c("South Korea", "Germany")
gapminder%>%
    filter(country%in%countries)%>%
  ggplot(aes(year, fertility, color=country))+
  geom_line()+  #geom_point()
    labs(x="YEARS", y="FERTILITY", title="GAPMINDER DATA POR PAIS: SOUTH KOREA VS GERMANY", caption="Autor:RicardoC", subtitle="Evolucion por año")
## Warning: Removed 2 rows containing missing values (geom_path).

countries<- c("United States", "Germany")
  gapminder%>%
  filter(country%in%countries)%>%
  ggplot(aes(year, fertility, col=country))+
  geom_line()+
  labs(x="FERTILIDAD", y="EXPECTATIVA_de_VIDA", title="GAPMINDER DATA POR CONTINENTE", caption="Autor:RicardoC", subtitle="Año 1962")
## Warning: Removed 2 rows containing missing values (geom_path).

gapminder%>%
  names()
## [1] "country"          "year"             "infant_mortality"
## [4] "life_expectancy"  "fertility"        "population"      
## [7] "gdp"              "continent"        "region"
gapminder%>%
  filter(year== 2010 & !is.na(gdp)) %>%
  ggplot(aes(region, gdp/365/population, fill = continent))+
  geom_boxplot()+
    theme(axis.text.x =element_text(angle = 90, hjust = 1))+
  scale_y_continuous(trans = "log2")+
  geom_point(show.legend = FALSE) # emphasizes the individual points in the plot