install.packages("ggplot2")
install.packages("dplyr")
install.packages("tidyverse") # includes ggplot2 and dplyr
install.packages("RefManageR")
install.packages("HistData")
install.packages("ggthemes") # ggplot themes
install.packages("ggrepel")
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.1
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(dslabs)
## Warning: package 'dslabs' was built under R version 3.5.1
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.1
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 3.5.1
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 3.5.1
To see which variables are available in the heights dataset, we use the names function.
data(heights)
names(heights)
## [1] "sex" "height"
To find the total number of observations of a variable, in this case the height variable:
data("heights")
x <- heights$height
length(x)
## [1] 1050
To find the number of UNIQUE observations of a variable, in this case the height variable:
data("heights")
x <- heights$height
length(unique(x))
## [1] 139
Let's obtain the frequency of each observation using the table function:
data("heights")
x <- heights$height
tab <- table(x)
sum(tab == 1)
## [1] 63
Let's obtain the proportion of Female and Male observations in the sex variable:
data("heights")
x <- heights$sex
prop.table(table(x))
## x
## Female Male
## 0.2266667 0.7733333
A plot can give us a visualization and summarize the data better:
data("heights")
x <- heights$height
hist(x, col = "green", xlab = "Altura", ylab = "Total observaciones")
Let's compute the mean and standard deviation of the height variable (for males):
data("heights")
x <- heights$height[heights$sex=="Male"]
desviacion <- sd(x)
media<- mean(x)
c(Promedio=media, DesvStand = desviacion)
## Promedio DesvStand
## 69.314755 3.611024
data("heights")
x <- heights$height[heights$sex=="Male"]
mean(x>69 & x <=72)
## [1] 0.3337438
Suppose you only have avg and stdev below, but no access to x, can you approximate the proportion of the data that is between 69 and 72 inches?
data("heights")
x <- heights$height[heights$sex=="Male"]
avg <- mean(x)
stdev <- sd(x)
pnorm(72, avg, stdev) - pnorm(69, avg, stdev)
## [1] 0.3061779
Use normal approximation to estimate the proportion of heights between 79 and 81 inches and save it in an object called approx. Report how many times bigger the actual proportion is compared to the approximation.
data("heights")
x <- heights$height[heights$sex=="Male"]
avg <- mean(x)
stdev <- sd(x)
exact <- mean(x > 79 & x <= 81)
approx <- pnorm(81, avg, stdev) - pnorm(79, avg, stdev)
exact/approx
## [1] 1.614261
First, we will estimate the proportion of adult men that are 7 feet tall or taller.
Assume that the distribution of adult men in the world as normally distributed with an average of 69 inches and a standard deviation of 3 inches.
1 - pnorm(7*12, 69, 3)
## [1] 2.866516e-07
Exercise 5. Estimating the number of seven footers Now we have an approximation for the proportion, call it p, of men that are 7 feet tall or taller.
We know that there are about 1 billion men between the ages of 18 and 40 in the world, the age range for the NBA.
Can we use the normal distribution to estimate how many of these 1 billion men are at least seven feet tall?
Use your answer to the previous exercise to estimate the proportion of men that are seven feet tall or taller in the world and store that value as p. Then round the number of 18-40 year old men who are seven feet tall or taller to the nearest integer. (Do not store this value in an object.)
p <- 1 - pnorm(7*12, 69, 3)
round(p * 10^9)
## [1] 287
Exercise 6. How many seven footers are in the NBA? There are about 10 National Basketball Association (NBA) players that are 7 feet tall or higher. Use your answer to exercise 4 to estimate the proportion of men that are seven feet tall or taller in the world and store that value as p. Use your answer to the previous exercise (exercise 5) to round the number of 18-40 year old men who are seven feet tall or taller to the nearest integer and store that value as N. Then calculate the proportion of the world’s 18 to 40 year old seven footers that are in the NBA. (Do not store this value in an object.)
p <- 1 - pnorm(7*12, 69, 3)
N <- round(p * 10^9)
10/N
## [1] 0.03484321
Exercise 7. Lebron James' height In the previous exercise we estimated the proportion of seven footers in the NBA. Repeat the calculations performed in the previous question for Lebron James' height: 6 feet 8 inches. There are about 150 players, instead of 10, that are at least that tall in the NBA.
p <- 1 - pnorm(6*12 + 8, 69, 3)
N <- round(p * 10^9)
150/N
Exercise 1. Vector lengths When analyzing data it’s often important to know the number of measurements you have for each category.
Define a variable male that contains the male heights. Define a variable female that contains the female heights. Report the length of each variable.
library(dslabs)
data(heights)
male <- heights$height[heights$sex=="Male"]
female <- heights$height[heights$sex=="Female"]
length(male)
## [1] 812
length(female)
## [1] 238
Exercise 2. Percentiles Suppose we can't make a plot and want to compare the distributions side by side. If the number of data points is large, listing all the numbers is impractical. A more practical approach is to look at the percentiles. We can obtain percentiles using the quantile function like this:
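For instance, a minimal sketch (the probability vector below simply picks the percentiles requested in this exercise; the name probs is illustrative):
probs <- seq(0.1, 0.9, 0.2) # 10th, 30th, 50th, 70th and 90th percentiles
quantile(heights$height, probs)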
Create two five row vectors showing the 10th, 30th, 50th, 70th, and 90th percentiles for the heights of each sex; call these vectors female_percentiles and male_percentiles. Then create a data frame called df with these two vectors as columns. The column names should be female and male and should appear in that order. As an example consider that if you want a data frame to have column names names and grades, in that order, you do it like this:
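For example, a sketch with made-up values (the names and grades below are purely illustrative):
df <- data.frame(names = c("John", "Mary"), grades = c("B", "A"))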
library(dslabs)
data(heights)
male <- heights$height[heights$sex=="Male"]
female <- heights$height[heights$sex=="Female"]
female_percentiles <- quantile(female, seq(0.1, 0.9, 0.2))
male_percentiles <- quantile(male, seq(0.1, 0.9, 0.2))
df <- data.frame(female = female_percentiles, male = male_percentiles)
df
## female male
## 10% 61.00000 65.00000
## 30% 63.00000 68.00000
## 50% 64.98031 69.00000
## 70% 66.46417 71.00000
## 90% 69.00000 73.22751
Exercise 3. Error impact on average In the previous exercises we saw that the mean and median are very similar and so are the standard deviation and MAD. This is expected since the data is approximated by a normal distribution, which has this property.
Now suppose Galton made a mistake when entering the first value, forgetting to use the decimal point. You can imitate this error by typing:
library(HistData)
## Warning: package 'HistData' was built under R version 3.5.1
data(Galton)
x <- Galton$child
x_with_error <- x
x_with_error[1] <- x_with_error[1]*10
mean(x_with_error)- mean(x)
## [1] 0.5983836
Exercise 4. Error impact on SD In the previous exercise we saw how a simple mistake can result in the average of our data increasing more than half a foot, which is a large difference in practical terms. Now let’s explore the effect this outlier has on the standard deviation.
Report how many inches the SD grows after this mistake. Specifically, report the difference between the SD of the data with the mistake x_with_error and the data without the mistake x
x_with_error <- x
x_with_error[1] <- x_with_error[1]*10
sd(x_with_error)- sd(x)
## [1] 15.6746
Exercise 5. Error impact on median In the previous exercises we saw how one mistake can have a substantial effect on the average and the standard deviation.
Now we are going to see how the median and MAD are much more resistant to outliers. For this reason we say that they are robust summaries.
Report how many inches the median grows after the mistake. Specifically, report the difference between the median of the data with the mistake x_with_error and the data without the mistake x.
x_with_error <- x
x_with_error[1] <- x_with_error[1]*10
median(x_with_error)- median(x)
## [1] 0
Exercise 6. Error impact on MAD We saw that the median barely changes. Now let’s see how the MAD is affected.
Report how many inches the MAD grows after the mistake. Specifically, report the difference between the MAD of the data with the mistake x_with_error and the data without the mistake x.
x_with_error <- x
x_with_error[1] <- x_with_error[1]*10
mad(x_with_error)- mad(x)
## [1] 0
Exercise 8. Using EDA to explore changes We have seen how the average can be affected by outliers. But how large can this effect get? This of course depends on the size of the outlier and the size of the dataset.
To see how outliers can affect the average of a dataset, let’s write a simple function that takes the size of the outlier as input and returns the average.
Write a function called error_avg that takes a value k and returns the average of the vector x after the first entry changed to k. Show the results for k=10000 and k=-10000.
x <- Galton$child
error_avg <- function(k){
x[1] <- k
mean(x)
}
error_avg(10000)
## [1] 78.79784
error_avg(-10000)
## [1] 57.24612
data("murders")
names(murders)
## [1] "state" "abb" "region" "population" "total"
SECTION 2: GGPLOT2
murders %>%
ggplot()+
geom_abline()+
geom_point(aes(x=population/10^6, y= total, col=region), size = 2)+
geom_text(aes(population/10^6, total, label=abb), nudge_x = 0.05, size =2)+
scale_x_log10()+ #scale_x_continuous(trans = "log10")
scale_y_log10()+ #scale_y_continuous(trans = "log10")
xlab("Populations in Millions") +
ylab("Total number of murders") +
ggtitle("US Gun murders in US 2010")+
theme_economist()
Exercise 1. ggplot2 basics Start by loading the dplyr and ggplot2 libraries as well as the murders data.
What is the class of the object p?
data("murders")
data("heights")
p<- ggplot(murders)
class(p)
## [1] "gg" "ggplot"
Using pipes is another way to assign a dataset to ggplot2:
murders %>%
ggplot()
Exercise 2. Printing Remember that to print an object you can use the command print or simply type the object. For example, instead of print(p) we can simply type p. Define p as below and print it. What do we see?
data("murders")
p <- ggplot(murders)
print(p) # or simply type: p
ANSWER: A blank slate plot
Exercise 3. Pipes Now we are going to review the use of pipes by seeing how they can be used with ggplot
Using the pipe %>%, create an object p associated with the heights dataset instead of with the murders dataset as in previous exercises.
p <- heights %>%
ggplot()
Exercise 4. Layers Now we are going to add layers and the corresponding aesthetic mappings. For the murders data, we plotted total murders versus population sizes in the videos.
Explore the murders data frame to remind yourself of the names for the two variables (total murders and population size) we want to plot and select the correct answer.
data("murders")
names(murders)
## [1] "state" "abb" "region" "population" "total"
ANSWER: population and total
Exercise 5. geom_point 1 To create a scatter plot, we add a layer with the function geom_point. The aesthetic mappings require us to define the x-axis and y-axis variables respectively.
Remake the plot but flip the axes so that total is on the x-axis and population is on the y-axis.
murders %>% ggplot(aes(x =total , y = population )) +
geom_point()
Exercise 6. geom_point 2 Note that if we don’t use argument names, we can obtain the same plot by making sure we enter the variable names in the desired order:
Remake the plot but flip the axes so that total is on the x-axis and population is on the y-axis.
murders %>% ggplot(aes(total, population)) +
geom_point()
Exercise 7. geom_point text If instead of points we want to add text, we can use the geom_text() or geom_label() geometries. However, note that the following code:
murders %>% ggplot(aes(population, total)) + geom_label()
will give us an error message. ANSWER: We need to map a character to each point through the label argument in aes.
Exercise 8. geom_point text You can also add labels to the points on a plot. Rewrite the code from the previous exercise to add the state abbreviation as the label through aes.
murders %>% ggplot(aes(population, total)) +
geom_point()+
geom_label(aes(population, total, label=abb))
Exercise 9. geom_point colors Now let’s change the color of the labels to blue. How can we do this?
ANSWER: By using the color argument in geom_label because we want all colors to be blue so we do not need to map colors
Exercise 10. geom_point colors 2 Now let’s go ahead and make the labels blue. We previously wrote this code to add labels to our plot:
murders %>% ggplot(aes(population, total)) +
geom_point()+
geom_label(aes(population, total, label=abb),color="blue")
Exercise 11. geom_labels by region Now suppose we want to use color to represent the different regions. So the states from the West will be one color, states from the Northeast another, and so on. In this case, which of the following is most appropriate:
ANSWER: Mapping the colors through the color argument of aes because each label needs a different color
Exercise 12. geom_label colors We previously used this code to make a plot using the state abbreviations as labels: We are now going to add color to represent the region.
murders %>% ggplot(aes(population, total, label = abb, color = region)) +
geom_label()
Exercise 13. Log-scale Now we are going to change the axes to log scales to account for the fact that the population distribution is skewed. Let’s start by defining an object p that holds the plot we have made up to now:
Change both axes to be in the log scale. Make sure you do not redefine p - just add the appropriate layers.
p <- murders %>% ggplot(aes(population, total, label = abb, color = region)) +
geom_label()
p+scale_x_log10()+
scale_y_log10()
Exercise 14. Titles In the previous exercises we created a plot using the following code:
data(murders)
p<- murders %>% ggplot(aes(population, total, label = abb, color = region)) +
geom_label()
p + scale_x_log10() + scale_y_log10()+
ggtitle("Gun murder data")
Exercise 15. Histograms We are going to shift our focus from the murders dataset to explore the heights dataset.
We use the geom_histogram function to make a histogram of the heights in the heights data frame. When reading the documentation for this function we see that it requires just one mapping, the values to be used for the histogram.
What is the variable containing the heights in inches in the heights data frame?
heights %>%
names()
## [1] "sex" "height"
ANSWER: height
Exercise 16. A second example We are now going to make a histogram of the heights so we will load the heights dataset. The following code has been pre-run for you to load the heights dataset:
data(heights)
p<- heights %>%
ggplot(aes(height))
Exercise 17. Histograms 2 Now we are ready to add a layer to actually make the histogram.
data(heights)
p<- heights %>%
ggplot(aes(height))
p + geom_histogram() +
labs(x= "Height", y="densidad", title="Histograma de heights", subtitle= "Estudiantes en USA", caption= "Autor:RicardoC")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Exercise 18. Histogram binwidth Note that when we run the code from the previous exercise we get the warning shown above about the default number of bins. Use the binwidth argument to change the histogram made in the previous exercise to use bins of size 1 inch.
data(heights)
p<- heights %>%
ggplot(aes(height))
p + geom_histogram(aes(height, fill="red"), binwidth = 1, color = "black")
Exercise 19. Smooth density plot Now instead of a histogram we are going to make a smooth density plot. In this case, we will not make an object p. Instead we will render the plot using a single line of code. In the previous exercise, we could have created a histogram using one line of code like this:
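For reference, that one-liner could look like the sketch below (using a 1-inch binwidth, as in the previous exercise):
heights %>% ggplot(aes(height)) + geom_histogram(binwidth = 1)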
Now instead of geom_histogram we will use geom_density to create a smooth density plot.
heights %>%
ggplot(aes(height)) +
geom_density()
Exercise 20. Two smooth density plots Now we are going to make density plots for males and females separately. We can do this using the group argument within the aes mapping. Because each point will be assigned to a different density depending on a variable from the dataset, we need to map within aes.
heights %>%
ggplot(aes(height, group = sex)) +
geom_density()
Exercise 21. Two smooth density plots 2 In the previous exercise we made the two density plots, one for each sex, using:
We can also assign groups through the color or fill argument. For example, if you type color = sex ggplot knows you want a different color for each sex. So two densities must be drawn. You can therefore skip the group = sex mapping. Using color has the added benefit that it uses color to distinguish the groups.
heights %>%
ggplot(aes(height, color = sex)) +
geom_density()
Exercise 22. Two smooth density plots 3 We can also assign groups using the fill argument. When using the geom_density geometry, color creates a colored line for the smooth density plot while fill colors in the area under the curve.
However, here the second density is drawn over the other. We can change this by using something called alpha blending.
heights %>%
ggplot(aes(height, fill = sex)) +
geom_density(alpha = 0.2)
Section 3 Overview
heights %>%
filter(sex=="Male") %>%
summarise(DevSt=sd(height), Media=mean(height))
## Warning: package 'bindrcpp' was built under R version 3.5.1
## DevSt Media
## 1 3.611024 69.31475
heights %>%
filter(sex=="Male") %>%
summarise(Media=mean(height), Minimo=min(height), Maximo=max(height))
## Media Minimo Maximo
## 1 69.31475 50 82.67717
murders%>%
summarise(rate = sum(total)/sum(population)*100000)
## rate
## 1 3.034555
heights%>%
group_by(sex)%>%
summarise(Media=mean(height), Median=median(height), Minimo=min(height), Maximo=max(height))
## # A tibble: 2 x 5
## sex Media Median Minimo Maximo
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Female 64.9 65.0 51 79
## 2 Male 69.3 69 50 82.7
murders%>%
group_by(region)%>%
summarise(sum(population), sum(total), length(state), rate=sum(total)/length(state) )
## # A tibble: 4 x 5
## region `sum(population)` `sum(total)` `length(state)` rate
## <fct> <dbl> <dbl> <int> <dbl>
## 1 Northeast 55317240 1469 9 163.
## 2 South 115674434 4195 17 247.
## 3 North Central 66927001 1828 12 152.
## 4 West 71945553 1911 13 147
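Note that the rate column above is murders per state, not a population-adjusted rate. A murder rate per 100,000 inhabitants by region, following the formula used earlier for the national rate, could be computed with a sketch like this:
murders %>%
group_by(region) %>%
summarise(rate = sum(total)/sum(population)*100000)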
murders%>%
arrange(desc(total))%>%
top_n(10)%>%
group_by(state)%>%
summarise(sum(total))
## Selecting by total
## # A tibble: 10 x 2
## state `sum(total)`
## <chr> <dbl>
## 1 California 1257
## 2 Florida 669
## 3 Georgia 376
## 4 Illinois 364
## 5 Louisiana 351
## 6 Michigan 413
## 7 Missouri 321
## 8 New York 517
## 9 Pennsylvania 457
## 10 Texas 805
heights%>%
ggplot(aes(height, fill=sex))+
geom_density(alpha=0.25)+
labs(x="Alturas Estudiantes", y="Densidad", title = "Histograma de las Alturas", caption = "Autor:RicardoC", subtitle="Maculino y Femenino")
heights %>%
ggplot(aes(height, fill = sex)) +
geom_density(alpha = 0.2)+
labs(x="Hola", y="chao", caption="autor:RicardoC", subtitle="Masculino", title="titulo2")
gapminder%>%
head()
## country year infant_mortality life_expectancy fertility
## 1 Albania 1960 115.40 62.87 6.19
## 2 Algeria 1960 148.20 47.50 7.65
## 3 Angola 1960 208.00 35.98 7.32
## 4 Antigua and Barbuda 1960 NA 62.97 4.43
## 5 Argentina 1960 59.87 65.39 3.11
## 6 Armenia 1960 NA 66.86 4.55
## population gdp continent region
## 1 1636054 NA Europe Southern Europe
## 2 11124892 13828152297 Africa Northern Africa
## 3 5270844 NA Africa Middle Africa
## 4 54681 NA Americas Caribbean
## 5 20619075 108322326649 Americas South America
## 6 1867396 NA Asia Western Asia
Practice Exercise. National Center for Health Statistics
install.packages("NHANES")
library(NHANES)
## Warning: package 'NHANES' was built under R version 3.5.1
NHANES%>%
head()
## # A tibble: 6 x 76
## ID SurveyYr Gender Age AgeDecade AgeMonths Race1 Race3 Education
## <int> <fct> <fct> <int> <fct> <int> <fct> <fct> <fct>
## 1 51624 2009_10 male 34 " 30-39" 409 White <NA> High Sch~
## 2 51624 2009_10 male 34 " 30-39" 409 White <NA> High Sch~
## 3 51624 2009_10 male 34 " 30-39" 409 White <NA> High Sch~
## 4 51625 2009_10 male 4 " 0-9" 49 Other <NA> <NA>
## 5 51630 2009_10 female 49 " 40-49" 596 White <NA> Some Col~
## 6 51638 2009_10 male 9 " 0-9" 115 White <NA> <NA>
## # ... with 67 more variables: MaritalStatus <fct>, HHIncome <fct>,
## # HHIncomeMid <int>, Poverty <dbl>, HomeRooms <int>, HomeOwn <fct>,
## # Work <fct>, Weight <dbl>, Length <dbl>, HeadCirc <dbl>, Height <dbl>,
## # BMI <dbl>, BMICatUnder20yrs <fct>, BMI_WHO <fct>, Pulse <int>,
## # BPSysAve <int>, BPDiaAve <int>, BPSys1 <int>, BPDia1 <int>,
## # BPSys2 <int>, BPDia2 <int>, BPSys3 <int>, BPDia3 <int>,
## # Testosterone <dbl>, DirectChol <dbl>, TotChol <dbl>, UrineVol1 <int>,
## # UrineFlow1 <dbl>, UrineVol2 <int>, UrineFlow2 <dbl>, Diabetes <fct>,
## # DiabetesAge <int>, HealthGen <fct>, DaysPhysHlthBad <int>,
## # DaysMentHlthBad <int>, LittleInterest <fct>, Depressed <fct>,
## # nPregnancies <int>, nBabies <int>, Age1stBaby <int>,
## # SleepHrsNight <int>, SleepTrouble <fct>, PhysActive <fct>,
## # PhysActiveDays <int>, TVHrsDay <fct>, CompHrsDay <fct>,
## # TVHrsDayChild <int>, CompHrsDayChild <int>, Alcohol12PlusYr <fct>,
## # AlcoholDay <int>, AlcoholYear <int>, SmokeNow <fct>, Smoke100 <fct>,
## # Smoke100n <fct>, SmokeAge <int>, Marijuana <fct>, AgeFirstMarij <int>,
## # RegularMarij <fct>, AgeRegMarij <int>, HardDrugs <fct>, SexEver <fct>,
## # SexAge <int>, SexNumPartnLife <int>, SexNumPartYear <int>,
## # SameSex <fct>, SexOrientation <fct>, PregnantNow <fct>
Exercise 1. Blood pressure 1 Let’s explore the NHANES data. We will be exploring blood pressure in this dataset.
First let’s select a group to set the standard. We will use 20-29 year old females. Note that the category is coded with 20-29, with a space in front of the 20! The AgeDecade is a categorical variable with these ages.
To know if someone is female, you can look at the Gender variable.
NHANES%>%
filter(AgeDecade==" 20-29" & Gender=="female") #OJO INCLUYE SPACE ESPACIO ANTES20
## # A tibble: 681 x 76
## ID SurveyYr Gender Age AgeDecade AgeMonths Race1 Race3 Education
## <int> <fct> <fct> <int> <fct> <int> <fct> <fct> <fct>
## 1 51710 2009_10 female 26 " 20-29" 319 White <NA> College ~
## 2 51731 2009_10 female 28 " 20-29" 346 Black <NA> High Sch~
## 3 51741 2009_10 female 21 " 20-29" 253 Black <NA> Some Col~
## 4 51741 2009_10 female 21 " 20-29" 253 Black <NA> Some Col~
## 5 51760 2009_10 female 27 " 20-29" 334 Hisp~ <NA> 9 - 11th~
## 6 51764 2009_10 female 29 " 20-29" 357 White <NA> College ~
## 7 51764 2009_10 female 29 " 20-29" 357 White <NA> College ~
## 8 51764 2009_10 female 29 " 20-29" 357 White <NA> College ~
## 9 51774 2009_10 female 26 " 20-29" 312 White <NA> 8th Grade
## 10 51774 2009_10 female 26 " 20-29" 312 White <NA> 8th Grade
## # ... with 671 more rows, and 67 more variables: MaritalStatus <fct>,
## # HHIncome <fct>, HHIncomeMid <int>, Poverty <dbl>, HomeRooms <int>,
## # HomeOwn <fct>, Work <fct>, Weight <dbl>, Length <dbl>, HeadCirc <dbl>,
## # Height <dbl>, BMI <dbl>, BMICatUnder20yrs <fct>, BMI_WHO <fct>,
## # Pulse <int>, BPSysAve <int>, BPDiaAve <int>, BPSys1 <int>,
## # BPDia1 <int>, BPSys2 <int>, BPDia2 <int>, BPSys3 <int>, BPDia3 <int>,
## # Testosterone <dbl>, DirectChol <dbl>, TotChol <dbl>, UrineVol1 <int>,
## # UrineFlow1 <dbl>, UrineVol2 <int>, UrineFlow2 <dbl>, Diabetes <fct>,
## # DiabetesAge <int>, HealthGen <fct>, DaysPhysHlthBad <int>,
## # DaysMentHlthBad <int>, LittleInterest <fct>, Depressed <fct>,
## # nPregnancies <int>, nBabies <int>, Age1stBaby <int>,
## # SleepHrsNight <int>, SleepTrouble <fct>, PhysActive <fct>,
## # PhysActiveDays <int>, TVHrsDay <fct>, CompHrsDay <fct>,
## # TVHrsDayChild <int>, CompHrsDayChild <int>, Alcohol12PlusYr <fct>,
## # AlcoholDay <int>, AlcoholYear <int>, SmokeNow <fct>, Smoke100 <fct>,
## # Smoke100n <fct>, SmokeAge <int>, Marijuana <fct>, AgeFirstMarij <int>,
## # RegularMarij <fct>, AgeRegMarij <int>, HardDrugs <fct>, SexEver <fct>,
## # SexAge <int>, SexNumPartnLife <int>, SexNumPartYear <int>,
## # SameSex <fct>, SexOrientation <fct>, PregnantNow <fct>
Exercise 2. Blood pressure 2 Now we will compute the average and standard deviation for the subgroup we defined in the previous exercise (20-29 year old females), which we will use as a reference for what is typical.
You will determine the average and standard deviation of systolic blood pressure, which are stored in the BPSysAve variable in the NHANES dataset.
NHANES%>%
filter(AgeDecade==" 20-29" & Gender=="female")%>%
summarize(average = mean(BPSysAve, na.rm = TRUE), standard_deviation = sd(BPSysAve, na.rm=TRUE)) # na.rm = TRUE ignores missing values
## # A tibble: 1 x 2
## average standard_deviation
## <dbl> <dbl>
## 1 108. 10.1
Exercise 3. Summarizing averages Now we will repeat the exercise and generate only the average blood pressure for 20-29 year old females. For this exercise, you should review how to use the placeholder . in dplyr.
Modify the line of sample code to assign the average to a numeric variable called ref_avg.
ref_avg <- NHANES %>%
filter(AgeDecade==" 20-29" & Gender=="female")%>%
summarize(average = mean(BPSysAve, na.rm = TRUE))%>%
.$average
Exercise 4. Min and max Let’s continue practicing by calculating two other data summaries: the minimum and the maximum.
Again we will do it for the BPSysAve variable and the group of 20-29 year old females.
Report the min and max values for the same group as in the previous exercises. Use filter and summarize connected by the pipe %>% again. The functions min and max can be used to get the values you want. Within summarize, save the min and max of systolic blood pressure as min and max.
NHANES%>%
filter(AgeDecade==" 20-29" & Gender=="female")%>%
summarize(min = min(BPSysAve, na.rm = TRUE), max = max(BPSysAve, na.rm=TRUE))
## # A tibble: 1 x 2
## min max
## <dbl> <dbl>
## 1 84 179
Exercise 5. group_by Now let’s practice using the group_by function.
What we are about to do is a very common operation in data science: you will split a data table into groups and then compute summary statistics for each group.
We will compute the average and standard deviation of systolic blood pressure for females for each age group separately. Remember that the age groups are contained in AgeDecade
NHANES%>%
filter(Gender=="female")%>%
group_by(AgeDecade)%>%
summarize(average = mean(BPSysAve, na.rm = TRUE), standard_deviation = sd(BPSysAve, na.rm=TRUE))
## # A tibble: 9 x 3
## AgeDecade average standard_deviation
## <fct> <dbl> <dbl>
## 1 " 0-9" 100.0 9.07
## 2 " 10-19" 104. 9.46
## 3 " 20-29" 108. 10.1
## 4 " 30-39" 111. 12.3
## 5 " 40-49" 115. 14.5
## 6 " 50-59" 122. 16.2
## 7 " 60-69" 127. 17.1
## 8 " 70+" 134. 19.8
## 9 <NA> 142. 22.9
Exercise 6. group_by example 2 Now let’s practice using group_by some more. We are going to repeat the previous exercise of calculating the average and standard deviation of systolic blood pressure, but for males instead of females.
This time we will not provide much sample code. You are on your own!
Calculate the average and standard deviation of systolic blood pressure for males for each age group separately using the same methods as in the previous exercise.
NHANES%>%
filter(Gender=="male")%>%
group_by(AgeDecade)%>%
summarize(average = mean(BPSysAve, na.rm = TRUE), standard_deviation = sd(BPSysAve, na.rm=TRUE))
## # A tibble: 9 x 3
## AgeDecade average standard_deviation
## <fct> <dbl> <dbl>
## 1 " 0-9" 97.4 8.32
## 2 " 10-19" 110. 11.2
## 3 " 20-29" 118. 11.3
## 4 " 30-39" 119. 12.3
## 5 " 40-49" 121. 14.0
## 6 " 50-59" 126. 17.8
## 7 " 60-69" 127. 17.5
## 8 " 70+" 130. 18.7
## 9 <NA> 136. 23.5
Exercise 7. group_by example 3 We can actually combine both of these summaries into a single line of code. This is because group_by permits us to group by more than one variable.
We can use group_by(AgeDecade, Gender) to group by both age decades and gender.
NHANES%>%
group_by(AgeDecade, Gender)%>%
summarize(average = mean(BPSysAve, na.rm = TRUE), standard_deviation = sd(BPSysAve, na.rm=TRUE))
## # A tibble: 18 x 4
## # Groups: AgeDecade [?]
## AgeDecade Gender average standard_deviation
## <fct> <fct> <dbl> <dbl>
## 1 " 0-9" female 100.0 9.07
## 2 " 0-9" male 97.4 8.32
## 3 " 10-19" female 104. 9.46
## 4 " 10-19" male 110. 11.2
## 5 " 20-29" female 108. 10.1
## 6 " 20-29" male 118. 11.3
## 7 " 30-39" female 111. 12.3
## 8 " 30-39" male 119. 12.3
## 9 " 40-49" female 115. 14.5
## 10 " 40-49" male 121. 14.0
## 11 " 50-59" female 122. 16.2
## 12 " 50-59" male 126. 17.8
## 13 " 60-69" female 127. 17.1
## 14 " 60-69" male 127. 17.5
## 15 " 70+" female 134. 19.8
## 16 " 70+" male 130. 18.7
## 17 <NA> female 142. 22.9
## 18 <NA> male 136. 23.5
Exercise 8. Arrange Now we are going to explore differences in systolic blood pressure across races, as reported in the Race1 variable.
We will learn to use the arrange function to order the outcome according to one variable.
Note that this function can be used to order any table by a given outcome. Here is an example that arranges by systolic blood pressure.
NHANES %>% arrange(BPSysAve)
If we want it in descending order we can use the desc function like this:
NHANES %>% arrange(desc(BPSysAve))
NHANES%>%
filter(Gender=="male" & AgeDecade==" 40-49")%>%
group_by(Race1)%>%
summarize(average = mean(BPSysAve, na.rm = TRUE), standard_deviation = sd(BPSysAve, na.rm=TRUE))%>%
arrange(average)
## # A tibble: 5 x 3
## Race1 average standard_deviation
## <fct> <dbl> <dbl>
## 1 White 120. 13.4
## 2 Other 120. 16.2
## 3 Hispanic 122. 11.1
## 4 Mexican 122. 13.9
## 5 Black 126. 17.1
GAPMINDER
install.packages("dslabs")
library(dslabs)
gapminder%>%
names()
## [1] "country" "year" "infant_mortality"
## [4] "life_expectancy" "fertility" "population"
## [7] "gdp" "continent" "region"
gapminder%>%
head(20)
## country year infant_mortality life_expectancy fertility
## 1 Albania 1960 115.40 62.87 6.19
## 2 Algeria 1960 148.20 47.50 7.65
## 3 Angola 1960 208.00 35.98 7.32
## 4 Antigua and Barbuda 1960 NA 62.97 4.43
## 5 Argentina 1960 59.87 65.39 3.11
## 6 Armenia 1960 NA 66.86 4.55
## 7 Aruba 1960 NA 65.66 4.82
## 8 Australia 1960 20.30 70.87 3.45
## 9 Austria 1960 37.30 68.75 2.70
## 10 Azerbaijan 1960 NA 61.33 5.57
## 11 Bahamas 1960 51.00 62.00 4.50
## 12 Bahrain 1960 134.50 51.64 7.09
## 13 Bangladesh 1960 176.30 46.20 6.73
## 14 Barbados 1960 69.50 61.80 4.33
## 15 Belarus 1960 NA 71.59 2.74
## 16 Belgium 1960 29.50 69.59 2.60
## 17 Belize 1960 NA 60.08 6.50
## 18 Benin 1960 186.90 38.29 6.28
## 19 Bhutan 1960 175.00 35.94 6.67
## 20 Bolivia 1960 173.40 43.77 6.70
## population gdp continent region
## 1 1636054 NA Europe Southern Europe
## 2 11124892 13828152297 Africa Northern Africa
## 3 5270844 NA Africa Middle Africa
## 4 54681 NA Americas Caribbean
## 5 20619075 108322326649 Americas South America
## 6 1867396 NA Asia Western Asia
## 7 54208 NA Americas Caribbean
## 8 10292328 96677859364 Oceania Australia and New Zealand
## 9 7065525 52392699681 Europe Western Europe
## 10 3897889 NA Asia Western Asia
## 11 109526 1306269490 Americas Caribbean
## 12 162501 NA Asia Western Asia
## 13 48200702 12767231590 Asia Southern Asia
## 14 230934 784120376 Americas Caribbean
## 15 8190027 NA Europe Eastern Europe
## 16 9140563 68236665814 Europe Western Europe
## 17 92068 86532304 Americas Central America
## 18 2431620 621797131 Africa Western Africa
## 19 224108 NA Asia Southern Asia
## 20 3693451 3001815692 Americas South America
gapminder%>%
filter(year==1962)%>%
ggplot(aes(fertility, life_expectancy, color=continent))+
geom_point()+
theme_fivethirtyeight()+
labs(x="FERTILIDAD", y="EXPECTATIVA_de_VIDA", title="GAPMINDER DATA POR CONTINENTE", caption="Autor:RicardoC", subtitle="Año 1962")
periodo <- c(1962, 2012)
continentes <- c("Asia", "Europe")
gapminder%>%
filter(year%in%periodo, continent%in%continentes)%>%
ggplot(aes(fertility, life_expectancy, color=continent))+
geom_point()+
facet_wrap(.~ year)+
labs(x="FERTILIDAD", y="EXPECTATIVA_de_VIDA", title="GAPMINDER DATA POR CONTINENTE", caption="Autor:RicardoC", subtitle="Año 1962")
countries <- c("South Korea", "Germany")
gapminder%>%
filter(country%in%countries)%>%
ggplot(aes(year, fertility, color=country))+
geom_line()+ #geom_point()
labs(x="YEARS", y="FERTILITY", title="GAPMINDER DATA POR PAIS: SOUTH KOREA VS GERMANY", caption="Autor:RicardoC", subtitle="Evolucion por año")
## Warning: Removed 2 rows containing missing values (geom_path).
countries<- c("United States", "Germany")
gapminder%>%
filter(country%in%countries)%>%
ggplot(aes(year, fertility, col=country))+
geom_line()+
labs(x="FERTILIDAD", y="EXPECTATIVA_de_VIDA", title="GAPMINDER DATA POR CONTINENTE", caption="Autor:RicardoC", subtitle="Año 1962")
## Warning: Removed 2 rows containing missing values (geom_path).
gapminder%>%
names()
## [1] "country" "year" "infant_mortality"
## [4] "life_expectancy" "fertility" "population"
## [7] "gdp" "continent" "region"
gapminder%>%
filter(year== 2010 & !is.na(gdp)) %>%
ggplot(aes(region, gdp/365/population, fill = continent))+
geom_boxplot()+
theme(axis.text.x =element_text(angle = 90, hjust = 1))+
scale_y_continuous(trans = "log2")+
geom_point(show.legend = FALSE) # emphasizes the individual points in the plot