This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Print per-column summary statistics (min, quartiles, mean, max) for the `cars` data set.
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.2
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.2 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.1.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(openintro)
## Warning: package 'openintro' was built under R version 4.1.2
## Loading required package: airports
## Warning: package 'airports' was built under R version 4.1.2
## Loading required package: cherryblossom
## Warning: package 'cherryblossom' was built under R version 4.1.2
## Loading required package: usdata
## Warning: package 'usdata' was built under R version 4.1.2
# Load the `present` data set from the openintro package into the workspace,
# then print it to see its first rows (year, boys, girls).
data("present", package = "openintro")
present
## # A tibble: 63 x 3
## year boys girls
## <dbl> <dbl> <dbl>
## 1 1940 1211684 1148715
## 2 1941 1289734 1223693
## 3 1942 1444365 1364631
## 4 1943 1508959 1427901
## 5 1944 1435301 1359499
## 6 1945 1404587 1330869
## 7 1946 1691220 1597452
## 8 1947 1899876 1800064
## 9 1948 1813852 1721216
## 10 1949 1826352 1733177
## # ... with 53 more rows
# Compact, column-wise preview of the data: one line per column with its type.
present %>% glimpse()
## Rows: 63
## Columns: 3
## $ year <dbl> 1940, 1941, 1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950~
## $ boys <dbl> 1211684, 1289734, 1444365, 1508959, 1435301, 1404587, 1691220, 1~
## $ girls <dbl> 1148715, 1223693, 1364631, 1427901, 1359499, 1330869, 1597452, 1~
# Extract the boys column as a plain numeric vector.
present[["boys"]]
## [1] 1211684 1289734 1444365 1508959 1435301 1404587 1691220 1899876 1813852
## [10] 1826352 1823555 1923020 1971262 2001798 2059068 2073719 2133588 2179960
## [19] 2152546 2173638 2179708 2186274 2132466 2101632 2060162 1927054 1845862
## [28] 1803388 1796326 1846572 1915378 1822910 1669927 1608326 1622114 1613135
## [37] 1624436 1705916 1709394 1791267 1852616 1860272 1885676 1865553 1879490
## [46] 1927983 1924868 1951153 2002424 2069490 2129495 2101518 2082097 2048861
## [55] 2022589 1996355 1990480 1985596 2016205 2026854 2076969 2057922 2057979
# Exercise 1: extract just the counts of girls' births recorded in the US.
# `$` pulls the `girls` column out of the tibble as a numeric vector, one
# value per year. (dplyr::count(present, "girls") is NOT the right tool here:
# it groups by the constant string "girls" and only reports the number of
# rows, 63, instead of the birth counts.)
present$girls
## [1] 1148715 1223693 1364631 1427901 1359499 1330869 1597452 1800064 1721216
## [10] 1733177 1730594 1827830 1875724 1900322 1958294 1973576 2029502 2074824
## [19] 2051266 2071158 2078142 2082052 2034896 1996388 1967328 1833304 1760412
## [28] 1717571 1705238 1753634 1816008 1733060 1588484 1528639 1537844 1531063
## [37] 1543352 1620716 1623885 1703131 1759642 1768966 1794861 1773380 1789651
## [46] 1832578 1831679 1858241 1907086 1971468 2028717 2009389 1982917 1951379
## [55] 1930178 1903234 1901014 1895298 1925348 1932563 1981845 1968011 1963747
# Scatter plot of the number of girls born per year in the US.
present %>%
  ggplot(aes(x = year, y = girls)) +
  geom_point()
# Same data drawn as a simple line plot, which makes the trend over time
# easier to follow.
present %>%
  ggplot(aes(x = year, y = girls)) +
  geom_line()
## Exercise 2: From 1940 to about 1960 the number of girls born per year increases.
## It then declines until the mid-1970s, and from about 1980 to 2000 it increases again.
# Element-wise sum of the two columns: total births for each year, as a vector.
with(present, boys + girls)
## [1] 2360399 2513427 2808996 2936860 2794800 2735456 3288672 3699940 3535068
## [10] 3559529 3554149 3750850 3846986 3902120 4017362 4047295 4163090 4254784
## [19] 4203812 4244796 4257850 4268326 4167362 4098020 4027490 3760358 3606274
## [28] 3520959 3501564 3600206 3731386 3555970 3258411 3136965 3159958 3144198
## [37] 3167788 3326632 3333279 3494398 3612258 3629238 3680537 3638933 3669141
## [46] 3760561 3756547 3809394 3909510 4040958 4158212 4110907 4065014 4000240
## [55] 3952767 3899589 3891494 3880894 3941553 3959417 4058814 4025933 4021726
# Store the yearly total (boys + girls) as a new `total` column, then print
# the updated tibble.
present <- mutate(present, total = boys + girls)
present
## # A tibble: 63 x 4
## year boys girls total
## <dbl> <dbl> <dbl> <dbl>
## 1 1940 1211684 1148715 2360399
## 2 1941 1289734 1223693 2513427
## 3 1942 1444365 1364631 2808996
## 4 1943 1508959 1427901 2936860
## 5 1944 1435301 1359499 2794800
## 6 1945 1404587 1330869 2735456
## 7 1946 1691220 1597452 3288672
## 8 1947 1899876 1800064 3699940
## 9 1948 1813852 1721216 3535068
## 10 1949 1826352 1733177 3559529
## # ... with 53 more rows
# Add `boy_ratio`: the share of each year's births that were boys.
present <- mutate(present, boy_ratio = boys / total)
present
## # A tibble: 63 x 5
## year boys girls total boy_ratio
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1940 1211684 1148715 2360399 0.513
## 2 1941 1289734 1223693 2513427 0.513
## 3 1942 1444365 1364631 2808996 0.514
## 4 1943 1508959 1427901 2936860 0.514
## 5 1944 1435301 1359499 2794800 0.514
## 6 1945 1404587 1330869 2735456 0.513
## 7 1946 1691220 1597452 3288672 0.514
## 8 1947 1899876 1800064 3699940 0.513
## 9 1948 1813852 1721216 3535068 0.513
## 10 1949 1826352 1733177 3559529 0.513
## # ... with 53 more rows
# Line plot of the proportion of boys among all births, by year.
present %>%
  ggplot(aes(x = year, y = boy_ratio)) +
  geom_line()
# Same proportion as points, overlaid with a fitted linear trend line.
present %>%
  ggplot(aes(x = year, y = boy_ratio)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
# The fitted linear trend shows the proportion of boys among all births sloping downward over 1940-2002.
# Add `boy_to_girl_ratio`: number of boys born per girl born in each year.
present <- mutate(present, boy_to_girl_ratio = boys / girls)
present
## # A tibble: 63 x 6
## year boys girls total boy_ratio boy_to_girl_ratio
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1940 1211684 1148715 2360399 0.513 1.05
## 2 1941 1289734 1223693 2513427 0.513 1.05
## 3 1942 1444365 1364631 2808996 0.514 1.06
## 4 1943 1508959 1427901 2936860 0.514 1.06
## 5 1944 1435301 1359499 2794800 0.514 1.06
## 6 1945 1404587 1330869 2735456 0.513 1.06
## 7 1946 1691220 1597452 3288672 0.514 1.06
## 8 1947 1899876 1800064 3699940 0.513 1.06
## 9 1948 1813852 1721216 3535068 0.513 1.05
## 10 1949 1826352 1733177 3559529 0.513 1.05
## # ... with 53 more rows
# Add a logical flag that is TRUE for years in which more boys than girls
# were born.
present <- mutate(present, more_boys = boys > girls)
present
## # A tibble: 63 x 7
## year boys girls total boy_ratio boy_to_girl_ratio more_boys
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl>
## 1 1940 1211684 1148715 2360399 0.513 1.05 TRUE
## 2 1941 1289734 1223693 2513427 0.513 1.05 TRUE
## 3 1942 1444365 1364631 2808996 0.514 1.06 TRUE
## 4 1943 1508959 1427901 2936860 0.514 1.06 TRUE
## 5 1944 1435301 1359499 2794800 0.514 1.06 TRUE
## 6 1945 1404587 1330869 2735456 0.513 1.06 TRUE
## 7 1946 1691220 1597452 3288672 0.514 1.06 TRUE
## 8 1947 1899876 1800064 3699940 0.513 1.06 TRUE
## 9 1948 1813852 1721216 3535068 0.513 1.05 TRUE
## 10 1949 1826352 1733177 3559529 0.513 1.05 TRUE
## # ... with 53 more rows
# Smallest and largest yearly boy counts across the whole data set.
summarise(present, min = min(boys), max = max(boys))
## # A tibble: 1 x 2
## min max
## <dbl> <dbl>
## 1 1211684 2186274
# Years covered by the data set.
present[["year"]]
## [1] 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954
## [16] 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969
## [31] 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984
## [46] 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999
## [61] 2000 2001 2002
# Column-wise preview, now including the derived columns.
present %>% glimpse()
## Rows: 63
## Columns: 7
## $ year <dbl> 1940, 1941, 1942, 1943, 1944, 1945, 1946, 1947, 1948~
## $ boys <dbl> 1211684, 1289734, 1444365, 1508959, 1435301, 1404587~
## $ girls <dbl> 1148715, 1223693, 1364631, 1427901, 1359499, 1330869~
## $ total <dbl> 2360399, 2513427, 2808996, 2936860, 2794800, 2735456~
## $ boy_ratio <dbl> 0.5133386, 0.5131376, 0.5141926, 0.5138001, 0.513561~
## $ boy_to_girl_ratio <dbl> 1.054817, 1.053969, 1.058429, 1.056767, 1.055757, 1.~
## $ more_boys <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE~
# Dimensions as c(rows, columns).
dim(present)
## [1] 63 7
# Column names.
names(present)
## [1] "year" "boys" "girls"
## [4] "total" "boy_ratio" "boy_to_girl_ratio"
## [7] "more_boys"
# Mean of the combined boys + girls counts in the arbuthnot data:
mean(with(arbuthnot, boys + girls))
## [1] 11441.74
# Mean of the combined boys + girls counts in the present data:
mean(with(present, boys + girls))
## [1] 3679515
# Add `girl_ratio`: the share of each year's births that were girls.
present <- mutate(present, girl_ratio = girls / total)
present
## # A tibble: 63 x 8
## year boys girls total boy_ratio boy_to_girl_rat~ more_boys girl_ratio
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl> <dbl>
## 1 1940 1211684 1148715 2360399 0.513 1.05 TRUE 0.487
## 2 1941 1289734 1223693 2513427 0.513 1.05 TRUE 0.487
## 3 1942 1444365 1364631 2808996 0.514 1.06 TRUE 0.486
## 4 1943 1508959 1427901 2936860 0.514 1.06 TRUE 0.486
## 5 1944 1435301 1359499 2794800 0.514 1.06 TRUE 0.486
## 6 1945 1404587 1330869 2735456 0.513 1.06 TRUE 0.487
## 7 1946 1691220 1597452 3288672 0.514 1.06 TRUE 0.486
## 8 1947 1899876 1800064 3699940 0.513 1.06 TRUE 0.487
## 9 1948 1813852 1721216 3535068 0.513 1.05 TRUE 0.487
## 10 1949 1826352 1733177 3559529 0.513 1.05 TRUE 0.487
## # ... with 53 more rows
# Proportion of girls among all births by year, with a fitted linear trend
# line on top of the points.
present %>%
  ggplot(aes(x = year, y = girl_ratio)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
# Analysis: the proportion of girls among all births trends upward from 1940 to 2002,
# whereas the proportion of boys trends downward over the same period.
# (Re)compute the yearly total of births. `total` was already added earlier
# with the same formula, so this overwrites the column with identical values.
present <- mutate(present, total = boys + girls)
present
## # A tibble: 63 x 8
## year boys girls total boy_ratio boy_to_girl_rat~ more_boys girl_ratio
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl> <dbl>
## 1 1940 1211684 1148715 2360399 0.513 1.05 TRUE 0.487
## 2 1941 1289734 1223693 2513427 0.513 1.05 TRUE 0.487
## 3 1942 1444365 1364631 2808996 0.514 1.06 TRUE 0.486
## 4 1943 1508959 1427901 2936860 0.514 1.06 TRUE 0.486
## 5 1944 1435301 1359499 2794800 0.514 1.06 TRUE 0.486
## 6 1945 1404587 1330869 2735456 0.513 1.06 TRUE 0.487
## 7 1946 1691220 1597452 3288672 0.514 1.06 TRUE 0.486
## 8 1947 1899876 1800064 3699940 0.513 1.06 TRUE 0.487
## 9 1948 1813852 1721216 3535068 0.513 1.05 TRUE 0.487
## 10 1949 1826352 1733177 3559529 0.513 1.05 TRUE 0.487
## # ... with 53 more rows
# Sort the years from most to fewest total births; the first row is the
# peak year.
total_desc <- arrange(present, desc(total))
total_desc
## # A tibble: 63 x 8
## year boys girls total boy_ratio boy_to_girl_rat~ more_boys girl_ratio
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl> <dbl>
## 1 1961 2186274 2082052 4268326 0.512 1.05 TRUE 0.488
## 2 1960 2179708 2078142 4257850 0.512 1.05 TRUE 0.488
## 3 1957 2179960 2074824 4254784 0.512 1.05 TRUE 0.488
## 4 1959 2173638 2071158 4244796 0.512 1.05 TRUE 0.488
## 5 1958 2152546 2051266 4203812 0.512 1.05 TRUE 0.488
## 6 1962 2132466 2034896 4167362 0.512 1.05 TRUE 0.488
## 7 1956 2133588 2029502 4163090 0.513 1.05 TRUE 0.487
## 8 1990 2129495 2028717 4158212 0.512 1.05 TRUE 0.488
## 9 1991 2101518 2009389 4110907 0.511 1.05 TRUE 0.489
## 10 1963 2101632 1996388 4098020 0.513 1.05 TRUE 0.487
## # ... with 53 more rows
# Line plot of total births per year.
present %>%
  ggplot(aes(x = year, y = total)) +
  geom_line()
# 1961 is the year with the highest total number of births in the U.S. (4,268,326), as the first row of total_desc shows.
Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.