library(gapminder)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(ggplot2)
library(gt)
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(nycflights13)

#1A It seems as though the countries with the highest GDP per capita have higher life expectancies.

# Scatter plot of life expectancy against GDP per capita for every row of
# gapminder. Colour is mapped to continent only inside geom_point(), so
# geom_smooth() inherits just x and y and draws one smoother over all points.
plot1 <- ggplot(data = gapminder, aes(x = gdpPercap, y = lifeExp)) +
  geom_point(mapping = aes(color = continent)) +
  geom_smooth() +
  ylab("Life Expectancy") +
  xlab("GDP Per Capita") +
  ggtitle("The Effect of the GDP Per Capita on the Life Expectancy in the World") +
  # Spelling fixed in the displayed subtitle ("Antartica" -> "Antarctica").
  labs(subtitle = "Excluding Antarctica due to lack of human life") +
  theme_dark() +
  # Centre the title.
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_color_discrete(name = "Continents")
# Convert the static ggplot into an interactive plotly widget.
ggplotly(plot1)

#1B

# Same plot as plot1 but with GDP per capita on a log10 x-axis.
# Only the two axis scales are added here: the title, both themes and the
# colour legend are already part of plot1. Re-adding scale_color_discrete()
# is what produced the "Scale for 'colour' is already present" message, so
# the duplicated layers are dropped.
plot2 <- plot1 +
  scale_y_continuous("Life Expectancy") +
  scale_x_log10("the logarithm of GDP per capita")
ggplotly(plot2)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

#1C The top six countries are in Africa, and the rest are in Europe. The African countries' growth in life expectancy over the years has been minuscule, and in some cases negative, while the European countries have grown more. Lastly, the African countries' life expectancies are typically in the 40s to 50s range, while the European countries' are in the 70s to 80s range.

# Ten countries with the smallest relative change in life expectancy between
# their earliest and latest gapminder observations, rendered as a gt table.
table1 <- gapminder %>%
  group_by(country) %>%
  summarise(
    # order_by = year makes "first"/"last" independent of the row order.
    first_year = first(lifeExp, order_by = year),
    last_year = last(lifeExp, order_by = year),
    # Fractional change (0.10 means +10%); it is not multiplied by 100.
    pct_change = (last_year - first_year) / first_year
  ) %>%
  arrange(pct_change) %>%
  head(10) %>%
  # The piped tibble is already gt()'s data argument. Passing `table1` as well
  # put it in the rowname_col slot, which caused the length > 1 coercion
  # warning (an error for `&&` on R >= 4.3).
  gt() %>%
  tab_header(
    title = md("Countries With The Least Development In Life Expectancy"),
    subtitle = md("Top Ten")
  )

#2 I conclude that clarity alone is not enough to determine the value of a diamond, because the median price of I1 diamonds is higher than that of IF diamonds even though I1 is considered to be the worst clarity grade. Considering other confounding variables, such as the diamond’s carat size, helps the user understand that the size of the diamond affects the pricing as well. In my opinion, carat size is the confounding variable with the greatest effect on price compared to clarity.

# Box plots of diamond price (x axis) for each clarity grade (y axis),
# drawn from the diamonds data set with a black-and-white theme.
plot2 <- ggplot(diamonds, aes(x = price, y = clarity)) +
  geom_boxplot() +
  labs(
    title = "The Price of a Diamond Based on Clarity",
    x = "Price",
    y = "Clarity"
  ) +
  theme_bw()
plot2

#3 The average delay tends to decrease as the number of flights increases. This may be because a larger sample size gives a more stable estimate of the average, while for destinations with fewer flights a handful of unusual flights can skew the average arrival delay up or down.

# Per-destination summary of flights: one row per `dest`.
#   avg_flights - number of rows (flights) for the destination; despite the
#                 name this is a count from n(), not an average
#   avg_delay   - mean arrival delay, ignoring missing values
nyc_flight_info <- flights %>%
  group_by(dest) %>%
  summarise(avg_flights = n(), avg_delay = mean(arr_delay, na.rm = TRUE))
nyc_flight_info
## # A tibble: 105 × 3
##    dest  avg_flights avg_delay
##    <chr>       <int>     <dbl>
##  1 ABQ           254      4.38
##  2 ACK           265      4.85
##  3 ALB           439     14.4 
##  4 ANC             8     -2.5 
##  5 ATL         17215     11.3 
##  6 AUS          2439      6.02
##  7 AVL           275      8.00
##  8 BDL           443      7.05
##  9 BGR           375      8.03
## 10 BHM           297     16.9 
## # … with 95 more rows
# Scatter plot (with smoother) of mean arrival delay against the number of
# flights per destination, using the columns of nyc_flight_info.
plot3 <- ggplot(data = nyc_flight_info, aes(x = avg_flights, y = avg_delay)) +
  geom_point() +
  geom_smooth() +
  ylab("Average Arrival Delay") +
  # avg_flights is a per-destination count, so the axis is labelled as one.
  xlab("Number of Flights") +
  # Possessive corrected ("Airports's" -> "Airports'").
  ggtitle("New York Airports' Average Arrival Delay Per Destination") +
  theme(plot.title = element_text(hjust = 0.5))
# The yellow panel background is applied only to the interactive version;
# plot3 itself is left unchanged. The stray trailing comma inside
# element_rect() has been removed.
ggplotly(
  plot3 +
    theme(panel.background = element_rect(fill = "yellow", colour = "yellow"))
)

#4A According to the smooth line, the user can conclude that as the length of the flight increases, so does the speed of the flight.

# Per-flight speed and distance, one row per row of flights.
# The previous version divided distance by `hour`, which in
# nycflights13::flights is the scheduled departure hour of the day rather
# than a duration, so the result was not a speed. Speed is now distance
# divided by time in the air. Per the nycflights13 documentation air_time is
# in minutes and distance in miles, so Speed is in miles per hour.
# Rows with a missing air_time get an NA Speed.
# transmute() keeps only the two new columns and replaces the row-wise use of
# summarise(), which is meant for per-group aggregates.
nyc_flight_info_2 <- flights %>%
  transmute(
    Speed = distance / (air_time / 60),
    Distance = distance
  )
nyc_flight_info_2
# Speed plotted against Distance for every row of nyc_flight_info_2, with a
# smoother over the points, shown as an interactive plotly widget.
plot4 <- ggplot(nyc_flight_info_2, aes(Distance, Speed)) +
  geom_point() +
  geom_smooth() +
  labs(title = "New York's Flight's Speed According To Size") +
  theme_bw()
ggplotly(plot4)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

#4B The probability that a flight that departed late still arrived on time (or early) is about 27.7%.

# Share of flights that departed late (dep_delay > 0) but still arrived on
# time or early (arr_delay <= 0).
nyc_flight_info_3 <- flights %>%
  # The comparison already yields TRUE/FALSE/NA, so wrapping it in
  # ifelse(cond, T, F) was redundant; T and F are also ordinary variables
  # that can be reassigned.
  mutate(early_ot = arr_delay <= 0) %>%
  # Keep late departures only; rows with a missing dep_delay are dropped.
  filter(dep_delay > 0) %>%
  summarise(
    # na.rm = TRUE ignores flights whose arrival delay is missing.
    prob_ot = mean(early_ot, na.rm = TRUE)
  )
nyc_flight_info_3
## # A tibble: 1 × 1
##   prob_ot
##     <dbl>
## 1   0.277

#4C

# Largest and mean departure delay among flights that left late
# (dep_delay > 0) yet arrived on time or early (arr_delay <= 0).
nyc_flight_info_4 <- flights %>%
  # filter() keeps only rows where every condition is TRUE, so rows with a
  # missing dep_delay or arr_delay are dropped without the helper columns
  # (built with ifelse(cond, T, F)) or explicit is.na() checks.
  filter(dep_delay > 0, arr_delay <= 0) %>%
  summarise(
    max_delay = max(dep_delay),
    avg_delay_2 = mean(dep_delay)
  )
nyc_flight_info_4
## # A tibble: 1 × 2
##   max_delay avg_delay_2
##       <dbl>       <dbl>
## 1        63        7.47

#5 Since my first idea for my final project was a bit too complex, I decided to scrap the whole idea and focus on finding data more closely associated with my career interest. I want to be a criminal/civil rights lawyer, so I would like to know what type of clients I may be dealing with and, as a result, be able to represent them better. Many defenses in the legal system have stemmed from the accused’s childhood. One common belief is that the crime rate is high because of single-parent households. Therefore, my first dataset is the US arrest record separated by race, and my second dataset is the US single-parent household data divided by race. The question will be: do single-parent households actually affect the crime rate and, if so, for which race is the effect greatest?