Part I

library(tidyverse)

This dataset is from data.world and contains information about global land temperatures for major cities around the world (https://data.world/data-society/global-climate-change-data/workspace/file?filename=GlobalLandTemperatures%2FGlobalLandTemperaturesByMajorCity.csv).

# Load the major-city temperature records. The relative path is resolved
# against the current working directory. head() previews the first rows.
GlobalTemp <- read_csv("GlobalLandTemperatures_GlobalLandTemperaturesByMajorCity.csv")
head(GlobalTemp)
## # A tibble: 6 x 7
##   dt         AverageTemperatu~ AverageTemperat~ City  Country Latitude Longitude
##   <date>                 <dbl>            <dbl> <chr> <chr>   <chr>    <chr>    
## 1 1849-01-01              26.7             1.44 Abid~ Côte D~ 5.63N    3.23W    
## 2 1849-02-01              27.4             1.36 Abid~ Côte D~ 5.63N    3.23W    
## 3 1849-03-01              28.1             1.61 Abid~ Côte D~ 5.63N    3.23W    
## 4 1849-04-01              26.1             1.39 Abid~ Côte D~ 5.63N    3.23W    
## 5 1849-05-01              25.4             1.2  Abid~ Côte D~ 5.63N    3.23W    
## 6 1849-06-01              24.8             1.40 Abid~ Côte D~ 5.63N    3.23W
tail(GlobalTemp)
## # A tibble: 6 x 7
##   dt         AverageTemperatu~ AverageTemperat~ City  Country Latitude Longitude
##   <date>                 <dbl>            <dbl> <chr> <chr>   <chr>    <chr>    
## 1 2013-04-01              12.6            1.82  Xian  China   34.56N   108.97E  
## 2 2013-05-01              19.0            0.807 Xian  China   34.56N   108.97E  
## 3 2013-06-01              23.5            0.647 Xian  China   34.56N   108.97E  
## 4 2013-07-01              25.3            1.04  Xian  China   34.56N   108.97E  
## 5 2013-08-01              24.5            0.84  Xian  China   34.56N   108.97E  
## 6 2013-09-01              NA             NA     Xian  China   34.56N   108.97E
# Data dictionary: one row per column of GlobalTemp. Descriptions are paired
# with names(GlobalTemp) by position, so they must follow the column order.
Variables <- tibble(
  Name = names(GlobalTemp),
  Description = c(
    "Date of measurement",
    "Mean Land Temperature",
    "Error around the mean",
    "City where temperature was measured",
    "Country where City is located",
    "Angular distance north or south",
    "Angular distance east or west"
  )
)
Variables
## # A tibble: 7 x 2
##   Name                          Description                        
##   <chr>                         <chr>                              
## 1 dt                            Date of measurement                
## 2 AverageTemperature            Mean Land Temperature              
## 3 AverageTemperatureUncertainty Error around the mean              
## 4 City                          City where temperature was measured
## 5 Country                       Country where City is located      
## 6 Latitude                      Angular distance north or south    
## 7 Longitude                     Angular distance east or west

Here are the mean and standard deviation of each numerical variable in the dataset:

# One-row summary of the two numeric columns: mean and standard deviation of
# the temperature and of its reported uncertainty. Missing readings are
# dropped (na.rm = TRUE) so months without data do not turn the result into NA.
temp_summary <- tibble(
  "Mean Average Temp" = mean(GlobalTemp$AverageTemperature, na.rm = TRUE),
  "Mean Temp Uncertainty" = mean(
    GlobalTemp$AverageTemperatureUncertainty,
    na.rm = TRUE
  ),
  "S.D. of Ave. Temp." = sd(GlobalTemp$AverageTemperature, na.rm = TRUE),
  "S.D. of Temp. Uncertainty" = sd(
    GlobalTemp$AverageTemperatureUncertainty,
    na.rm = TRUE
  )
)
temp_summary
## # A tibble: 1 x 4
##   `Mean Average Tem~ `Mean Temp Uncertai~ `S.D. of Ave. Te~ `S.D. of Temp. Unce~
##                <dbl>                <dbl>             <dbl>                <dbl>
## 1               18.1                0.969              10.0                0.980
#' Proportion of observations taken by each distinct value
#'
#' @param x A vector (for example city or country names).
#' @return An unnamed numeric vector with one proportion per element of
#'   `unique(x)`, in that order. `NA`s in `x` are ignored when computing each
#'   share. An empty `x` gives `numeric(0)`.
prop_calc <- function(x) {
  # vapply() performs no iterations on empty input and allocates the result
  # once, instead of appending to a vector on every pass of a loop.
  vapply(
    unique(x),
    function(elem) mean(x == elem, na.rm = TRUE),
    numeric(1),
    USE.NAMES = FALSE
  )
}

# Pair two vectors as the columns `n` and `m` of a tibble.
#   x: values stored in column `n`
#   y: values stored in column `m`
prop_table <- function(x, y) {
  tibble(n = x, m = y)
}

Proportion of each city as part of the dataset:

# Share of all rows contributed by each city, in order of first appearance.
City_Proportions <- tibble(
  City = unique(GlobalTemp[["City"]]),
  Proportion = prop_calc(GlobalTemp[["City"]])
)

# The same table built through prop_table(), whose generic `n`/`m` columns are
# then given readable names.
City_Props <- GlobalTemp[["City"]] %>%
  unique() %>%
  prop_table(prop_calc(GlobalTemp[["City"]]))
names(City_Props) <- c("City", "Proportion")

City_Props
## # A tibble: 100 x 2
##    City           Proportion
##    <chr>               <dbl>
##  1 Abidjan           0.00827
##  2 Addis Abeba       0.00822
##  3 Ahmadabad         0.0109 
##  4 Aleppo            0.0112 
##  5 Alexandria        0.0112 
##  6 Ankara            0.0130 
##  7 Baghdad           0.00977
##  8 Bangalore         0.0109 
##  9 Bangkok           0.00991
## 10 Belo Horizonte    0.00912
## # ... with 90 more rows

Proportion of countries used in the dataset:

# Share of all rows contributed by each country.
Country_Proportions <- tibble(
  Country = unique(GlobalTemp[["Country"]]),
  Proportion = prop_calc(GlobalTemp[["Country"]])
)

# Show the best-represented countries first (display only; not stored).
arrange(Country_Proportions, desc(Proportion))
## # A tibble: 49 x 2
##    Country       Proportion
##    <chr>              <dbl>
##  1 India             0.153 
##  2 China             0.148 
##  3 Brazil            0.0541
##  4 Turkey            0.0396
##  5 United States     0.0354
##  6 Egypt             0.0317
##  7 Pakistan          0.0308
##  8 Canada            0.0271
##  9 Russia            0.0271
## 10 Nigeria           0.0241
## # ... with 39 more rows

There are 100 unique cities and 49 countries in the dataset.

# Number of distinct cities: de-duplicate the City column, then count rows.
GlobalTemp["City"] %>% unique() %>% count()
## # A tibble: 1 x 1
##       n
##   <int>
## 1   100
# Number of distinct countries: de-duplicate the Country column, then count rows.
GlobalTemp["Country"] %>% unique() %>% count()
## # A tibble: 1 x 1
##       n
##   <int>
## 1    49

Calculating the mean land temperature and the mean temperature uncertainty for each year, so that each can be plotted by year.

# Collapse the monthly city records into one row per calendar year, holding
# the mean temperature and mean uncertainty (both rounded to 2 decimals).
GlobalTempYear <- GlobalTemp %>%
  # Read the year from the date itself rather than regex-stripping its text.
  mutate(Year = as.numeric(format(as.Date(dt), "%Y"))) %>%
  group_by(Year) %>%
  summarize(
    MeanLandTemp = round(mean(AverageTemperature, na.rm = TRUE), 2),
    MeanTempError = round(
      mean(AverageTemperatureUncertainty, na.rm = TRUE),
      2
    )
  ) %>%
  # A year whose readings are all missing averages to NaN; drop those years.
  # is.na() is TRUE for NaN, so no numeric-to-string comparison is needed.
  filter(!is.na(MeanLandTemp))

head(GlobalTempYear)
## # A tibble: 6 x 3
##    Year MeanLandTemp MeanTempError
##   <dbl>        <dbl>         <dbl>
## 1  1743         4.02          1.91
## 2  1744        10.3           1.8 
## 3  1745         0.38          1.81
## 4  1750         9.15          1.81
## 5  1751         9.44          1.75
## 6  1752         3.37          1.92
tail(GlobalTempYear)
## # A tibble: 6 x 3
##    Year MeanLandTemp MeanTempError
##   <dbl>        <dbl>         <dbl>
## 1  2008         19.6         0.35 
## 2  2009         19.8         0.35 
## 3  2010         19.9         0.36 
## 4  2011         19.6         0.39 
## 5  2012         19.7         0.5  
## 6  2013         20.3         0.580

When plotting the average land temperatures, we can see that they have been increasing over the years. The measurement error decreases roughly exponentially over the same period, which can be attributed to improvements in measuring tools.

# Yearly mean temperature, coloured by that year's mean uncertainty.
GlobalTempYear %>%
  ggplot(aes(x = Year, y = MeanLandTemp, color = MeanTempError)) +
  geom_point()

# Yearly mean uncertainty, coloured by that year's mean temperature.
GlobalTempYear %>%
  ggplot(aes(x = Year, y = MeanTempError, color = MeanLandTemp)) +
  geom_point()

# Distribution of the individual monthly temperature readings.
ggplot(GlobalTemp, aes(x = AverageTemperature)) +
  geom_histogram(fill = "darkred")

# Distribution of the uncertainty attached to those readings.
ggplot(GlobalTemp, aes(x = AverageTemperatureUncertainty)) +
  geom_histogram(fill = "salmon")

# Measurement half of the data: the readings, keyed by date and city.
temps <- select(
  GlobalTemp,
  dt, AverageTemperature, AverageTemperatureUncertainty, City
)
temps
## # A tibble: 239,177 x 4
##    dt         AverageTemperature AverageTemperatureUncertainty City   
##    <date>                  <dbl>                         <dbl> <chr>  
##  1 1849-01-01               26.7                          1.44 Abidjan
##  2 1849-02-01               27.4                          1.36 Abidjan
##  3 1849-03-01               28.1                          1.61 Abidjan
##  4 1849-04-01               26.1                          1.39 Abidjan
##  5 1849-05-01               25.4                          1.2  Abidjan
##  6 1849-06-01               24.8                          1.40 Abidjan
##  7 1849-07-01               24.1                          1.25 Abidjan
##  8 1849-08-01               23.6                          1.26 Abidjan
##  9 1849-09-01               23.7                          1.23 Abidjan
## 10 1849-10-01               25.3                          1.18 Abidjan
## # ... with 239,167 more rows
# Location half of the data: where each city is, keyed by date and city.
location <- select(GlobalTemp, dt, City, Country, Latitude, Longitude)
location
## # A tibble: 239,177 x 5
##    dt         City    Country       Latitude Longitude
##    <date>     <chr>   <chr>         <chr>    <chr>    
##  1 1849-01-01 Abidjan Côte D'Ivoire 5.63N    3.23W    
##  2 1849-02-01 Abidjan Côte D'Ivoire 5.63N    3.23W    
##  3 1849-03-01 Abidjan Côte D'Ivoire 5.63N    3.23W    
##  4 1849-04-01 Abidjan Côte D'Ivoire 5.63N    3.23W    
##  5 1849-05-01 Abidjan Côte D'Ivoire 5.63N    3.23W    
##  6 1849-06-01 Abidjan Côte D'Ivoire 5.63N    3.23W    
##  7 1849-07-01 Abidjan Côte D'Ivoire 5.63N    3.23W    
##  8 1849-08-01 Abidjan Côte D'Ivoire 5.63N    3.23W    
##  9 1849-09-01 Abidjan Côte D'Ivoire 5.63N    3.23W    
## 10 1849-10-01 Abidjan Côte D'Ivoire 5.63N    3.23W    
## # ... with 239,167 more rows
# Save the two halves as CSV files; the relative paths are resolved against the
# current working directory.
write_csv(temps, "temperatures.csv")
write_csv(location, "locations.csv")
# Recombine the halves on their shared (dt, City) key. The result is only
# printed, not stored.
full_join(temps, location, by = c("dt","City"))
## # A tibble: 239,177 x 7
##    dt         AverageTemperat~ AverageTemperat~ City  Country Latitude Longitude
##    <date>                <dbl>            <dbl> <chr> <chr>   <chr>    <chr>    
##  1 1849-01-01             26.7             1.44 Abid~ Côte D~ 5.63N    3.23W    
##  2 1849-02-01             27.4             1.36 Abid~ Côte D~ 5.63N    3.23W    
##  3 1849-03-01             28.1             1.61 Abid~ Côte D~ 5.63N    3.23W    
##  4 1849-04-01             26.1             1.39 Abid~ Côte D~ 5.63N    3.23W    
##  5 1849-05-01             25.4             1.2  Abid~ Côte D~ 5.63N    3.23W    
##  6 1849-06-01             24.8             1.40 Abid~ Côte D~ 5.63N    3.23W    
##  7 1849-07-01             24.1             1.25 Abid~ Côte D~ 5.63N    3.23W    
##  8 1849-08-01             23.6             1.26 Abid~ Côte D~ 5.63N    3.23W    
##  9 1849-09-01             23.7             1.23 Abid~ Côte D~ 5.63N    3.23W    
## 10 1849-10-01             25.3             1.18 Abid~ Côte D~ 5.63N    3.23W    
## # ... with 239,167 more rows

Part II

The dataset that I will be using is the ToothGrowth dataset from R.

head(ToothGrowth)
##    len supp dose
## 1  4.2   VC  0.5
## 2 11.5   VC  0.5
## 3  7.3   VC  0.5
## 4  5.8   VC  0.5
## 5  6.4   VC  0.5
## 6 10.0   VC  0.5
tail(ToothGrowth)
##     len supp dose
## 55 24.8   OJ    2
## 56 30.9   OJ    2
## 57 26.4   OJ    2
## 58 27.3   OJ    2
## 59 29.4   OJ    2
## 60 23.0   OJ    2
# Tooth length against dose, one panel per supplement type.
# dose takes only a handful of distinct values, which leaves the default loess
# smoother poorly determined, so draw a straight-line fit instead. This is the
# same len ~ dose model that is estimated with lm() for each supplement.
ggplot(ToothGrowth, aes(dose, len, color = supp)) +
  geom_point() +
  facet_grid(supp ~ .) +
  geom_smooth(method = "lm", formula = y ~ x) +
  labs(
    title = "Tooth Growth on Supplement Type",
    x = "Dose (mg)",
    y = "Tooth Length"
  )

# Keep only the rows with supp == "VC", then regress tooth length on dose.
vc <- filter(ToothGrowth, supp == "VC")

vcg <- lm(formula = len ~ dose, data = vc)
summary(vcg)
## 
## Call:
## lm(formula = len ~ dose, data = vc)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.2264 -2.6029  0.0814  2.2288  7.4893 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    3.295      1.427   2.309   0.0285 *  
## dose          11.716      1.079  10.860 1.51e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.685 on 28 degrees of freedom
## Multiple R-squared:  0.8082, Adjusted R-squared:  0.8013 
## F-statistic: 117.9 on 1 and 28 DF,  p-value: 1.509e-11

The linear regression model for the supplement type VC is:

Length = 3.295 + 11.716 * dose

# Keep only the rows with supp == "OJ", then regress tooth length on dose.
OJ <- filter(ToothGrowth, supp == "OJ")

OJG <- lm(formula = len ~ dose, data = OJ)
summary(OJG)
## 
## Call:
## lm(formula = len ~ dose, data = OJ)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.2557 -3.7979 -0.0643  3.3521  7.9386 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   11.550      1.722   6.708 2.79e-07 ***
## dose           7.811      1.302   6.001 1.82e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.446 on 28 degrees of freedom
## Multiple R-squared:  0.5626, Adjusted R-squared:  0.547 
## F-statistic: 36.01 on 1 and 28 DF,  p-value: 1.825e-06

The linear regression model for the supplement type OJ is:

Length = 11.55 + 7.811 * dose