library(tidyverse)
This dataset is from data.world and contains information about global land temperatures for major cities around the world (https://data.world/data-society/global-climate-change-data/workspace/file?filename=GlobalLandTemperatures%2FGlobalLandTemperaturesByMajorCity.csv).
# Load the major-city land temperature CSV and preview its first rows.
GlobalTemp <- read_csv(
  file = "GlobalLandTemperatures_GlobalLandTemperaturesByMajorCity.csv"
)
head(GlobalTemp, n = 6)
## # A tibble: 6 x 7
## dt AverageTemperatu~ AverageTemperat~ City Country Latitude Longitude
## <date> <dbl> <dbl> <chr> <chr> <chr> <chr>
## 1 1849-01-01 26.7 1.44 Abid~ Côte D~ 5.63N 3.23W
## 2 1849-02-01 27.4 1.36 Abid~ Côte D~ 5.63N 3.23W
## 3 1849-03-01 28.1 1.61 Abid~ Côte D~ 5.63N 3.23W
## 4 1849-04-01 26.1 1.39 Abid~ Côte D~ 5.63N 3.23W
## 5 1849-05-01 25.4 1.2 Abid~ Côte D~ 5.63N 3.23W
## 6 1849-06-01 24.8 1.40 Abid~ Côte D~ 5.63N 3.23W
# Preview the last rows of the dataset.
tail(GlobalTemp, n = 6)
## # A tibble: 6 x 7
## dt AverageTemperatu~ AverageTemperat~ City Country Latitude Longitude
## <date> <dbl> <dbl> <chr> <chr> <chr> <chr>
## 1 2013-04-01 12.6 1.82 Xian China 34.56N 108.97E
## 2 2013-05-01 19.0 0.807 Xian China 34.56N 108.97E
## 3 2013-06-01 23.5 0.647 Xian China 34.56N 108.97E
## 4 2013-07-01 25.3 1.04 Xian China 34.56N 108.97E
## 5 2013-08-01 24.5 0.84 Xian China 34.56N 108.97E
## 6 2013-09-01 NA NA Xian China 34.56N 108.97E
# Data dictionary: one row per column of GlobalTemp, pairing each column name
# with a short description. The descriptions are listed in the same order as
# names(GlobalTemp).
Variables <- tibble(
  Name = names(GlobalTemp),
  Description = c(
    "Date of measurement",
    "Mean Land Temperature",
    "Error around the mean",
    "City where temperature was measured",
    "Country where City is located",
    "Angular distance from north to south",
    "Angular distance from east to west"
  )
)
Variables
## # A tibble: 7 x 2
## Name Description
## <chr> <chr>
## 1 dt Date of measurement
## 2 AverageTemperature Mean Land Temperature
## 3 AverageTemperatureUncertainty Error around the mean
## 4 City City where temperature was measured
## 5 Country Country where City is located
## 6 Latitude Angular distance from north to south
## 7 Longitude Angular distance from east to west
Here are the mean and standard deviation of the numerical variables in the dataset:
# One-row summary of the two numeric columns: mean and standard deviation of
# the temperature and of its uncertainty. Missing values are dropped before
# each statistic is computed (na.rm = TRUE).
temp_summary <- tibble(
  "Mean Average Temp" = mean(GlobalTemp$AverageTemperature, na.rm = TRUE),
  "Mean Temp Uncertainty" = mean(
    GlobalTemp$AverageTemperatureUncertainty,
    na.rm = TRUE
  ),
  "S.D. of Ave. Temp." = sd(GlobalTemp$AverageTemperature, na.rm = TRUE),
  "S.D. of Temp. Uncertainty" = sd(
    GlobalTemp$AverageTemperatureUncertainty,
    na.rm = TRUE
  )
)
temp_summary
## # A tibble: 1 x 4
## `Mean Average Tem~ `Mean Temp Uncertai~ `S.D. of Ave. Te~ `S.D. of Temp. Unce~
## <dbl> <dbl> <dbl> <dbl>
## 1 18.1 0.969 10.0 0.980
# Calculate proportions
#
# For every distinct value of `x`, taken in the order returned by unique(x),
# compute the share of non-missing elements of `x` that equal that value.
#
# Args:
#   x: An atomic vector (e.g. character or numeric).
#
# Returns:
#   A numeric vector with one proportion per element of unique(x). An empty
#   `x` yields an empty numeric vector. If `x` contains NA, the entry that
#   corresponds to NA is NaN, because `x == NA` is never TRUE.
prop_calc <- function(x) {
  values <- unique(x)
  # seq_along() keeps the iteration empty when `x` has no elements, and
  # vapply() builds the result in one pass instead of growing it with c().
  vapply(
    seq_along(values),
    function(i) mean(x == values[i], na.rm = TRUE),
    numeric(1)
  )
}
# Pair two equal-length vectors into a two-column tibble.
#
# Args:
#   x: Values stored in column `n`.
#   y: Values stored in column `m`.
#
# Returns:
#   A tibble with columns `n` and `m`.
prop_table <- function(x, y) {
  tibble(n = x, m = y)
}
Proportion of each city as part of the dataset:
# Proportion of the dataset attributed to each city, as computed by
# prop_calc(). The table is built twice: directly as a tibble, and through
# the prop_table() helper, whose generic column names are then replaced.
City_Proportions <- tibble(
  City = unique(GlobalTemp$City),
  Proportion = prop_calc(GlobalTemp$City)
)
City_Props <- setNames(
  prop_table(unique(GlobalTemp$City), prop_calc(GlobalTemp$City)),
  c("City", "Proportion")
)
City_Props
## # A tibble: 100 x 2
## City Proportion
## <chr> <dbl>
## 1 Abidjan 0.00827
## 2 Addis Abeba 0.00822
## 3 Ahmadabad 0.0109
## 4 Aleppo 0.0112
## 5 Alexandria 0.0112
## 6 Ankara 0.0130
## 7 Baghdad 0.00977
## 8 Bangalore 0.0109
## 9 Bangkok 0.00991
## 10 Belo Horizonte 0.00912
## # ... with 90 more rows
Proportion of countries used in the dataset:
# Proportion of the dataset attributed to each country, as computed by
# prop_calc(), printed from the largest proportion to the smallest.
Country_Proportions <- tibble(
  Country = unique(GlobalTemp$Country),
  Proportion = prop_calc(GlobalTemp$Country)
)
arrange(Country_Proportions, desc(Proportion))
## # A tibble: 49 x 2
## Country Proportion
## <chr> <dbl>
## 1 India 0.153
## 2 China 0.148
## 3 Brazil 0.0541
## 4 Turkey 0.0396
## 5 United States 0.0354
## 6 Egypt 0.0317
## 7 Pakistan 0.0308
## 8 Canada 0.0271
## 9 Russia 0.0271
## 10 Nigeria 0.0241
## # ... with 39 more rows
There are 100 unique cities and 49 countries in the dataset.
# Number of distinct cities.
GlobalTemp %>%
  distinct(City) %>%
  count()
## # A tibble: 1 x 1
## n
## <int>
## 1 100
# Number of distinct countries.
GlobalTemp %>%
  distinct(Country) %>%
  count()
## # A tibble: 1 x 1
## n
## <int>
## 1 49
Calculating the mean land temperature and the mean temperature uncertainty for each year, in order to plot each by year.
# Yearly aggregates: the mean temperature and the mean measurement
# uncertainty per calendar year, each rounded to two decimals.
GlobalTempYear <- GlobalTemp %>%
  # Keep only the year part of the date by stripping the "-MM-DD" suffix.
  mutate(Year = as.numeric(gsub("-\\d\\d-\\d\\d*", "", dt))) %>%
  select(Year, everything(), -dt) %>%
  group_by(Year) %>%
  summarize(
    MeanLandTemp = round(mean(AverageTemperature, na.rm = TRUE), 2),
    MeanTempError = round(
      mean(AverageTemperatureUncertainty, na.rm = TRUE),
      2
    )
  ) %>%
  # A year whose temperatures are all missing has a NaN mean; drop it with an
  # explicit missing-value test rather than comparing against the string "NaN".
  filter(!is.na(MeanLandTemp))
head(GlobalTempYear)
## # A tibble: 6 x 3
## Year MeanLandTemp MeanTempError
## <dbl> <dbl> <dbl>
## 1 1743 4.02 1.91
## 2 1744 10.3 1.8
## 3 1745 0.38 1.81
## 4 1750 9.15 1.81
## 5 1751 9.44 1.75
## 6 1752 3.37 1.92
# Preview the most recent years of the yearly summary.
tail(GlobalTempYear, n = 6)
## # A tibble: 6 x 3
## Year MeanLandTemp MeanTempError
## <dbl> <dbl> <dbl>
## 1 2008 19.6 0.35
## 2 2009 19.8 0.35
## 3 2010 19.9 0.36
## 4 2011 19.6 0.39
## 5 2012 19.7 0.5
## 6 2013 20.3 0.580
When plotting the average land temperatures, we can see that they have been increasing as the years go by. The measurement error decreases sharply over time, which can plausibly be attributed to improvements in measuring tools.
# Yearly mean temperature, coloured by that year's mean uncertainty.
ggplot(
  data = GlobalTempYear,
  mapping = aes(x = Year, y = MeanLandTemp, color = MeanTempError)
) +
  geom_point()

# Yearly mean uncertainty, coloured by that year's mean temperature.
ggplot(
  data = GlobalTempYear,
  mapping = aes(x = Year, y = MeanTempError, color = MeanLandTemp)
) +
  geom_point()

# Distribution of the individual temperature readings.
ggplot(data = GlobalTemp, mapping = aes(x = AverageTemperature)) +
  geom_histogram(fill = "darkred")

# Distribution of the individual uncertainty values.
ggplot(data = GlobalTemp, mapping = aes(x = AverageTemperatureUncertainty)) +
  geom_histogram(fill = "salmon")
# Measurement table: date, temperature, uncertainty and city.
temps <- select(
  GlobalTemp,
  dt, AverageTemperature, AverageTemperatureUncertainty, City
)
temps
## # A tibble: 239,177 x 4
## dt AverageTemperature AverageTemperatureUncertainty City
## <date> <dbl> <dbl> <chr>
## 1 1849-01-01 26.7 1.44 Abidjan
## 2 1849-02-01 27.4 1.36 Abidjan
## 3 1849-03-01 28.1 1.61 Abidjan
## 4 1849-04-01 26.1 1.39 Abidjan
## 5 1849-05-01 25.4 1.2 Abidjan
## 6 1849-06-01 24.8 1.40 Abidjan
## 7 1849-07-01 24.1 1.25 Abidjan
## 8 1849-08-01 23.6 1.26 Abidjan
## 9 1849-09-01 23.7 1.23 Abidjan
## 10 1849-10-01 25.3 1.18 Abidjan
## # ... with 239,167 more rows
# Location table: date, city, country and coordinates.
location <- select(GlobalTemp, dt, City, Country, Latitude, Longitude)
location
## # A tibble: 239,177 x 5
## dt City Country Latitude Longitude
## <date> <chr> <chr> <chr> <chr>
## 1 1849-01-01 Abidjan Côte D'Ivoire 5.63N 3.23W
## 2 1849-02-01 Abidjan Côte D'Ivoire 5.63N 3.23W
## 3 1849-03-01 Abidjan Côte D'Ivoire 5.63N 3.23W
## 4 1849-04-01 Abidjan Côte D'Ivoire 5.63N 3.23W
## 5 1849-05-01 Abidjan Côte D'Ivoire 5.63N 3.23W
## 6 1849-06-01 Abidjan Côte D'Ivoire 5.63N 3.23W
## 7 1849-07-01 Abidjan Côte D'Ivoire 5.63N 3.23W
## 8 1849-08-01 Abidjan Côte D'Ivoire 5.63N 3.23W
## 9 1849-09-01 Abidjan Côte D'Ivoire 5.63N 3.23W
## 10 1849-10-01 Abidjan Côte D'Ivoire 5.63N 3.23W
## # ... with 239,167 more rows
# Write both tables to CSV, then recombine them by matching on date and city.
write_csv(temps, "temperatures.csv")
write_csv(location, "locations.csv")
temps %>%
  full_join(location, by = c("dt", "City"))
## # A tibble: 239,177 x 7
## dt AverageTemperat~ AverageTemperat~ City Country Latitude Longitude
## <date> <dbl> <dbl> <chr> <chr> <chr> <chr>
## 1 1849-01-01 26.7 1.44 Abid~ Côte D~ 5.63N 3.23W
## 2 1849-02-01 27.4 1.36 Abid~ Côte D~ 5.63N 3.23W
## 3 1849-03-01 28.1 1.61 Abid~ Côte D~ 5.63N 3.23W
## 4 1849-04-01 26.1 1.39 Abid~ Côte D~ 5.63N 3.23W
## 5 1849-05-01 25.4 1.2 Abid~ Côte D~ 5.63N 3.23W
## 6 1849-06-01 24.8 1.40 Abid~ Côte D~ 5.63N 3.23W
## 7 1849-07-01 24.1 1.25 Abid~ Côte D~ 5.63N 3.23W
## 8 1849-08-01 23.6 1.26 Abid~ Côte D~ 5.63N 3.23W
## 9 1849-09-01 23.7 1.23 Abid~ Côte D~ 5.63N 3.23W
## 10 1849-10-01 25.3 1.18 Abid~ Côte D~ 5.63N 3.23W
## # ... with 239,167 more rows
The dataset that I will be using is the ToothGrowth dataset from R.
# Preview the first rows of ToothGrowth.
head(ToothGrowth, n = 6)
## len supp dose
## 1 4.2 VC 0.5
## 2 11.5 VC 0.5
## 3 7.3 VC 0.5
## 4 5.8 VC 0.5
## 5 6.4 VC 0.5
## 6 10.0 VC 0.5
# Preview the last rows of ToothGrowth.
tail(ToothGrowth, n = 6)
## len supp dose
## 55 24.8 OJ 2
## 56 30.9 OJ 2
## 57 26.4 OJ 2
## 58 27.3 OJ 2
## 59 29.4 OJ 2
## 60 23.0 OJ 2
# Tooth length against dose, one panel per supplement type, with a smoothed
# trend drawn over the points.
ggplot(data = ToothGrowth, mapping = aes(x = dose, y = len, color = supp)) +
  geom_point() +
  geom_smooth() +
  facet_grid(supp ~ .) +
  labs(
    title = "Tooth Growth on Supplement Type",
    x = "Dose (mg)",
    y = "Tooth Length"
  )
# Linear model of tooth length on dose, restricted to the VC supplement rows.
vc <- filter(ToothGrowth, supp == "VC")
vcg <- lm(len ~ dose, data = vc)
summary(vcg)
##
## Call:
## lm(formula = len ~ dose, data = vc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.2264 -2.6029 0.0814 2.2288 7.4893
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.295 1.427 2.309 0.0285 *
## dose 11.716 1.079 10.860 1.51e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.685 on 28 degrees of freedom
## Multiple R-squared: 0.8082, Adjusted R-squared: 0.8013
## F-statistic: 117.9 on 1 and 28 DF, p-value: 1.509e-11
The linear regression model for the supplement type VC is:
Length = 3.295 + 11.716 * dose
# Linear model of tooth length on dose, restricted to the OJ supplement rows.
OJ <- filter(ToothGrowth, supp == "OJ")
OJG <- lm(len ~ dose, data = OJ)
summary(OJG)
##
## Call:
## lm(formula = len ~ dose, data = OJ)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.2557 -3.7979 -0.0643 3.3521 7.9386
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.550 1.722 6.708 2.79e-07 ***
## dose 7.811 1.302 6.001 1.82e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.446 on 28 degrees of freedom
## Multiple R-squared: 0.5626, Adjusted R-squared: 0.547
## F-statistic: 36.01 on 1 and 28 DF, p-value: 1.825e-06
The linear regression model for the supplement type OJ is:
Length = 11.55 + 7.811 * dose