This project investigates the changes in temperature in a few cities over a series of years. Load the file above into R and save it as a dataframe called weather using the read.csv command.
# Load the per-city monthly temperature table.
# The file is read through an explicit path instead of calling setwd(), so the
# session's working directory is not changed as a side effect of loading data.
# NOTE(review): the folder below is specific to one machine; point it at
# wherever the CSV actually lives before re-running.
weather <- read.csv(
  file.path("/Users/ethan/Desktop", "GlobalLandTemperaturesByMajorCity.csv")
)
# Show the first rows to check the columns were parsed as expected.
head(weather)
## dt AverageTemperature AverageTemperatureUncertainty City
## 1 1849-01-01 26.704 1.435 Abidjan
## 2 1849-02-01 27.434 1.362 Abidjan
## 3 1849-03-01 28.101 1.612 Abidjan
## 4 1849-04-01 26.140 1.387 Abidjan
## 5 1849-05-01 25.427 1.200 Abidjan
## 6 1849-06-01 24.844 1.402 Abidjan
## Country Latitude Longitude
## 1 Côte D'Ivoire 5.63N 3.23W
## 2 Côte D'Ivoire 5.63N 3.23W
## 3 Côte D'Ivoire 5.63N 3.23W
## 4 Côte D'Ivoire 5.63N 3.23W
## 5 Côte D'Ivoire 5.63N 3.23W
## 6 Côte D'Ivoire 5.63N 3.23W
# Report the shape of the data frame: column count first, then row count.
print("number of columns:")
## [1] "number of columns:"
print(ncol(weather))
## [1] 7
print("number of rows:")
## [1] "number of rows:"
print(nrow(weather))
## [1] 239177
# One-sentence description of what the table holds.
print("This is a dataframe full of the average temperature data for various cities at various dates, in addition to their latitude and longitude")
## [1] "This is a dataframe full of the average temperature data for various cities at various dates, in addition to their latitude and longitude"
# Distinct city names across the whole table.
citiesVector <- unique(weather[["City"]])
print("there are")
## [1] "there are"
print(length(citiesVector))
## [1] 100
print("different cities.")
## [1] "different cities."
# Distinct cities among the rows whose Country is "United States".
# which() is used so that rows with a missing Country are skipped rather than
# returned as NA entries.
USCitiesVector <- unique(
  weather[["City"]][which(weather[["Country"]] == "United States")]
)
print("the ones in the US are")
## [1] "the ones in the US are"
USCitiesVector
## [1] "Chicago" "Los Angeles" "New York"
# Keep only the United States rows, and only the date, temperature and city
# columns (positions 1, 2 and 4 of the full table).
weather.us <- weather[
  which(weather[["Country"]] == "United States"),
  c("dt", "AverageTemperature", "City")
]
# Peek at both ends of the subset.
head(weather.us)
## dt AverageTemperature City
## 51675 1743-11-01 5.436 Chicago
## 51676 1743-12-01 NA Chicago
## 51677 1744-01-01 NA Chicago
## 51678 1744-02-01 NA Chicago
## 51679 1744-03-01 NA Chicago
## 51680 1744-04-01 8.766 Chicago
tail(weather.us)
## dt AverageTemperature City
## 173003 2013-04-01 9.723 New York
## 173004 2013-05-01 15.544 New York
## 173005 2013-06-01 20.892 New York
## 173006 2013-07-01 24.722 New York
## 173007 2013-08-01 21.001 New York
## 173008 2013-09-01 17.408 New York
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Identity helper: hands its argument back unchanged.
# Supplied as FUN to tapply() so each group's values come back as-is.
uselessFunction <- function(x) {
  x
}
# Hand-computed mean temperature for one city, done step by step.
# Split column 2 (AverageTemperature) into one vector per value of column 3
# (City); the identity FUN just returns each group's values untouched.
splitByCity<- c(tapply(X = weather.us[,2], INDEX = weather.us[,3], FUN = uselessFunction))
# Per-city logical vectors flagging missing temperatures.
# NOTE(review): listOfBools is not referenced again anywhere in this file.
listOfBools<- c(tapply(X = weather.us[,2], INDEX = weather.us[,3], FUN = is.na))
# Element 3 is picked by position; tapply orders groups by sorted city name,
# so this is presumably New York (the message printed below says so) —
# indexing by name would be more robust.
# Total of the non-missing temperatures for that city.
sum(x = splitByCity[[3]], na.rm = TRUE)
## [1] 29703.16
# Number of monthly records for that city, missing ones included.
length(splitByCity[[3]])
## [1] 3239
# Number of records that actually carry a temperature.
length(c(which(is.na(splitByCity[[3]]) == FALSE )))
## [1] 3119
# Mean = total / non-missing count.
# NOTE(review): both numbers are typed in from the rounded results printed
# above, so this line will not follow the data if the input changes.
29703.16/3119
## [1] 9.523296
print("Okay, jeez, I guess new york's average temperature is 9c. my american brain does NOT like that.")
## [1] "Okay, jeez, I guess new york's average temperature is 9c. my american brain does NOT like that."
# Isolate New York and discard the months that have no temperature reading.
weather.newyork <- weather.us[which(weather.us[["City"]] == "New York"), ]
weather.newyork.fixed <- weather.newyork[
  which(!is.na(weather.newyork[["AverageTemperature"]])),
]
# First half of the century: narrow to 1901 onward, then cap at 1950.
weather.newyork.fixed.1900sforward <- weather.newyork.fixed[
  which(year(weather.newyork.fixed[["dt"]]) >= 1901),
]
weather.newyork.fixed.early1900s <- weather.newyork.fixed.1900sforward[
  which(year(weather.newyork.fixed.1900sforward[["dt"]]) <= 1950),
]
# Second half of the century: cap at 2000, then keep everything after 1950.
weather.newyork.fixed.2000backward <- weather.newyork.fixed[
  which(year(weather.newyork.fixed[["dt"]]) <= 2000),
]
weather.newyork.fixed.late1900s <- weather.newyork.fixed.2000backward[
  which(year(weather.newyork.fixed.2000backward[["dt"]]) > 1950),
]
print("I chose not to include the year 1900, for the sake of keeping the samples the same size for the paired test. ")
## [1] "I chose not to include the year 1900, for the sake of keeping the samples the same size for the paired test. "
# Paired one-sided t-test on the monthly temperatures of the two half-centuries.
# Observations are paired by row position, so both samples must have the same
# length and ordering.
# NOTE(review): with x = early and y = late, "greater" asks whether the early
# half was warmer than the late half; confirm that is the intended direction.
t.test(
  x = c(weather.newyork.fixed.early1900s[, 2]),
  y = c(weather.newyork.fixed.late1900s[, 2]),
  alternative = "greater",
  paired = TRUE,
  conf.level = 0.95
)
##
## Paired t-test
##
## data: c(weather.newyork.fixed.early1900s[, 2]) and c(weather.newyork.fixed.late1900s[, 2])
## t = -3.9622, df = 599, p-value = 1
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## -0.5604685 Inf
## sample estimates:
## mean of the differences
## -0.3958717
We’d like to get visualizations of what is happening to the temperatures in the cities over time. If you just quickly do a plot, you realize there are simply too many datapoints to get good pictures. For instance, execute the following code to get a quick timeseries plot of temperature in NYC!
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.3
# Raw scatter of every monthly New York reading against its date string.
ggplot(
  data = weather.us[which(weather.us$City == "New York"), ],
  mapping = aes(x = dt, y = AverageTemperature)
) +
  geom_point()
## Warning: Removed 120 rows containing missing values (geom_point).
So our goal is to clean this up, using subsetting and data summaries to provide a better idea of what may be happening.
# Tag every row with its calendar year and the decade that year falls in.
weather.us["Year"] <- year(weather.us[, "dt"])
# Integer-divide the year by 10 and scale back up to floor it to its decade
# (e.g. 1743 -> 1740). This is vectorised over the whole column and keeps
# Decade numeric, so it sorts and plots as a number rather than as text.
weather.us["Decade"] <- (weather.us[["Year"]] %/% 10) * 10
# Check both ends of the table to confirm the new columns look right.
head(weather.us)
## dt AverageTemperature City Year Decade
## 51675 1743-11-01 5.436 Chicago 1743 1740
## 51676 1743-12-01 NA Chicago 1743 1740
## 51677 1744-01-01 NA Chicago 1744 1740
## 51678 1744-02-01 NA Chicago 1744 1740
## 51679 1744-03-01 NA Chicago 1744 1740
## 51680 1744-04-01 8.766 Chicago 1744 1740
tail(weather.us)
## dt AverageTemperature City Year Decade
## 173003 2013-04-01 9.723 New York 2013 2010
## 173004 2013-05-01 15.544 New York 2013 2010
## 173005 2013-06-01 20.892 New York 2013 2010
## 173006 2013-07-01 24.722 New York 2013 2010
## 173007 2013-08-01 21.001 New York 2013 2010
## 173008 2013-09-01 17.408 New York 2013 2010
#weather.us[,"Decade"]
#tapply(X = weather.us[,"AverageTemperature"], INDEX = weather.us[,"Decade"], FUN = mean)
# New York rows (now carrying Year and Decade), minus months with no reading.
weather.newyork <- weather.us[which(weather.us[["City"]] == "New York"), ]
weather.newyork.fixed <- weather.newyork[
  which(!is.na(weather.newyork[["AverageTemperature"]])),
]
head(weather.newyork.fixed)
## dt AverageTemperature City Year Decade
## 169770 1743-11-01 3.264 New York 1743 1740
## 169775 1744-04-01 9.788 New York 1744 1740
## 169776 1744-05-01 15.708 New York 1744 1740
## 169777 1744-06-01 21.210 New York 1744 1740
## 169778 1744-07-01 22.207 New York 1744 1740
## 169780 1744-09-01 14.922 New York 1744 1740
# Mean temperature per decade. The means are deliberately kept as a list
# holding one character vector (toString + strsplit), because the code that
# assembles the plotting table reads element [[1]] and converts it back to
# numbers.
newyorkbydecade <- strsplit(
  toString(
    tapply(
      X = weather.newyork.fixed[["AverageTemperature"]],
      INDEX = weather.newyork.fixed[["Decade"]],
      FUN = mean
    )
  ),
  ", "
)
# Decade labels in order of first appearance. tapply() orders its groups by
# sorted label, so the two line up only while the rows are chronological.
newyorkdecades <- unique(weather.newyork.fixed[["Decade"]])
# Sanity check: as many means as decade labels.
length(newyorkbydecade[[1]])
## [1] 28
length(newyorkdecades)
## [1] 28
# Chicago rows, minus months with no temperature reading.
weather.chicago <- weather.us[which(weather.us[["City"]] == "Chicago"), ]
weather.chicago.fixed <- weather.chicago[
  which(!is.na(weather.chicago[["AverageTemperature"]])),
]
head(weather.chicago.fixed)
## dt AverageTemperature City Year Decade
## 51675 1743-11-01 5.436 Chicago 1743 1740
## 51680 1744-04-01 8.766 Chicago 1744 1740
## 51681 1744-05-01 11.605 Chicago 1744 1740
## 51682 1744-06-01 17.965 Chicago 1744 1740
## 51683 1744-07-01 21.680 Chicago 1744 1740
## 51685 1744-09-01 17.030 Chicago 1744 1740
# Mean temperature per decade, kept as a list holding one character vector
# (toString + strsplit) because the code that assembles the plotting table
# reads element [[1]] and converts it back to numbers.
chicagobydecade <- strsplit(
  toString(
    tapply(
      X = weather.chicago.fixed[["AverageTemperature"]],
      INDEX = weather.chicago.fixed[["Decade"]],
      FUN = mean
    )
  ),
  ", "
)
# Decade labels in order of first appearance. tapply() orders its groups by
# sorted label, so the two line up only while the rows are chronological.
chicagodecades <- unique(weather.chicago.fixed[["Decade"]])
# Sanity check: as many means as decade labels.
length(chicagobydecade[[1]])
## [1] 28
length(chicagodecades)
## [1] 28
# Los Angeles rows, minus months with no temperature reading.
weather.losangeles <- weather.us[which(weather.us[["City"]] == "Los Angeles"), ]
weather.losangeles.fixed <- weather.losangeles[
  which(!is.na(weather.losangeles[["AverageTemperature"]])),
]
head(weather.losangeles.fixed)
## dt AverageTemperature City Year Decade
## 131847 1849-01-01 8.819 Los Angeles 1849 1840
## 131848 1849-02-01 9.577 Los Angeles 1849 1840
## 131849 1849-03-01 11.814 Los Angeles 1849 1840
## 131850 1849-04-01 13.704 Los Angeles 1849 1840
## 131851 1849-05-01 14.834 Los Angeles 1849 1840
## 131852 1849-06-01 21.173 Los Angeles 1849 1840
# Mean temperature per decade, kept as a list holding one character vector
# (toString + strsplit) because the code that assembles the plotting table
# reads element [[1]] and converts it back to numbers.
losangelesbydecade <- strsplit(
  toString(
    tapply(
      X = weather.losangeles.fixed[["AverageTemperature"]],
      INDEX = weather.losangeles.fixed[["Decade"]],
      FUN = mean
    )
  ),
  ", "
)
# Decade labels in order of first appearance. tapply() orders its groups by
# sorted label, so the two line up only while the rows are chronological.
losangelesdecades <- unique(weather.losangeles.fixed[["Decade"]])
# Sanity check: as many means as decade labels.
length(losangelesbydecade[[1]])
## [1] 18
length(losangelesdecades)
## [1] 18
# Stack the three per-city decade summaries into one long table for plotting:
# New York first, then Chicago, then Los Angeles.
# Row counts come from the data itself rather than fixed index ranges, so the
# table stays correct if a city gains or loses a decade.

# Decade labels. as.numeric() makes the column numeric whether the labels
# arrive as text or as numbers, so ggplot treats Decade as a continuous axis
# and can fit one trend line per city.
decadecolumn <- as.numeric(
  c(newyorkdecades, chicagodecades, losangelesdecades)
)

# Decade means. Each *bydecade object is a list whose first element holds the
# means as text, so they are converted back to numbers here.
averagetemperaturecolumn <- as.numeric(
  c(newyorkbydecade[[1]], chicagobydecade[[1]], losangelesbydecade[[1]])
)

# City label repeated once per decade of that city.
citycolumn <- rep(
  c("New York", "Chicago", "Los Angeles"),
  times = c(
    length(newyorkdecades),
    length(chicagodecades),
    length(losangelesdecades)
  )
)

# data.frame() stops with an error if the three columns differ in length,
# which guards against the means and decade labels getting out of step.
weather.us.plottable <- data.frame(
  Decade = decadecolumn,
  AverageTemperature = averagetemperaturecolumn,
  City = citycolumn,
  stringsAsFactors = FALSE
)
# Decade-mean temperature per city, one colour per city.
prettypicture <- ggplot(
  data = weather.us.plottable,
  mapping = aes(x = Decade, y = AverageTemperature, color = City)
) +
  geom_point()
prettypicture
# Same scatter with a linear-model smoother layered on top.
prettypicture + geom_smooth(method = "lm", formula = y ~ x)
The visualizations give us some indication of the trend of temperature changes over time. To be more precise, you can actually find the equation of the trend line using the lm command in R (linear model). Here is an example:
# Worked example of lm() using the cars data frame (columns speed and dist).
# Draw the data first so the trend line added below has a plot to land on.
plot(cars)
# Fit dist as a linear function of speed; printing the fit shows the call and
# the estimated intercept and slope.
lm(cars$dist~cars$speed)
##
## Call:
## lm(formula = cars$dist ~ cars$speed)
##
## Coefficients:
## (Intercept) cars$speed
## -17.579 3.932
# The model is fitted a second time here and handed to abline(), which draws
# the fitted line on the existing plot.
abline(lm(cars$dist~cars$speed))#add the trend line in base R
If you look at the output, it shows the intercept (y-intercept) of -17.579, and the slope of 3.932.
Have a great Winter Break!!!!