Read the CSV file from a local folder on my computer.

## Load maxtemp.csv from the local workshop folder.
## The first row supplies the column names and fields are comma-separated.
AirPort_MaxTemp <- read.csv(
  file = "C:\\Users\\Gang Zhang\\Documents\\R\\R Workshop\\Week 2\\maxtemp.csv",
  header = TRUE,
  sep = ","
)

## Show the complete data frame.
print(AirPort_MaxTemp)
##       X time value
## 1  1971 1971  34.6
## 2  1972 1972  39.3
## 3  1973 1973  40.5
## 4  1974 1974  36.8
## 5  1975 1975  39.7
## 6  1976 1976  40.5
## 7  1977 1977  41.5
## 8  1978 1978  38.2
## 9  1979 1979  41.4
## 10 1980 1980  41.5
## 11 1981 1981  41.9
## 12 1982 1982  43.5
## 13 1983 1983  43.2
## 14 1984 1984  35.1
## 15 1985 1985  42.1
## 16 1986 1986  38.1
## 17 1987 1987  39.6
## 18 1988 1988  39.9
## 19 1989 1989  37.8
## 20 1990 1990  39.0
## 21 1991 1991  41.3
## 22 1992 1992  38.7
## 23 1993 1993  37.8
## 24 1994 1994  38.9
## 25 1995 1995  39.7
## 26 1996 1996  36.1
## 27 1997 1997  41.3
## 28 1998 1998  40.6
## 29 1999 1999  37.5
## 30 2000 2000  39.6
## 31 2001 2001  38.8
## 32 2002 2002  36.6
## 33 2003 2003  44.3
## 34 2004 2004  39.7
## 35 2005 2005  43.0
## 36 2006 2006  43.1
## 37 2007 2007  41.5
## 38 2008 2008  40.5
## 39 2009 2009  46.7
## 40 2010 2010  43.9
## 41 2011 2011  39.5
## 42 2012 2012  39.7
## 43 2013 2013  41.5
## 44 2014 2014  43.7
## 45 2015 2015  42.7
## 46 2016 2016  42.9

Task 1: Use the summary function to gain an overview of the data set. Then display the mean and median for at least two attributes.

## Overview of the full airport temperature data set:
## minimum, quartiles, median, mean and maximum of every column.

print(summary(AirPort_MaxTemp))
##        X             time          value      
##  Min.   :1971   Min.   :1971   Min.   :34.60  
##  1st Qu.:1982   1st Qu.:1982   1st Qu.:38.83  
##  Median :1994   Median :1994   Median :40.20  
##  Mean   :1994   Mean   :1994   Mean   :40.30  
##  3rd Qu.:2005   3rd Qu.:2005   3rd Qu.:41.80  
##  Max.   :2016   Max.   :2016   Max.   :46.70
## Mean and median of the time column:

Time_Mean <- mean(AirPort_MaxTemp[["time"]])
print(Time_Mean)
## [1] 1993.5
Time_Median <- median(AirPort_MaxTemp[["time"]])
print(Time_Median)
## [1] 1993.5
## Mean and median of the value column:

Value_Mean <- mean(AirPort_MaxTemp[["value"]])
print(Value_Mean)
## [1] 40.3
Value_Median <- median(AirPort_MaxTemp[["value"]])
print(Value_Median)
## [1] 40.2

Task 2: Create a new data frame with a subset of the columns and rows. Make sure to rename it.

library(dplyr)

## Keep only the rows whose time value lies between 1980 and 2000
## (both bounds included); every column is retained at this point.
AirPort_MaxTemp2 <- subset(
  x = AirPort_MaxTemp,
  subset = dplyr::between(time, left = 1980, right = 2000)
)

print(AirPort_MaxTemp2)
##       X time value
## 10 1980 1980  41.5
## 11 1981 1981  41.9
## 12 1982 1982  43.5
## 13 1983 1983  43.2
## 14 1984 1984  35.1
## 15 1985 1985  42.1
## 16 1986 1986  38.1
## 17 1987 1987  39.6
## 18 1988 1988  39.9
## 19 1989 1989  37.8
## 20 1990 1990  39.0
## 21 1991 1991  41.3
## 22 1992 1992  38.7
## 23 1993 1993  37.8
## 24 1994 1994  38.9
## 25 1995 1995  39.7
## 26 1996 1996  36.1
## 27 1997 1997  41.3
## 28 1998 1998  40.6
## 29 1999 1999  37.5
## 30 2000 2000  39.6

Task 3: Create new column names for the new data frame.

## Drop the X column because it duplicates the time column:

AirPort_MaxTemp2[["X"]] <- NULL

## Give the two remaining columns descriptive names:

names(AirPort_MaxTemp2) <- c("Year", "Temperature")

print(AirPort_MaxTemp2)
##    Year Temperature
## 10 1980        41.5
## 11 1981        41.9
## 12 1982        43.5
## 13 1983        43.2
## 14 1984        35.1
## 15 1985        42.1
## 16 1986        38.1
## 17 1987        39.6
## 18 1988        39.9
## 19 1989        37.8
## 20 1990        39.0
## 21 1991        41.3
## 22 1992        38.7
## 23 1993        37.8
## 24 1994        38.9
## 25 1995        39.7
## 26 1996        36.1
## 27 1997        41.3
## 28 1998        40.6
## 29 1999        37.5
## 30 2000        39.6

Task 4: Use the summary function to create an overview of your new data frame. Then print the mean and median for the same two attributes. Please compare.

## Overview of the subset: minimum, quartiles, median, mean and maximum
## of every column.
print(summary(AirPort_MaxTemp2))
##       Year       Temperature   
##  Min.   :1980   Min.   :35.10  
##  1st Qu.:1985   1st Qu.:38.10  
##  Median :1990   Median :39.60  
##  Mean   :1990   Mean   :39.68  
##  3rd Qu.:1995   3rd Qu.:41.30  
##  Max.   :2000   Max.   :43.50
## Mean and median of the Year column:

Year_Mean <- mean(AirPort_MaxTemp2[["Year"]])
print(Year_Mean)
## [1] 1990
Year_Median <- median(AirPort_MaxTemp2[["Year"]])
print(Year_Median)
## [1] 1990
## Mean and median of the Temperature column:

Temp_Mean <- mean(AirPort_MaxTemp2[["Temperature"]])
print(Temp_Mean)
## [1] 39.67619
Temp_Median <- median(AirPort_MaxTemp2[["Temperature"]])
print(Temp_Median)
## [1] 39.6
## Compare by a table.
## Row 1 holds the statistics of the full data set and row 2 those of the
## subset; the two temperature means are rounded to one decimal place.

comparison_table <- matrix(
  c(
    Time_Mean, Time_Median, round(Value_Mean, digits = 1), Value_Median,
    Year_Mean, Year_Median, round(Temp_Mean, digits = 1), Temp_Median
  ),
  nrow = 2,
  ncol = 4,
  byrow = TRUE,
  dimnames = list(
    c("MaxTemp", "MaxTemp_Subset"),
    c("Year_Mean", "Year_Median", "Temp_Mean", "Temp_Median")
  )
)

print(comparison_table)
##                Year_Mean Year_Median Temp_Mean Temp_Median
## MaxTemp           1993.5      1993.5      40.3        40.2
## MaxTemp_Subset    1990.0      1990.0      39.7        39.6
## Compare by bar plots.

## 2 x 2 matrix of the year statistics: one row per data set
## (full data, subset), one column per statistic (mean, median).
year_table <- matrix(
  c(Time_Mean, Time_Median, Year_Mean, Year_Median),
  nrow = 2,
  ncol = 2,
  byrow = TRUE,
  dimnames = list(c("MaxTemp", "MaxTemp_Subset"), c("Mean", "Median"))
)

## Year Barplot
## - The y-axis starts at 1985 rather than 1990: the subset mean and median
##   are both 1990, so a lower limit of 1990 gave those bars zero height.
## - xpd = FALSE clips the bars to the plot region; with the default
##   (xpd = TRUE) bars on a truncated axis are drawn down into the margin.
## - The legend is shifted outside the plot region by the negative inset,
##   so it gets its own xpd = TRUE to stay unclipped.
barplot(
  year_table,
  beside = TRUE,
  legend.text = TRUE,
  args.legend = list(
    x = "topright", bty = "n", inset = c(-0.08, 0), cex = 1.0, xpd = TRUE
  ),
  ylim = c(1985, 1995),
  xpd = FALSE,
  las = 2,
  ylab = "Year",
  cex.names = 1.4,
  cex.axis = 1.0,
  cex.lab = 1.2,
  main = "Year Comparison",
  col = c("blue", "red")
)

## 2 x 2 matrix of the temperature statistics: one row per data set
## (full data, subset), one column per statistic (mean, median).
## Both means are rounded to one decimal place.
temp_table <- matrix(
  c(
    round(Value_Mean, digits = 1), Value_Median,
    round(Temp_Mean, digits = 1), Temp_Median
  ),
  nrow = 2,
  ncol = 2,
  byrow = TRUE,
  dimnames = list(c("MaxTemp", "MaxTemp_Subset"), c("Mean", "Median"))
)

## Temperature Barplot
## - xpd = FALSE clips the bars to the plot region; with the default
##   (xpd = TRUE) bars on a truncated axis are drawn down into the margin.
## - The legend is shifted outside the plot region by the negative inset,
##   so it gets its own xpd = TRUE to stay unclipped.
## - las must be one of 0, 1, 2 or 3; the previous value 0.5 is replaced
##   by 0 (labels parallel to the axis).
## - NOTE(review): the axis label used to read "°F". The values match the
##   fpp2 `maxtemp` series, which is recorded in degrees Celsius - confirm.
barplot(
  temp_table,
  beside = TRUE,
  legend.text = TRUE,
  args.legend = list(
    x = "topright", bty = "n", inset = c(-0.08, 0), cex = 1.0, xpd = TRUE
  ),
  ylim = c(38, 42),
  xpd = FALSE,
  las = 0,
  ylab = "°C",
  cex.names = 1.4,
  cex.axis = 1.0,
  cex.lab = 1.2,
  main = "Temperature Comparison",
  col = c("blue", "red")
)

Task 5: For at least 3 distinct values in a column, rename them so that every occurrence of each value in that column is replaced. For example, suppose one column contains the letter “e” 20 times; rename those values so that all 20 show as “excellent”.

## Replace every "e" in the name of the second column with "X".
## Only the column header changes; the cell values stay as they are.
colnames(AirPort_MaxTemp2)[2] <- gsub(
  pattern = "e",
  replacement = "X",
  x = colnames(AirPort_MaxTemp2)[2]
)

print(AirPort_MaxTemp2)
##    Year TXmpXraturX
## 10 1980        41.5
## 11 1981        41.9
## 12 1982        43.5
## 13 1983        43.2
## 14 1984        35.1
## 15 1985        42.1
## 16 1986        38.1
## 17 1987        39.6
## 18 1988        39.9
## 19 1989        37.8
## 20 1990        39.0
## 21 1991        41.3
## 22 1992        38.7
## 23 1993        37.8
## 24 1994        38.9
## 25 1995        39.7
## 26 1996        36.1
## 27 1997        41.3
## 28 1998        40.6
## 29 1999        37.5
## 30 2000        39.6

Task 6: Display enough rows to see examples of all of steps 1-5 above.

## I can think of two ways to do this.
## First way: keep the rows whose Year lies from 1985 to 1995 inclusive.

examples_step1to5 <- subset(AirPort_MaxTemp2, Year >= 1985 & Year <= 1995)

print(examples_step1to5)
##    Year TXmpXraturX
## 15 1985        42.1
## 16 1986        38.1
## 17 1987        39.6
## 18 1988        39.9
## 19 1989        37.8
## 20 1990        39.0
## 21 1991        41.3
## 22 1992        38.7
## 23 1993        37.8
## 24 1994        38.9
## 25 1995        39.7
## Or, alternatively, take the first 15 rows
## (all rows if the data frame has fewer than 15):

examples_step1to5_2 <- AirPort_MaxTemp2[
  seq_len(min(15L, nrow(AirPort_MaxTemp2))), ,
  drop = FALSE
]

print(examples_step1to5_2)
##    Year TXmpXraturX
## 10 1980        41.5
## 11 1981        41.9
## 12 1982        43.5
## 13 1983        43.2
## 14 1984        35.1
## 15 1985        42.1
## 16 1986        38.1
## 17 1987        39.6
## 18 1988        39.9
## 19 1989        37.8
## 20 1990        39.0
## 21 1991        41.3
## 22 1992        38.7
## 23 1993        37.8
## 24 1994        38.9

Task 7: BONUS – place the original .csv file in a GitHub repository and have R read it from the link. This will be a very useful skill as you progress in your data science education and career.

library("rio")

## Read maxtemp.csv directly from its raw GitHub URL with rio's import().
AirPort_MaxTemp3 <- rio::import(
  file = "https://raw.githubusercontent.com/SieSiongWong/CUNY-SPS-R-Workshop/master/maxtemp.csv"
)

## Show the complete imported data frame.
print(AirPort_MaxTemp3)
##      V1 time value
## 1  1971 1971  34.6
## 2  1972 1972  39.3
## 3  1973 1973  40.5
## 4  1974 1974  36.8
## 5  1975 1975  39.7
## 6  1976 1976  40.5
## 7  1977 1977  41.5
## 8  1978 1978  38.2
## 9  1979 1979  41.4
## 10 1980 1980  41.5
## 11 1981 1981  41.9
## 12 1982 1982  43.5
## 13 1983 1983  43.2
## 14 1984 1984  35.1
## 15 1985 1985  42.1
## 16 1986 1986  38.1
## 17 1987 1987  39.6
## 18 1988 1988  39.9
## 19 1989 1989  37.8
## 20 1990 1990  39.0
## 21 1991 1991  41.3
## 22 1992 1992  38.7
## 23 1993 1993  37.8
## 24 1994 1994  38.9
## 25 1995 1995  39.7
## 26 1996 1996  36.1
## 27 1997 1997  41.3
## 28 1998 1998  40.6
## 29 1999 1999  37.5
## 30 2000 2000  39.6
## 31 2001 2001  38.8
## 32 2002 2002  36.6
## 33 2003 2003  44.3
## 34 2004 2004  39.7
## 35 2005 2005  43.0
## 36 2006 2006  43.1
## 37 2007 2007  41.5
## 38 2008 2008  40.5
## 39 2009 2009  46.7
## 40 2010 2010  43.9
## 41 2011 2011  39.5
## 42 2012 2012  39.7
## 43 2013 2013  41.5
## 44 2014 2014  43.7
## 45 2015 2015  42.7
## 46 2016 2016  42.9