Load dataset.
Print the structure of your dataset.
cityweather = cityweather2012_10_01_1300
str(cityweather)
## tibble [36 × 10] (S3: tbl_df/tbl/data.frame)
## $ City : chr [1:36] "Vancouver" "Portland" "San Francisco" "Seattle" ...
## $ Country : chr [1:36] "Canada" "United States" "United States" "United States" ...
## $ Latitude : num [1:36] 49.2 45.5 37.8 47.6 34.1 ...
## $ Longitude : num [1:36] -123 -123 -122 -122 -118 ...
## $ Temperature : num [1:36] 285 282 289 282 292 ...
## $ Wind Speed : num [1:36] 0 0 2 0 0 0 0 2 4 4 ...
## $ Wind Direction : num [1:36] 0 0 150 0 0 0 0 10 360 20 ...
## $ Pressure : num [1:36] NA 1024 1009 1027 1013 ...
## $ Humidity : num [1:36] 76 81 88 81 88 82 22 23 50 62 ...
## $ Weather Description: chr [1:36] "mist" "scattered clouds" "light rain" "sky is clear" ...
List the variables in your dataset.
ls(cityweather)
## [1] "City" "Country" "Humidity"
## [4] "Latitude" "Longitude" "Pressure"
## [7] "Temperature" "Weather Description" "Wind Direction"
## [10] "Wind Speed"
Print the top 15 rows of your dataset.
head(cityweather, 15)
## # A tibble: 15 × 10
## City Country Latitude Longitude Temperature `Wind Speed` `Wind Direction`
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Vancouv… Canada 49.2 -123. 285. 0 0
## 2 Portland United… 45.5 -123. 282. 0 0
## 3 San Fra… United… 37.8 -122. 289. 2 150
## 4 Seattle United… 47.6 -122. 282. 0 0
## 5 Los Ang… United… 34.1 -118. 292. 0 0
## 6 San Die… United… 32.7 -117. 292. 0 0
## 7 Las Veg… United… 36.2 -115. 293. 0 0
## 8 Phoenix United… 33.4 -112. 297. 2 10
## 9 Albuque… United… 35.1 -107. 285. 4 360
## 10 Denver United… 39.7 -105. 285. 4 20
## 11 San Ant… United… 29.4 -98.5 289. 0 0
## 12 Dallas United… 32.8 -96.8 290. 3 340
## 13 Houston United… 29.8 -95.4 288. 1 270
## 14 Kansas … United… 39.1 -94.6 290. 0 0
## 15 Minneap… United… 45.0 -93.3 287. 3 330
## # ℹ 3 more variables: Pressure <dbl>, Humidity <dbl>,
## # `Weather Description` <chr>
Write a user defined function using any of the variables from the dataset.
# Pull the two columns of interest out of the dataset as plain vectors.
city <- cityweather$City
temp <- cityweather$Temperature
# Build the data frame directly with data.frame(). cbind() on a character and a
# numeric vector first creates a character matrix, which turns temp into text
# that then has to be parsed back to numeric; data.frame() keeps each column's
# own type, so no conversion step is needed.
citytemp <- data.frame(city = city, temp = temp)
# Create a user defined function to determine temperature in Celsius.
# Current temperature values are given in Kelvins.

#' Convert temperatures from Kelvin to degrees Celsius.
#'
#' @param temp Numeric vector of temperatures in Kelvin.
#' @return Numeric vector of the same length as `temp`, in degrees Celsius.
tempC <- function(temp) {
  # C = K - 273.15. The subtraction is vectorised, so a whole column can be
  # converted in one call. The value of the last expression is returned;
  # return() takes a single value, and a name given to it is discarded.
  temp - 273.15
}
#Test the function on the dataset.
tempC(citytemp$temp)
## [1] 11.48 8.93 16.33 8.65 18.72 18.38 20.26 23.45 11.97 11.46 16.14 16.59
## [13] 15.12 16.83 13.72 13.03 10.86 14.26 10.70 20.88 10.88 25.02 15.50 26.57
## [25] 7.85 13.11 12.48 15.07 12.68 14.02 34.44 32.32 37.43 31.25 31.25 30.35
# Append the Celsius values to citytemp as a new column named TempC.
citytemp[["TempC"]] <- tempC(citytemp[["temp"]])
Use data manipulation techniques and filter rows based on any logical criteria that exist in your dataset.
# Create a new data frame containing City, Country, Latitude, Longitude,
# Temperature and Humidity.
# Select the columns by name instead of cbind()-ing them together: cbind() on
# mixed character and numeric vectors builds a character matrix, so every
# numeric column would be converted to text and need converting back.
# Selecting by name keeps the original column names and column types.
weathernew <- as.data.frame(
  cityweather[, c(
    "City", "Country", "Latitude", "Longitude", "Temperature", "Humidity"
  )]
)
#View.
library("knitr")
kable(head(weathernew))
| City | Country | Latitude | Longitude | Temperature | Humidity |
|---|---|---|---|---|---|
| Vancouver | Canada | 49.24966 | -123.1193 | 284.63 | 76 |
| Portland | United States | 45.52345 | -122.6762 | 282.08 | 81 |
| San Francisco | United States | 37.77493 | -122.4194 | 289.48 | 88 |
| Seattle | United States | 47.60621 | -122.3321 | 281.80 | 81 |
| Los Angeles | United States | 34.05223 | -118.2437 | 291.87 | 88 |
| San Diego | United States | 32.71533 | -117.1573 | 291.53 | 82 |
#Run tidyverse package.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Sort data by Latitude, from lowest to highest.
# Columns are referred to by bare name inside dplyr verbs (data masking), so
# the expression is always evaluated against the data being piped in rather
# than against whatever weathernew holds at that moment.
citybylatitude <- weathernew %>% arrange(Latitude)
# Filter data to only show cities located in the United States.
USweather <- weathernew %>% filter(Country == "United States")
#View.
library("knitr")
kable(head(USweather))
| City | Country | Latitude | Longitude | Temperature | Humidity |
|---|---|---|---|---|---|
| Portland | United States | 45.52345 | -122.6762 | 282.08 | 81 |
| San Francisco | United States | 37.77493 | -122.4194 | 289.48 | 88 |
| Seattle | United States | 47.60621 | -122.3321 | 281.80 | 81 |
| Los Angeles | United States | 34.05223 | -118.2437 | 291.87 | 88 |
| San Diego | United States | 32.71533 | -117.1573 | 291.53 | 82 |
| Las Vegas | United States | 36.17497 | -115.1372 | 293.41 | 22 |
Identify the dependent and independent variables and use reshaping techniques and create a new data frame by joining those variables from your dataset.
# Identify the dependent and independent variables.
# Dependent = Temperature. Independent = Latitude.
City <- cityweather$City
Temperature <- cityweather$Temperature
Latitude <- cityweather$Latitude
# Join the variables into a new data frame. data.frame() keeps each column's
# own type, whereas cbind() on mixed vectors would create a character matrix
# and force the numeric columns to be converted to text and back.
weatherbylat <- data.frame(
  City = City,
  Temperature = Temperature,
  Latitude = Latitude
)
#View.
library("knitr")
kable(head(weatherbylat))
| City | Temperature | Latitude |
|---|---|---|
| Vancouver | 284.63 | 49.24966 |
| Portland | 282.08 | 45.52345 |
| San Francisco | 289.48 | 37.77493 |
| Seattle | 281.80 | 47.60621 |
| Los Angeles | 291.87 | 34.05223 |
| San Diego | 291.53 | 32.71533 |
Remove missing values in your data set.
Before.
#Vancouver has a missing value for Pressure.
library("knitr")
kable(head(cityweather))
| City | Country | Latitude | Longitude | Temperature | Wind Speed | Wind Direction | Pressure | Humidity | Weather Description |
|---|---|---|---|---|---|---|---|---|---|
| Vancouver | Canada | 49.24966 | -123.1193 | 284.63 | 0 | 0 | NA | 76 | mist |
| Portland | United States | 45.52345 | -122.6762 | 282.08 | 0 | 0 | 1024 | 81 | scattered clouds |
| San Francisco | United States | 37.77493 | -122.4194 | 289.48 | 2 | 150 | 1009 | 88 | light rain |
| Seattle | United States | 47.60621 | -122.3321 | 281.80 | 0 | 0 | 1027 | 81 | sky is clear |
| Los Angeles | United States | 34.05223 | -118.2437 | 291.87 | 0 | 0 | 1013 | 88 | mist |
| San Diego | United States | 32.71533 | -117.1573 | 291.53 | 0 | 0 | 1013 | 82 | sky is clear |
# Work on a copy so the original dataset is left untouched.
newweather <- cityweather
# Show the rows whose Pressure value is missing.
filter(newweather, is.na(Pressure))
## # A tibble: 1 × 10
## City Country Latitude Longitude Temperature `Wind Speed` `Wind Direction`
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Vancouver Canada 49.2 -123. 285. 0 0
## # ℹ 3 more variables: Pressure <dbl>, Humidity <dbl>,
## # `Weather Description` <chr>
# Keep only the rows that have a Pressure value.
filter(newweather, !is.na(Pressure))
## # A tibble: 35 × 10
## City Country Latitude Longitude Temperature `Wind Speed` `Wind Direction`
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Portland United… 45.5 -123. 282. 0 0
## 2 San Fra… United… 37.8 -122. 289. 2 150
## 3 Seattle United… 47.6 -122. 282. 0 0
## 4 Los Ang… United… 34.1 -118. 292. 0 0
## 5 San Die… United… 32.7 -117. 292. 0 0
## 6 Las Veg… United… 36.2 -115. 293. 0 0
## 7 Phoenix United… 33.4 -112. 297. 2 10
## 8 Albuque… United… 35.1 -107. 285. 4 360
## 9 Denver United… 39.7 -105. 285. 4 20
## 10 San Ant… United… 29.4 -98.5 289. 0 0
## # ℹ 25 more rows
## # ℹ 3 more variables: Pressure <dbl>, Humidity <dbl>,
## # `Weather Description` <chr>
# Store the dataset without the rows that are missing Pressure.
newweatherNA <- filter(newweather, !is.na(Pressure))
#View.
library("knitr")
kable(head(newweatherNA))
| City | Country | Latitude | Longitude | Temperature | Wind Speed | Wind Direction | Pressure | Humidity | Weather Description |
|---|---|---|---|---|---|---|---|---|---|
| Portland | United States | 45.52345 | -122.6762 | 282.08 | 0 | 0 | 1024 | 81 | scattered clouds |
| San Francisco | United States | 37.77493 | -122.4194 | 289.48 | 2 | 150 | 1009 | 88 | light rain |
| Seattle | United States | 47.60621 | -122.3321 | 281.80 | 0 | 0 | 1027 | 81 | sky is clear |
| Los Angeles | United States | 34.05223 | -118.2437 | 291.87 | 0 | 0 | 1013 | 88 | mist |
| San Diego | United States | 32.71533 | -117.1573 | 291.53 | 0 | 0 | 1013 | 82 | sky is clear |
| Las Vegas | United States | 36.17497 | -115.1372 | 293.41 | 0 | 0 | 1018 | 22 | sky is clear |
Identify and remove duplicated data from your dataset.
duplicated(cityweather)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
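#All results return as FALSE, indicating that no duplicates were found in this dataset, so there are no rows to remove. (If there were, cityweather[!duplicated(cityweather), ] would keep only the first occurrence of each row.)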
Reorder multiple rows in descending order.
# Order the cities from the highest Latitude to the lowest.
newcitybylatitude <- arrange(weathernew, desc(Latitude))
#View.
library("knitr")
kable(head(newcitybylatitude))
| City | Country | Latitude | Longitude | Temperature | Humidity |
|---|---|---|---|---|---|
| Vancouver | Canada | 49.24966 | -123.11934 | 284.63 | 76 |
| Seattle | United States | 47.60621 | -122.33207 | 281.80 | 81 |
| Portland | United States | 45.52345 | -122.67621 | 282.08 | 81 |
| Montreal | Canada | 45.50884 | -73.58781 | 285.83 | 93 |
| Minneapolis | United States | 44.97997 | -93.26384 | 286.87 | 67 |
| Toronto | Canada | 43.70011 | -79.41630 | 286.26 | 82 |
Rename some of the columns in your dataset.
# Give the TempC column a more descriptive name.
colnames(citytemp)[colnames(citytemp) == "TempC"] <- "Temperature in C"
#View.
library("knitr")
kable(head(citytemp))
| city | temp | Temperature in C |
|---|---|---|
| Vancouver | 284.63 | 11.48 |
| Portland | 282.08 | 8.93 |
| San Francisco | 289.48 | 16.33 |
| Seattle | 281.80 | 8.65 |
| Los Angeles | 291.87 | 18.72 |
| San Diego | 291.53 | 18.38 |
Add new variables in your data frame using a mathematical function (for e.g. multiply an existing column by 2 and add it as a new variable.)
citytempF = citytemp
# New variable is Temperature in Fahrenheit.
# Convert Kelvin to Fahrenheit using the formula F = (K - 273.15) * 1.8 + 32.

#' Convert temperatures from Kelvin to degrees Fahrenheit.
#'
#' @param temp Numeric vector of temperatures in Kelvin.
#' @return Numeric vector of the same length as `temp`, in degrees Fahrenheit.
tempF <- function(temp) {
  # Shift Kelvin to Celsius, scale by 1.8, then add the 32 degree offset.
  # The value of the last expression is returned; return() takes a single
  # value, and a name given to it is discarded.
  (temp - 273.15) * 1.8 + 32
}
#Test the function on the dataset.
tempF(citytempF$temp)
## [1] 52.664 48.074 61.394 47.570 65.696 65.084 68.468 74.210 53.546 52.628
## [11] 61.052 61.862 59.216 62.294 56.696 55.454 51.548 57.668 51.260 69.584
## [21] 51.584 77.036 59.900 79.826 46.130 55.598 54.464 59.126 54.824 57.236
## [31] 93.992 90.176 99.374 88.250 88.250 86.630
# Compute the Fahrenheit values once and keep them in TempF.
TempF <- tempF(citytempF$temp)
# Add them as a new last column, directly under its descriptive name.
citytempF[["Temperature in F"]] <- TempF
#View.
library("knitr")
kable(head(citytempF))
| city | temp | Temperature in C | Temperature in F |
|---|---|---|---|
| Vancouver | 284.63 | 11.48 | 52.664 |
| Portland | 282.08 | 8.93 | 48.074 |
| San Francisco | 289.48 | 16.33 | 61.394 |
| Seattle | 281.80 | 8.65 | 47.570 |
| Los Angeles | 291.87 | 18.72 | 65.696 |
| San Diego | 291.53 | 18.38 | 65.084 |
Create a training set using a random number generator engine.
#Initiate random number generator engine.
#Fixing the seed makes the random draws below reproducible from run to run.
set.seed(1234)
#Extract 5 random rows without replacement.
#NOTE(review): the sample is only printed here, not assigned; store the result in a variable if it is to be used as the training set.
cityweather %>% sample_n(5, replace=FALSE)
## # A tibble: 5 × 10
## City Country Latitude Longitude Temperature `Wind Speed` `Wind Direction`
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 New York United… 40.7 -74.0 288. 7 260
## 2 Saint Lo… United… 38.6 -90.2 286. 4 40
## 3 Jacksonv… United… 30.3 -81.7 298. 3 180
## 4 Albuquer… United… 35.1 -107. 285. 4 360
## 5 Los Ange… United… 34.1 -118. 292. 0 0
## # ℹ 3 more variables: Pressure <dbl>, Humidity <dbl>,
## # `Weather Description` <chr>
#Extract 5% of rows, randomly without replacement.
#With the 36 rows in this dataset, this yields 2 rows (see the output below).
cityweather %>% sample_frac(0.05, replace=FALSE)
## # A tibble: 2 × 10
## City Country Latitude Longitude Temperature `Wind Speed` `Wind Direction`
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Saint Lo… United… 38.6 -90.2 286. 4 40
## 2 Seattle United… 47.6 -122. 282. 0 0
## # ℹ 3 more variables: Pressure <dbl>, Humidity <dbl>,
## # `Weather Description` <chr>
Print the summary statistics of your dataset.
summary(cityweather)
## City Country Latitude Longitude
## Length:36 Length:36 Min. :25.77 Min. :-123.12
## Class :character Class :character 1st Qu.:32.77 1st Qu.:-105.40
## Mode :character Mode :character Median :36.17 Median : -86.47
## Mean :37.07 Mean : -73.54
## 3rd Qu.:41.00 3rd Qu.: -74.87
## Max. :49.25 Max. : 35.22
##
## Temperature Wind Speed Wind Direction Pressure
## Min. :281.0 Min. :0.000 Min. : 0.0 Min. : 984
## 1st Qu.:285.5 1st Qu.:0.000 1st Qu.: 0.0 1st Qu.:1010
## Median :288.5 Median :2.000 Median : 70.0 Median :1012
## Mean :291.1 Mean :2.222 Mean :125.5 Mean :1012
## 3rd Qu.:294.7 3rd Qu.:4.000 3rd Qu.:245.0 3rd Qu.:1014
## Max. :310.6 Max. :8.000 Max. :360.0 Max. :1028
## NA's :1
## Humidity Weather Description
## Min. : 22.00 Length:36
## 1st Qu.: 61.00 Class :character
## Median : 76.00 Mode :character
## Mean : 71.17
## 3rd Qu.: 87.25
## Max. :100.00
##
Use any of the numerical variables from the dataset and perform the following statistical functions: Mean, Median, Mode, Range.
#Using the variable Temperature.
#Calculate mean.
mean(cityweather$Temperature)
## [1] 291.1494
#Determine median.
median(cityweather$Temperature)
## [1] 288.46
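#R has no built-in function for the statistical mode (mode() reports an object's storage mode instead). It can be computed from a frequency table, e.g. names(which.max(table(cityweather$Temperature))).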
#Calculate range.
range(cityweather$Temperature)
## [1] 281.00 310.58
# Width of the range: the maximum Temperature minus the minimum.
range_value <- diff(range(cityweather$Temperature))
print(range_value)
## [1] 29.58
Plot a scatter plot for any 2 variables in your dataset.
library(ggplot2)
# Scatter plot of Temperature (y axis) against Latitude (x axis).
ggplot(cityweather, aes(x = Latitude, y = Temperature)) +
  geom_point(size = 1, color = "darkred", shape = 20)
#As you can see in the scatter plot, the two variables are negatively correlated. As Latitude increases, Temperature decreases.
Plot a bar plot for any 2 variables in your dataset.
# Plot for City and Humidity.
# geom_col() draws each bar's length from the data (x = Humidity). geom_bar()
# counts rows instead, so every city would get a bar of length 1 and Humidity
# would only show up as the fill colour.
ggplot(cityweather, aes(x = Humidity, y = City, fill = Humidity)) +
  geom_col()
# As you can see in the bar plot, longer bars indicate higher humidity. The fill colour shows the same information: darker bars indicate lower humidity and lighter bars indicate higher humidity.
Find the correlation between any 2 variables by applying least square linear regression model.
# Use variables Temperature and Humidity.
# Each selection is kept as a one-column table rather than a bare vector.
Y <- cityweather["Temperature"]
X <- cityweather["Humidity"]
head(X)
## # A tibble: 6 × 1
## Humidity
## <dbl>
## 1 76
## 2 81
## 3 88
## 4 81
## 5 88
## 6 82
head(Y)
## # A tibble: 6 × 1
## Temperature
## <dbl>
## 1 285.
## 2 282.
## 3 289.
## 4 282.
## 5 292.
## 6 292.
# Pearson correlation between Temperature and Humidity, stored under both names.
xycorr <- cor(Y, X, method = "pearson")
THcorr <- xycorr
print(THcorr)
## Humidity
## Temperature -0.5171696
#xycorr value is -0.517. Its magnitude is roughly midway between 0 and 1, indicating a moderate correlation. The minus sign indicates a negative correlation, meaning as Temperature increases, Humidity tends to decrease.
# Repeat the calculation for Temperature and Latitude.
W <- cityweather["Temperature"]
Z <- cityweather["Latitude"]
# Pearson correlation, stored under both names.
xycorr <- cor(W, Z, method = "pearson")
TLcorr <- xycorr
print(TLcorr)
## Latitude
## Temperature -0.7178985
#Value of -0.718 is closer to -1 than to 0, indicating a strong correlation. The negative sign indicates a negative correlation. Therefore, Temperature and Latitude have a strong negative correlation: as Latitude increases, Temperature decreases.
END