#Lab Session 5 – August 16, 2024
#1. Read the contents of “temp.csv” file into a dataframe.
df<- read.csv("C:/Users/user/Downloads/temp.csv")
head(df)
## CDate Temperature Humidity WindSpeed Precipitation SunshineHours City
## 1 01-08-2023 25.5 60 5.5 NA 8 CityA
## 2 02-08-2023 26.1 65 6.2 2.3 7 CityB
## 3 03-08-2023 NA 63 5.8 0.0 6 CityA
## 4 04-08-2023 24.8 NA 6.1 1.2 NA CityC
## 5 05-08-2023 23.9 58 NA 0.0 9 <NA>
## 6 06-08-2023 25.0 61 5.6 NA 8 CityA
#2. How many rows in the dataframe have missing values?
missing <- sum(rowSums(is.na(df))>0)
missing
## [1] 9
#3. Which rows in the dataframe are complete (i.e., have no missing values)?
complete <- df[rowSums(is.na(df)) == 0, ]
complete
## CDate Temperature Humidity WindSpeed Precipitation SunshineHours City
## 2 02-08-2023 26.1 65 6.2 2.3 7 CityB
#4. Display only the rows that have missing values.
missing_rows <- df[!complete.cases(df), ]
missing_rows
## CDate Temperature Humidity WindSpeed Precipitation SunshineHours City
## 1 01-08-2023 25.5 60 5.5 NA 8 CityA
## 3 03-08-2023 NA 63 5.8 0.0 6 CityA
## 4 04-08-2023 24.8 NA 6.1 1.2 NA CityC
## 5 05-08-2023 23.9 58 NA 0.0 9 <NA>
## 6 06-08-2023 25.0 61 5.6 NA 8 CityA
## 7 07-08-2023 NA 64 5.7 0.5 NA CityB
## 8 08-08-2023 26.7 NA NA 0.0 7 CityC
## 9 09-08-2023 24.5 59 5.9 0.3 8 <NA>
## 10 10-08-2023 25.3 62 6.0 NA 6 CityA
#5. Remove rows where the City attribute is missing. How many rows are left?
city <- df[!is.na(df$City), ]
rows_left <- nrow(city)
rows_left
## [1] 8
#6. Find the average WindSpeed after removing rows with missing WindSpeed values.
avg_wind <- mean(df$WindSpeed, na.rm = TRUE)
avg_windavg_wind <- mean(df$WindSpeed, na.rm = TRUE)
avg_wind
## [1] 5.85
#7. Which columns have the most missing values?
most_missing <- colnames(df)[which.max(colSums(is.na(df)))]
most_missing
## [1] "Precipitation"
#8. Calculate the percentage of missing data in each column.
percentage <- colSums(is.na(df))/nrow(df)*100
percentage
## CDate Temperature Humidity WindSpeed Precipitation
## 0 20 20 20 30
## SunshineHours City
## 20 20
#9. Create a new dataframe excluding any rows with more than 2 missing values. How many rows are in this new dataframe?
df_less <- df[rowSums(is.na(df)) <= 2, ]
new_df <- nrow(df_less)
new_df
## [1] 10
#10. What is the range of dates in the dataframe (i.e., the earliest and latest date)?
range_dates <- range(df$CDate, na.rm = TRUE)
range_dates
## [1] "01-08-2023" "10-08-2023"
#11. Extract and display the rows where the Date is a weekend (i.e., Saturday or Sunday).
df$DayofWeek <- weekdays(as.Date(df$CDate))
weekend_rows <- df[df$DayofWeek %in% c("Saturday", "Sunday"), ]
weekend_rows
## CDate Temperature Humidity WindSpeed Precipitation SunshineHours City
## 5 05-08-2023 23.9 58 NA 0 9 <NA>
## 6 06-08-2023 25.0 61 5.6 NA 8 CityA
## DayofWeek
## 5 Saturday
## 6 Sunday
#12. Add a new column to the dataframe that indicates the day of the week for each Date.
df$DayofWeek <- weekdays(as.Date(df$CDate))
head(df)
## CDate Temperature Humidity WindSpeed Precipitation SunshineHours City
## 1 01-08-2023 25.5 60 5.5 NA 8 CityA
## 2 02-08-2023 26.1 65 6.2 2.3 7 CityB
## 3 03-08-2023 NA 63 5.8 0.0 6 CityA
## 4 04-08-2023 24.8 NA 6.1 1.2 NA CityC
## 5 05-08-2023 23.9 58 NA 0.0 9 <NA>
## 6 06-08-2023 25.0 61 5.6 NA 8 CityA
## DayofWeek
## 1 Monday
## 2 Tuesday
## 3 Wednesday
## 4 Friday
## 5 Saturday
## 6 Sunday
#13. Filter the dataframe to show only rows where the date is in the first week of August 2023.
august <- df[as.Date(df$CDate) >= "01-08-2023" & as.Date(df$CDate) <= "07-08-2023", ]
august
## CDate Temperature Humidity WindSpeed Precipitation SunshineHours City
## 1 01-08-2023 25.5 60 5.5 NA 8 CityA
## 2 02-08-2023 26.1 65 6.2 2.3 7 CityB
## 3 03-08-2023 NA 63 5.8 0.0 6 CityA
## 4 04-08-2023 24.8 NA 6.1 1.2 NA CityC
## 5 05-08-2023 23.9 58 NA 0.0 9 <NA>
## 6 06-08-2023 25.0 61 5.6 NA 8 CityA
## 7 07-08-2023 NA 64 5.7 0.5 NA CityB
## DayofWeek
## 1 Monday
## 2 Tuesday
## 3 Wednesday
## 4 Friday
## 5 Saturday
## 6 Sunday
## 7 Monday
#14. Calculate the average Temperature for each day of the week (e.g., average temperature for all Mondays, all Tuesdays, etc.).
df$DayOfWeek <- weekdays(as.Date(df$CDate))
avg_temp <- aggregate(Temperature ~ DayOfWeek, data = df, FUN = function(x) mean(x, na.rm = TRUE))
avg_temp
## DayOfWeek Temperature
## 1 Friday 25.05
## 2 Monday 25.50
## 3 Saturday 23.90
## 4 Sunday 25.00
## 5 Thursday 24.50
## 6 Tuesday 26.10
## 7 Wednesday 26.70
#15. Create a new dataframe containing only the data from the last 5 days of the dataframe.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
last_5days <- df %>% filter(!is.na(CDate)) %>% slice_max(order_by = as.Date(CDate), n = 5)
last_5days
## CDate Temperature Humidity WindSpeed Precipitation SunshineHours City
## 1 10-08-2023 25.3 62 6.0 NA 6 CityA
## 2 09-08-2023 24.5 59 5.9 0.3 8 <NA>
## 3 08-08-2023 26.7 NA NA 0.0 7 CityC
## 4 07-08-2023 NA 64 5.7 0.5 NA CityB
## 5 06-08-2023 25.0 61 5.6 NA 8 CityA
## DayofWeek DayOfWeek
## 1 Friday Friday
## 2 Thursday Thursday
## 3 Wednesday Wednesday
## 4 Monday Monday
## 5 Sunday Sunday
#16. Find the difference in days between the earliest and latest dates in the dataframe.
difference <- as.numeric(difftime(max(as.Date(df$CDate, na.rm = TRUE)),
min(as.Date(df$CDate, na.rm = TRUE)),
units = "days"))
difference
## [1] 3287
#17. Filter the dataframe to show only rows where the Date is in the second half of the month (i.e., after the 15th of the month).
second_half <- df[as.integer(format(as.Date(df$CDate), "%d")) > 15, ]
second_half
## CDate Temperature Humidity WindSpeed Precipitation SunshineHours City
## 1 01-08-2023 25.5 60 5.5 NA 8 CityA
## 2 02-08-2023 26.1 65 6.2 2.3 7 CityB
## 3 03-08-2023 NA 63 5.8 0.0 6 CityA
## 4 04-08-2023 24.8 NA 6.1 1.2 NA CityC
## 5 05-08-2023 23.9 58 NA 0.0 9 <NA>
## 6 06-08-2023 25.0 61 5.6 NA 8 CityA
## 7 07-08-2023 NA 64 5.7 0.5 NA CityB
## 8 08-08-2023 26.7 NA NA 0.0 7 CityC
## 9 09-08-2023 24.5 59 5.9 0.3 8 <NA>
## 10 10-08-2023 25.3 62 6.0 NA 6 CityA
## DayofWeek DayOfWeek
## 1 Monday Monday
## 2 Tuesday Tuesday
## 3 Wednesday Wednesday
## 4 Friday Friday
## 5 Saturday Saturday
## 6 Sunday Sunday
## 7 Monday Monday
## 8 Wednesday Wednesday
## 9 Thursday Thursday
## 10 Friday Friday
#18. Find the average WindSpeed for dates that are Mondays.
avg_speed <- mean(df$WindSpeed[df$DayOfWeek == "Monday"], na.rm = TRUE)
avg_speed
## [1] 5.6