#Lab Session 5 – August 16, 2024

#1. Read the contents of “temp.csv” file into a dataframe.

df<- read.csv("C:/Users/user/Downloads/temp.csv")
head(df)
##        CDate Temperature Humidity WindSpeed Precipitation SunshineHours  City
## 1 01-08-2023        25.5       60       5.5            NA             8 CityA
## 2 02-08-2023        26.1       65       6.2           2.3             7 CityB
## 3 03-08-2023          NA       63       5.8           0.0             6 CityA
## 4 04-08-2023        24.8       NA       6.1           1.2            NA CityC
## 5 05-08-2023        23.9       58        NA           0.0             9  <NA>
## 6 06-08-2023        25.0       61       5.6            NA             8 CityA

#2. How many rows in the dataframe have missing values?

missing <- sum(rowSums(is.na(df))>0)
missing
## [1] 9

#3. Which rows in the dataframe are complete (i.e., have no missing values)?

complete <- df[rowSums(is.na(df)) == 0, ]
complete
##        CDate Temperature Humidity WindSpeed Precipitation SunshineHours  City
## 2 02-08-2023        26.1       65       6.2           2.3             7 CityB

#4. Display only the rows that have missing values.

missing_rows <- df[!complete.cases(df), ]
missing_rows
##         CDate Temperature Humidity WindSpeed Precipitation SunshineHours  City
## 1  01-08-2023        25.5       60       5.5            NA             8 CityA
## 3  03-08-2023          NA       63       5.8           0.0             6 CityA
## 4  04-08-2023        24.8       NA       6.1           1.2            NA CityC
## 5  05-08-2023        23.9       58        NA           0.0             9  <NA>
## 6  06-08-2023        25.0       61       5.6            NA             8 CityA
## 7  07-08-2023          NA       64       5.7           0.5            NA CityB
## 8  08-08-2023        26.7       NA        NA           0.0             7 CityC
## 9  09-08-2023        24.5       59       5.9           0.3             8  <NA>
## 10 10-08-2023        25.3       62       6.0            NA             6 CityA

#5. Remove rows where the City attribute is missing. How many rows are left?

city <- df[!is.na(df$City), ]
rows_left <- nrow(city)
rows_left
## [1] 8

#6. Find the average WindSpeed after removing rows with missing WindSpeed values.

avg_wind <- mean(df$WindSpeed, na.rm = TRUE)
avg_windavg_wind <- mean(df$WindSpeed, na.rm = TRUE)
avg_wind
## [1] 5.85

#7. Which columns have the most missing values?

most_missing <- colnames(df)[which.max(colSums(is.na(df)))]
most_missing
## [1] "Precipitation"

#8. Calculate the percentage of missing data in each column.

percentage <- colSums(is.na(df))/nrow(df)*100
percentage
##         CDate   Temperature      Humidity     WindSpeed Precipitation 
##             0            20            20            20            30 
## SunshineHours          City 
##            20            20

#9. Create a new dataframe excluding any rows with more than 2 missing values. How many rows are in this new dataframe?

df_less <- df[rowSums(is.na(df)) <= 2, ]
new_df <- nrow(df_less)
new_df
## [1] 10

#10. What is the range of dates in the dataframe (i.e., the earliest and latest date)?

range_dates <- range(df$CDate, na.rm = TRUE)
range_dates
## [1] "01-08-2023" "10-08-2023"

#11. Extract and display the rows where the Date is a weekend (i.e., Saturday or Sunday).

df$DayofWeek <- weekdays(as.Date(df$CDate))
weekend_rows <- df[df$DayofWeek %in% c("Saturday", "Sunday"), ]
weekend_rows
##        CDate Temperature Humidity WindSpeed Precipitation SunshineHours  City
## 5 05-08-2023        23.9       58        NA             0             9  <NA>
## 6 06-08-2023        25.0       61       5.6            NA             8 CityA
##   DayofWeek
## 5  Saturday
## 6    Sunday

#12. Add a new column to the dataframe that indicates the day of the week for each Date.

df$DayofWeek <- weekdays(as.Date(df$CDate))
head(df)
##        CDate Temperature Humidity WindSpeed Precipitation SunshineHours  City
## 1 01-08-2023        25.5       60       5.5            NA             8 CityA
## 2 02-08-2023        26.1       65       6.2           2.3             7 CityB
## 3 03-08-2023          NA       63       5.8           0.0             6 CityA
## 4 04-08-2023        24.8       NA       6.1           1.2            NA CityC
## 5 05-08-2023        23.9       58        NA           0.0             9  <NA>
## 6 06-08-2023        25.0       61       5.6            NA             8 CityA
##   DayofWeek
## 1    Monday
## 2   Tuesday
## 3 Wednesday
## 4    Friday
## 5  Saturday
## 6    Sunday

#13. Filter the dataframe to show only rows where the date is in the first week of August 2023.

august <- df[as.Date(df$CDate) >= "01-08-2023" & as.Date(df$CDate) <= "07-08-2023", ]
august
##        CDate Temperature Humidity WindSpeed Precipitation SunshineHours  City
## 1 01-08-2023        25.5       60       5.5            NA             8 CityA
## 2 02-08-2023        26.1       65       6.2           2.3             7 CityB
## 3 03-08-2023          NA       63       5.8           0.0             6 CityA
## 4 04-08-2023        24.8       NA       6.1           1.2            NA CityC
## 5 05-08-2023        23.9       58        NA           0.0             9  <NA>
## 6 06-08-2023        25.0       61       5.6            NA             8 CityA
## 7 07-08-2023          NA       64       5.7           0.5            NA CityB
##   DayofWeek
## 1    Monday
## 2   Tuesday
## 3 Wednesday
## 4    Friday
## 5  Saturday
## 6    Sunday
## 7    Monday

#14. Calculate the average Temperature for each day of the week (e.g., average temperature for all Mondays, all Tuesdays, etc.).

df$DayOfWeek <- weekdays(as.Date(df$CDate))
avg_temp <- aggregate(Temperature ~ DayOfWeek, data = df, FUN = function(x) mean(x, na.rm = TRUE))
avg_temp
##   DayOfWeek Temperature
## 1    Friday       25.05
## 2    Monday       25.50
## 3  Saturday       23.90
## 4    Sunday       25.00
## 5  Thursday       24.50
## 6   Tuesday       26.10
## 7 Wednesday       26.70

#15. Create a new dataframe containing only the data from the last 5 days of the dataframe.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
last_5days <- df %>% filter(!is.na(CDate)) %>% slice_max(order_by = as.Date(CDate), n = 5)
last_5days
##        CDate Temperature Humidity WindSpeed Precipitation SunshineHours  City
## 1 10-08-2023        25.3       62       6.0            NA             6 CityA
## 2 09-08-2023        24.5       59       5.9           0.3             8  <NA>
## 3 08-08-2023        26.7       NA        NA           0.0             7 CityC
## 4 07-08-2023          NA       64       5.7           0.5            NA CityB
## 5 06-08-2023        25.0       61       5.6            NA             8 CityA
##   DayofWeek DayOfWeek
## 1    Friday    Friday
## 2  Thursday  Thursday
## 3 Wednesday Wednesday
## 4    Monday    Monday
## 5    Sunday    Sunday

#16. Find the difference in days between the earliest and latest dates in the dataframe.

difference <- as.numeric(difftime(max(as.Date(df$CDate, na.rm = TRUE)), 
                                 min(as.Date(df$CDate, na.rm = TRUE)), 
                                 units = "days"))
difference
## [1] 3287

#17. Filter the dataframe to show only rows where the Date is in the second half of the month (i.e., after the 15th of the month).

second_half <- df[as.integer(format(as.Date(df$CDate), "%d")) > 15, ]
second_half
##         CDate Temperature Humidity WindSpeed Precipitation SunshineHours  City
## 1  01-08-2023        25.5       60       5.5            NA             8 CityA
## 2  02-08-2023        26.1       65       6.2           2.3             7 CityB
## 3  03-08-2023          NA       63       5.8           0.0             6 CityA
## 4  04-08-2023        24.8       NA       6.1           1.2            NA CityC
## 5  05-08-2023        23.9       58        NA           0.0             9  <NA>
## 6  06-08-2023        25.0       61       5.6            NA             8 CityA
## 7  07-08-2023          NA       64       5.7           0.5            NA CityB
## 8  08-08-2023        26.7       NA        NA           0.0             7 CityC
## 9  09-08-2023        24.5       59       5.9           0.3             8  <NA>
## 10 10-08-2023        25.3       62       6.0            NA             6 CityA
##    DayofWeek DayOfWeek
## 1     Monday    Monday
## 2    Tuesday   Tuesday
## 3  Wednesday Wednesday
## 4     Friday    Friday
## 5   Saturday  Saturday
## 6     Sunday    Sunday
## 7     Monday    Monday
## 8  Wednesday Wednesday
## 9   Thursday  Thursday
## 10    Friday    Friday

#18. Find the average WindSpeed for dates that are Mondays.

avg_speed <- mean(df$WindSpeed[df$DayOfWeek == "Monday"], na.rm = TRUE)
avg_speed
## [1] 5.6