library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
#1.Read the contents of “temp.csv” file into a dataframe.
df=read.csv("C:/Users/student/Downloads/temp.csv")
head(df)
## CDate Temperature Humidity WindSpeed Precipitation SunshineHours City
## 1 01-08-2023 25.5 60 5.5 NA 8 CityA
## 2 02-08-2023 26.1 65 6.2 2.3 7 CityB
## 3 03-08-2023 NA 63 5.8 0.0 6 CityA
## 4 04-08-2023 24.8 NA 6.1 1.2 NA CityC
## 5 05-08-2023 23.9 58 NA 0.0 9 <NA>
## 6 06-08-2023 25.0 61 5.6 NA 8 CityA
#2.How many rows in the dataframe have missing values?
missing_rows_count=sum(complete.cases(df) == FALSE)
print(paste("Number of rows with missing values:", missing_rows_count))
## [1] "Number of rows with missing values: 9"
#3.Which rows in the dataframe are complete (i.e., have no missing values)?
complete_rows=df %>% filter(complete.cases(.))
print(complete_rows)
## CDate Temperature Humidity WindSpeed Precipitation SunshineHours City
## 1 02-08-2023 26.1 65 6.2 2.3 7 CityB
#4.Display only the rows that have missing values.
missing_values_rows= df %>% filter(!complete.cases(.))
print(missing_values_rows)
## CDate Temperature Humidity WindSpeed Precipitation SunshineHours City
## 1 01-08-2023 25.5 60 5.5 NA 8 CityA
## 2 03-08-2023 NA 63 5.8 0.0 6 CityA
## 3 04-08-2023 24.8 NA 6.1 1.2 NA CityC
## 4 05-08-2023 23.9 58 NA 0.0 9 <NA>
## 5 06-08-2023 25.0 61 5.6 NA 8 CityA
## 6 07-08-2023 NA 64 5.7 0.5 NA CityB
## 7 08-08-2023 26.7 NA NA 0.0 7 CityC
## 8 09-08-2023 24.5 59 5.9 0.3 8 <NA>
## 9 10-08-2023 25.3 62 6.0 NA 6 CityA
#5.Remove rows where the City attribute is missing. How many rows are left?
df_no_city_missing= df %>% filter(!is.na(City))
remaining_rows= nrow(df_no_city_missing)
print(paste("Number of rows left after removing rows with missing City values:", remaining_rows))
## [1] "Number of rows left after removing rows with missing City values: 8"
#6.Find the average WindSpeed after removing rows with missing WindSpeed values.
average_windspeed=df %>% filter(!is.na(WindSpeed)) %>% summarise(avg_windspeed = mean(WindSpeed, na.rm = TRUE))
print(paste("Average WindSpeed:", average_windspeed$avg_windspeed))
## [1] "Average WindSpeed: 5.85"
#7.Which columns have the most missing values?
missing_values_per_column=colSums(is.na(df))
columns_most_missing=names(missing_values_per_column[missing_values_per_column == max(missing_values_per_column)])
print(columns_most_missing)
## [1] "Precipitation"
#8.Calculate the percentage of missing data in each column.
total_rows=nrow(df)
missing_percentage=colSums(is.na(df)) / total_rows * 100
print(missing_percentage)
## CDate Temperature Humidity WindSpeed Precipitation
## 0 20 20 20 30
## SunshineHours City
## 20 20
#9.Create a new dataframe excluding any rows with more than 2 missing values. How many rows are in this new dataframe?
df_less_than_3_missing = df %>% filter(rowSums(is.na(.)) <= 2)
rows_after_exclusion=nrow(df_less_than_3_missing)
print(paste("Number of rows after excluding rows with more than 2 missing values:", rows_after_exclusion))
## [1] "Number of rows after excluding rows with more than 2 missing values: 10"
#10.What is the range of dates in the dataframe (i.e., the earliest and latest date)?
df$Date <- as.Date(df$CDate)
date_range <- range(df$Date, na.rm = TRUE)
print(paste("Date range:", date_range[1], "to", date_range[2]))
## [1] "Date range: 1-08-20 to 10-08-20"
#11.Extract and display the rows where the Date is a weekend (i.e., Saturday or Sunday).
df$DayOfWeek=weekdays(df$Date)
weekend_rows=df %>% filter(DayOfWeek %in% c("Saturday", "Sunday"))
print(weekend_rows)
## CDate Temperature Humidity WindSpeed Precipitation SunshineHours City
## 1 05-08-2023 23.9 58 NA 0 9 <NA>
## 2 06-08-2023 25.0 61 5.6 NA 8 CityA
## Date DayOfWeek
## 1 0005-08-20 Saturday
## 2 0006-08-20 Sunday
#12.Add a new column to the dataframe that indicates the day of the week for each Date.
df$DayOfWeek <- weekdays(df$Date)
print(df)
## CDate Temperature Humidity WindSpeed Precipitation SunshineHours City
## 1 01-08-2023 25.5 60 5.5 NA 8 CityA
## 2 02-08-2023 26.1 65 6.2 2.3 7 CityB
## 3 03-08-2023 NA 63 5.8 0.0 6 CityA
## 4 04-08-2023 24.8 NA 6.1 1.2 NA CityC
## 5 05-08-2023 23.9 58 NA 0.0 9 <NA>
## 6 06-08-2023 25.0 61 5.6 NA 8 CityA
## 7 07-08-2023 NA 64 5.7 0.5 NA CityB
## 8 08-08-2023 26.7 NA NA 0.0 7 CityC
## 9 09-08-2023 24.5 59 5.9 0.3 8 <NA>
## 10 10-08-2023 25.3 62 6.0 NA 6 CityA
## Date DayOfWeek
## 1 0001-08-20 Monday
## 2 0002-08-20 Tuesday
## 3 0003-08-20 Wednesday
## 4 0004-08-20 Friday
## 5 0005-08-20 Saturday
## 6 0006-08-20 Sunday
## 7 0007-08-20 Monday
## 8 0008-08-20 Wednesday
## 9 0009-08-20 Thursday
## 10 0010-08-20 Friday
#13.Filter the dataframe to show only rows where the date is in the first week of August 2023.
df_filtered_august <- df %>% filter(year(Date) == 2023 & month(Date) == 8 & day(Date) <= 7)
print(df_filtered_august)
## [1] CDate Temperature Humidity WindSpeed Precipitation
## [6] SunshineHours City Date DayOfWeek
## <0 rows> (or 0-length row.names)
#14.Calculate the average Temperature for each day of the week (e.g., average temperature for all Mondays, all Tuesdays, etc.).
avg_temperature_per_day <- df %>% group_by(DayOfWeek) %>% summarise(avg_temp = mean(Temperature, na.rm = TRUE))
print(avg_temperature_per_day)
## # A tibble: 7 × 2
## DayOfWeek avg_temp
## <chr> <dbl>
## 1 Friday 25.0
## 2 Monday 25.5
## 3 Saturday 23.9
## 4 Sunday 25
## 5 Thursday 24.5
## 6 Tuesday 26.1
## 7 Wednesday 26.7
#15.Create a new dataframe containing only the data from the last 5 days of the dataframe.
last_5_days <- tail(df, 5)
print(last_5_days)
## CDate Temperature Humidity WindSpeed Precipitation SunshineHours City
## 6 06-08-2023 25.0 61 5.6 NA 8 CityA
## 7 07-08-2023 NA 64 5.7 0.5 NA CityB
## 8 08-08-2023 26.7 NA NA 0.0 7 CityC
## 9 09-08-2023 24.5 59 5.9 0.3 8 <NA>
## 10 10-08-2023 25.3 62 6.0 NA 6 CityA
## Date DayOfWeek
## 6 0006-08-20 Sunday
## 7 0007-08-20 Monday
## 8 0008-08-20 Wednesday
## 9 0009-08-20 Thursday
## 10 0010-08-20 Friday
#16.Find the difference in days between the earliest and latest dates in the dataframe.
date_difference <- as.numeric(difftime(max(df$Date, na.rm = TRUE), min(df$Date, na.rm = TRUE), units = "days"))
print(paste("Difference in days between earliest and latest dates:", date_difference))
## [1] "Difference in days between earliest and latest dates: 3287"
#17.Filter the dataframe to show only rows where the Date is in the second half of the month (i.e.,after the 15th of the month).
df_second_half_month <- df %>% filter(day(Date) > 15)
print(df_second_half_month)
## CDate Temperature Humidity WindSpeed Precipitation SunshineHours City
## 1 01-08-2023 25.5 60 5.5 NA 8 CityA
## 2 02-08-2023 26.1 65 6.2 2.3 7 CityB
## 3 03-08-2023 NA 63 5.8 0.0 6 CityA
## 4 04-08-2023 24.8 NA 6.1 1.2 NA CityC
## 5 05-08-2023 23.9 58 NA 0.0 9 <NA>
## 6 06-08-2023 25.0 61 5.6 NA 8 CityA
## 7 07-08-2023 NA 64 5.7 0.5 NA CityB
## 8 08-08-2023 26.7 NA NA 0.0 7 CityC
## 9 09-08-2023 24.5 59 5.9 0.3 8 <NA>
## 10 10-08-2023 25.3 62 6.0 NA 6 CityA
## Date DayOfWeek
## 1 0001-08-20 Monday
## 2 0002-08-20 Tuesday
## 3 0003-08-20 Wednesday
## 4 0004-08-20 Friday
## 5 0005-08-20 Saturday
## 6 0006-08-20 Sunday
## 7 0007-08-20 Monday
## 8 0008-08-20 Wednesday
## 9 0009-08-20 Thursday
## 10 0010-08-20 Friday
#18. Find the average WindSpeed for dates that are Mondays.
monday_avg_windspeed <- df %>% filter(DayOfWeek == "Monday") %>% summarise(avg_windspeed = mean(WindSpeed, na.rm = TRUE))
print(paste("Average WindSpeed for Mondays:", monday_avg_windspeed$avg_windspeed))
## [1] "Average WindSpeed for Mondays: 5.6"