FOUNDATION OF DATA ANALYTICS-LAB5

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

#1.Read the contents of “temp.csv” file into a dataframe.

df=read.csv("C:/Users/student/Downloads/temp.csv")
head(df)

##        CDate Temperature Humidity WindSpeed Precipitation SunshineHours  City
## 1 01-08-2023        25.5       60       5.5            NA             8 CityA
## 2 02-08-2023        26.1       65       6.2           2.3             7 CityB
## 3 03-08-2023          NA       63       5.8           0.0             6 CityA
## 4 04-08-2023        24.8       NA       6.1           1.2            NA CityC
## 5 05-08-2023        23.9       58        NA           0.0             9  <NA>
## 6 06-08-2023        25.0       61       5.6            NA             8 CityA

#2.How many rows in the dataframe have missing values?

missing_rows_count=sum(complete.cases(df) == FALSE)
print(paste("Number of rows with missing values:", missing_rows_count))

## [1] "Number of rows with missing values: 9"

#3.Which rows in the dataframe are complete (i.e., have no missing values)?

complete_rows=df %>% filter(complete.cases(.))
print(complete_rows)

##        CDate Temperature Humidity WindSpeed Precipitation SunshineHours  City
## 1 02-08-2023        26.1       65       6.2           2.3             7 CityB

#4.Display only the rows that have missing values.

missing_values_rows= df %>% filter(!complete.cases(.))
print(missing_values_rows)

##        CDate Temperature Humidity WindSpeed Precipitation SunshineHours  City
## 1 01-08-2023        25.5       60       5.5            NA             8 CityA
## 2 03-08-2023          NA       63       5.8           0.0             6 CityA
## 3 04-08-2023        24.8       NA       6.1           1.2            NA CityC
## 4 05-08-2023        23.9       58        NA           0.0             9  <NA>
## 5 06-08-2023        25.0       61       5.6            NA             8 CityA
## 6 07-08-2023          NA       64       5.7           0.5            NA CityB
## 7 08-08-2023        26.7       NA        NA           0.0             7 CityC
## 8 09-08-2023        24.5       59       5.9           0.3             8  <NA>
## 9 10-08-2023        25.3       62       6.0            NA             6 CityA

#5.Remove rows where the City attribute is missing. How many rows are left?

df_no_city_missing= df %>% filter(!is.na(City))
remaining_rows= nrow(df_no_city_missing)
print(paste("Number of rows left after removing rows with missing City values:", remaining_rows))

## [1] "Number of rows left after removing rows with missing City values: 8"

#6.Find the average WindSpeed after removing rows with missing WindSpeed values.

average_windspeed=df %>% filter(!is.na(WindSpeed)) %>% summarise(avg_windspeed = mean(WindSpeed, na.rm = TRUE))
print(paste("Average WindSpeed:", average_windspeed$avg_windspeed))

## [1] "Average WindSpeed: 5.85"

#7.Which columns have the most missing values?

missing_values_per_column=colSums(is.na(df))
columns_most_missing=names(missing_values_per_column[missing_values_per_column == max(missing_values_per_column)])
print(columns_most_missing)

## [1] "Precipitation"

#8.Calculate the percentage of missing data in each column.

total_rows=nrow(df)
missing_percentage=colSums(is.na(df)) / total_rows * 100
print(missing_percentage)

##         CDate   Temperature      Humidity     WindSpeed Precipitation 
##             0            20            20            20            30 
## SunshineHours          City 
##            20            20

#9.Create a new dataframe excluding any rows with more than 2 missing values. How many rows are in this new dataframe?

df_less_than_3_missing = df %>% filter(rowSums(is.na(.)) <= 2)
rows_after_exclusion=nrow(df_less_than_3_missing)
print(paste("Number of rows after excluding rows with more than 2 missing values:", rows_after_exclusion))

## [1] "Number of rows after excluding rows with more than 2 missing values: 10"

#10.What is the range of dates in the dataframe (i.e., the earliest and latest date)?

df$Date <- as.Date(df$CDate) 
date_range <- range(df$Date, na.rm = TRUE)
print(paste("Date range:", date_range[1], "to", date_range[2]))

## [1] "Date range: 1-08-20 to 10-08-20"

#11.Extract and display the rows where the Date is a weekend (i.e., Saturday or Sunday).

df$DayOfWeek=weekdays(df$Date)
weekend_rows=df %>% filter(DayOfWeek %in% c("Saturday", "Sunday"))
print(weekend_rows)

##        CDate Temperature Humidity WindSpeed Precipitation SunshineHours  City
## 1 05-08-2023        23.9       58        NA             0             9  <NA>
## 2 06-08-2023        25.0       61       5.6            NA             8 CityA
##         Date DayOfWeek
## 1 0005-08-20  Saturday
## 2 0006-08-20    Sunday

#12.Add a new column to the dataframe that indicates the day of the week for each Date.

df$DayOfWeek <- weekdays(df$Date)
print(df)

##         CDate Temperature Humidity WindSpeed Precipitation SunshineHours  City
## 1  01-08-2023        25.5       60       5.5            NA             8 CityA
## 2  02-08-2023        26.1       65       6.2           2.3             7 CityB
## 3  03-08-2023          NA       63       5.8           0.0             6 CityA
## 4  04-08-2023        24.8       NA       6.1           1.2            NA CityC
## 5  05-08-2023        23.9       58        NA           0.0             9  <NA>
## 6  06-08-2023        25.0       61       5.6            NA             8 CityA
## 7  07-08-2023          NA       64       5.7           0.5            NA CityB
## 8  08-08-2023        26.7       NA        NA           0.0             7 CityC
## 9  09-08-2023        24.5       59       5.9           0.3             8  <NA>
## 10 10-08-2023        25.3       62       6.0            NA             6 CityA
##          Date DayOfWeek
## 1  0001-08-20    Monday
## 2  0002-08-20   Tuesday
## 3  0003-08-20 Wednesday
## 4  0004-08-20    Friday
## 5  0005-08-20  Saturday
## 6  0006-08-20    Sunday
## 7  0007-08-20    Monday
## 8  0008-08-20 Wednesday
## 9  0009-08-20  Thursday
## 10 0010-08-20    Friday

#13.Filter the dataframe to show only rows where the date is in the first week of August 2023.

df_filtered_august <- df %>% filter(year(Date) == 2023 & month(Date) == 8 & day(Date) <= 7)
print(df_filtered_august)

## [1] CDate         Temperature   Humidity      WindSpeed     Precipitation
## [6] SunshineHours City          Date          DayOfWeek    
## <0 rows> (or 0-length row.names)

#14.Calculate the average Temperature for each day of the week (e.g., average temperature for all Mondays, all Tuesdays, etc.).

avg_temperature_per_day <- df %>% group_by(DayOfWeek) %>% summarise(avg_temp = mean(Temperature, na.rm = TRUE))
print(avg_temperature_per_day)

## # A tibble: 7 × 2
##   DayOfWeek avg_temp
##   <chr>        <dbl>
## 1 Friday        25.0
## 2 Monday        25.5
## 3 Saturday      23.9
## 4 Sunday        25  
## 5 Thursday      24.5
## 6 Tuesday       26.1
## 7 Wednesday     26.7

#15.Create a new dataframe containing only the data from the last 5 days of the dataframe.

last_5_days <- tail(df, 5)
print(last_5_days)

##         CDate Temperature Humidity WindSpeed Precipitation SunshineHours  City
## 6  06-08-2023        25.0       61       5.6            NA             8 CityA
## 7  07-08-2023          NA       64       5.7           0.5            NA CityB
## 8  08-08-2023        26.7       NA        NA           0.0             7 CityC
## 9  09-08-2023        24.5       59       5.9           0.3             8  <NA>
## 10 10-08-2023        25.3       62       6.0            NA             6 CityA
##          Date DayOfWeek
## 6  0006-08-20    Sunday
## 7  0007-08-20    Monday
## 8  0008-08-20 Wednesday
## 9  0009-08-20  Thursday
## 10 0010-08-20    Friday

#16.Find the difference in days between the earliest and latest dates in the dataframe.

date_difference <- as.numeric(difftime(max(df$Date, na.rm = TRUE), min(df$Date, na.rm = TRUE), units = "days"))
print(paste("Difference in days between earliest and latest dates:", date_difference))

## [1] "Difference in days between earliest and latest dates: 3287"

#17.Filter the dataframe to show only rows where the Date is in the second half of the month (i.e.,after the 15th of the month).

df_second_half_month <- df %>% filter(day(Date) > 15)
print(df_second_half_month)

##         CDate Temperature Humidity WindSpeed Precipitation SunshineHours  City
## 1  01-08-2023        25.5       60       5.5            NA             8 CityA
## 2  02-08-2023        26.1       65       6.2           2.3             7 CityB
## 3  03-08-2023          NA       63       5.8           0.0             6 CityA
## 4  04-08-2023        24.8       NA       6.1           1.2            NA CityC
## 5  05-08-2023        23.9       58        NA           0.0             9  <NA>
## 6  06-08-2023        25.0       61       5.6            NA             8 CityA
## 7  07-08-2023          NA       64       5.7           0.5            NA CityB
## 8  08-08-2023        26.7       NA        NA           0.0             7 CityC
## 9  09-08-2023        24.5       59       5.9           0.3             8  <NA>
## 10 10-08-2023        25.3       62       6.0            NA             6 CityA
##          Date DayOfWeek
## 1  0001-08-20    Monday
## 2  0002-08-20   Tuesday
## 3  0003-08-20 Wednesday
## 4  0004-08-20    Friday
## 5  0005-08-20  Saturday
## 6  0006-08-20    Sunday
## 7  0007-08-20    Monday
## 8  0008-08-20 Wednesday
## 9  0009-08-20  Thursday
## 10 0010-08-20    Friday

#18. Find the average WindSpeed for dates that are Mondays.

monday_avg_windspeed <- df %>% filter(DayOfWeek == "Monday") %>% summarise(avg_windspeed = mean(WindSpeed, na.rm = TRUE))
print(paste("Average WindSpeed for Mondays:", monday_avg_windspeed$avg_windspeed))

## [1] "Average WindSpeed for Mondays: 5.6"

FOUNDATION OF DATA ANALYTICS-LAB5

HEMAMALINI.R-21MIA1130

2024-08-16