# =====================================================
# Statistics with R Question Paper Solutions
# Duration: 60 minutes, Total Marks: 20 (4 questions × 5 marks each)
# =====================================================
options(repos = c(CRAN = "https://cloud.r-project.org"))
# Load required libraries
install.packages("dplyr")
## Installing package into 'C:/Users/tapes/AppData/Local/R/win-library/4.5'
## (as 'lib' is unspecified)
## package 'dplyr' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'dplyr'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\tapes\AppData\Local\R\win-library\4.5\00LOCK\dplyr\libs\x64\dplyr.dll
## to C:\Users\tapes\AppData\Local\R\win-library\4.5\dplyr\libs\x64\dplyr.dll:
## Permission denied
## Warning: restored 'dplyr'
##
## The downloaded binary packages are in
## C:\Users\tapes\AppData\Local\Temp\RtmpGgHi6Z\downloaded_packages
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# =====================================================
# Question 1: Data Filtering and Selection (5 marks)
# =====================================================
# a) Load the dplyr package and display the first 6 rows of mtcars (1 mark)
print("Question 1a: First 6 rows of mtcars dataset")
## [1] "Question 1a: First 6 rows of mtcars dataset"
head(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
# b) Filter cars with mpg greater than 20 and hp less than 150 (2 marks)
print("Question 1b: Filtered cars (mpg > 20 and hp < 150)")
## [1] "Question 1b: Filtered cars (mpg > 20 and hp < 150)"
filtered_cars <- mtcars %>%
filter(mpg > 20, hp < 150)
print(filtered_cars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
# c) Select only the columns: mpg, hp, wt, and gear from the filtered data (1 mark)
print("Question 1c: Selected columns from filtered data")
## [1] "Question 1c: Selected columns from filtered data"
selected_cols <- filtered_cars %>%
select(mpg, hp, wt, gear)
print(selected_cols)
## mpg hp wt gear
## Mazda RX4 21.0 110 2.620 4
## Mazda RX4 Wag 21.0 110 2.875 4
## Datsun 710 22.8 93 2.320 4
## Hornet 4 Drive 21.4 110 3.215 3
## Merc 240D 24.4 62 3.190 4
## Merc 230 22.8 95 3.150 4
## Fiat 128 32.4 66 2.200 4
## Honda Civic 30.4 52 1.615 4
## Toyota Corolla 33.9 65 1.835 4
## Toyota Corona 21.5 97 2.465 3
## Fiat X1-9 27.3 66 1.935 4
## Porsche 914-2 26.0 91 2.140 5
## Lotus Europa 30.4 113 1.513 5
## Volvo 142E 21.4 109 2.780 4
# d) Arrange the results in descending order of mpg (1 mark)
print("Question 1d: Results arranged by descending mpg")
## [1] "Question 1d: Results arranged by descending mpg"
Desc_order <- selected_cols %>%
arrange(desc(mpg))
print(Desc_order)
## mpg hp wt gear
## Toyota Corolla 33.9 65 1.835 4
## Fiat 128 32.4 66 2.200 4
## Honda Civic 30.4 52 1.615 4
## Lotus Europa 30.4 113 1.513 5
## Fiat X1-9 27.3 66 1.935 4
## Porsche 914-2 26.0 91 2.140 5
## Merc 240D 24.4 62 3.190 4
## Datsun 710 22.8 93 2.320 4
## Merc 230 22.8 95 3.150 4
## Toyota Corona 21.5 97 2.465 3
## Hornet 4 Drive 21.4 110 3.215 3
## Volvo 142E 21.4 109 2.780 4
## Mazda RX4 21.0 110 2.620 4
## Mazda RX4 Wag 21.0 110 2.875 4
# =====================================================
# Question 2: Data Transformation and New Variable Creation (5 marks)
# =====================================================
# Create the sample data frame
employee_data <- data.frame(
name = c("John", "Alice", "Bob", "Carol"),
salary = c(50000, 60000, 45000, 70000),
experience = c(2, 5, 1, 8)
)
print("Question 2: Original employee data")
## [1] "Question 2: Original employee data"
print(employee_data)
## name salary experience
## 1 John 50000 2
## 2 Alice 60000 5
## 3 Bob 45000 1
## 4 Carol 70000 8
# a) Add a new column called salary_grade (2 marks)
print("Question 2a: Adding salary_grade column")
## [1] "Question 2a: Adding salary_grade column"
employee_data <- employee_data %>%
mutate(salary_grade = case_when(
salary < 50000 ~ "Low",
salary >= 50000 & salary <= 60000 ~ "Medium",
salary > 60000 ~ "High"
))
print(employee_data)
## name salary experience salary_grade
## 1 John 50000 2 Medium
## 2 Alice 60000 5 Medium
## 3 Bob 45000 1 Low
## 4 Carol 70000 8 High
# b) Create another column salary_per_exp (1 mark)
print("Question 2b: Adding salary_per_exp column")
## [1] "Question 2b: Adding salary_per_exp column"
employee_data <- employee_data %>%
mutate(salary_per_exp = salary / experience)
print(employee_data)
## name salary experience salary_grade salary_per_exp
## 1 John 50000 2 Medium 25000
## 2 Alice 60000 5 Medium 12000
## 3 Bob 45000 1 Low 45000
## 4 Carol 70000 8 High 8750
# c) Filter employees with more than 3 years of experience (1 mark)
print("Question 2c: Employees with more than 3 years experience")
## [1] "Question 2c: Employees with more than 3 years experience"
experienced_employees <- employee_data %>%
filter(experience > 3)
print(experienced_employees)
## name salary experience salary_grade salary_per_exp
## 1 Alice 60000 5 Medium 12000
## 2 Carol 70000 8 High 8750
# d) Calculate the average salary of filtered employees (1 mark)
print("Question 2d: Average salary of experienced employees")
## [1] "Question 2d: Average salary of experienced employees"
avg_salary <- mean(experienced_employees$salary)
print(paste("Average salary:", avg_salary))
## [1] "Average salary: 65000"
# =====================================================
# Question 3: Data Grouping and Summarization (5 marks)
# =====================================================
# a) Group by Species and calculate mean of all numeric variables (3 marks)
print("Question 3a: Mean of numeric variables by Species")
## [1] "Question 3a: Mean of numeric variables by Species"
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
species_means <- iris %>%
group_by(Species) %>%
summarise(
Mean_Sepal_Length = mean(Sepal.Length),
Mean_Sepal_Width = mean(Sepal.Width),
Mean_Petal_Length = mean(Petal.Length),
Mean_Petal_Width = mean(Petal.Width)
)
print(species_means)
## # A tibble: 3 × 5
## Species Mean_Sepal_Length Mean_Sepal_Width Mean_Petal_Length Mean_Petal_Width
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 setosa 5.01 3.43 1.46 0.246
## 2 versico… 5.94 2.77 4.26 1.33
## 3 virgini… 6.59 2.97 5.55 2.03
# b) Find which species has the highest average Petal.Length (1 mark)
print("Question 3b: Species with highest average Petal.Length")
## [1] "Question 3b: Species with highest average Petal.Length"
highest_petal_length <- species_means %>%
arrange(desc(Mean_Petal_Length)) %>%
slice(1)
print(paste("Species with highest average Petal.Length:", highest_petal_length$Species))
## [1] "Species with highest average Petal.Length: virginica"
print(paste("Average Petal.Length:", highest_petal_length$Mean_Petal_Length))
## [1] "Average Petal.Length: 5.552"
# c) Create a summary showing count of observations for each species (1 mark)
print("Question 3c: Count of observations by Species")
## [1] "Question 3c: Count of observations by Species"
species_count <- iris %>%
group_by(Species) %>%
summarise(Count = n())
print(species_count)
## # A tibble: 3 × 2
## Species Count
## <fct> <int>
## 1 setosa 50
## 2 versicolor 50
## 3 virginica 50
# =====================================================
# Question 4: Data Cleaning and Manipulation (5 marks)
# =====================================================
# Create the messy dataset
messy_data <- data.frame(
id = 1:6,
score1 = c(85, NA, 92, 78, NA, 88),
score2 = c(90, 87, NA, 82, 91, NA),
category = c("A", "B", "A", "C", "B", "A")
)
print("Question 4: Original messy data")
## [1] "Question 4: Original messy data"
print(messy_data)
## id score1 score2 category
## 1 1 85 90 A
## 2 2 NA 87 B
## 3 3 92 NA A
## 4 4 78 82 C
## 5 5 NA 91 B
## 6 6 88 NA A
# a) Identify rows with missing values (1 mark)
print("Question 4a: Rows with missing values")
## [1] "Question 4a: Rows with missing values"
rows_with_na <- which(rowSums(is.na(messy_data)) > 0)
print(paste("Rows with missing values:", paste(rows_with_na, collapse = ", ")))
## [1] "Rows with missing values: 2, 3, 5, 6"
# Show which columns have missing values
print("Missing values by column:")
## [1] "Missing values by column:"
print(colSums(is.na(messy_data)))
## id score1 score2 category
## 0 2 2 0
# b) Replace missing values in score1 with the mean of available score1 values (2 marks)
print("Question 4b: Replacing missing values in score1 with mean")
## [1] "Question 4b: Replacing missing values in score1 with mean"
mean_score1 <- mean(messy_data$score1, na.rm = TRUE)
print(paste("Mean of score1 (excluding NA):", mean_score1))
## [1] "Mean of score1 (excluding NA): 85.75"
messy_data$score1[is.na(messy_data$score1)] <- mean_score1
print("Data after replacing score1 missing values:")
## [1] "Data after replacing score1 missing values:"
print(messy_data)
## id score1 score2 category
## 1 1 85.00 90 A
## 2 2 85.75 87 B
## 3 3 92.00 NA A
## 4 4 78.00 82 C
## 5 5 85.75 91 B
## 6 6 88.00 NA A
# c) Remove rows where score2 has missing values (1 mark)
print("Question 4c: Removing rows with missing score2 values")
## [1] "Question 4c: Removing rows with missing score2 values"
clean_data <- messy_data[!is.na(messy_data$score2), ]
print("Data after removing rows with missing score2:")
## [1] "Data after removing rows with missing score2:"
print(clean_data)
## id score1 score2 category
## 1 1 85.00 90 A
## 2 2 85.75 87 B
## 4 4 78.00 82 C
## 5 5 85.75 91 B
# d) Rename the columns to: ID, Test1_Score, Test2_Score, Group (1 mark)
print("Question 4d: Renaming columns")
## [1] "Question 4d: Renaming columns"
names(clean_data) <- c("ID", "Test1_Score", "Test2_Score", "Group")
print("Final cleaned data with renamed columns:")
## [1] "Final cleaned data with renamed columns:"
print(clean_data)
## ID Test1_Score Test2_Score Group
## 1 1 85.00 90 A
## 2 2 85.75 87 B
## 4 4 78.00 82 C
## 5 5 85.75 91 B
# =====================================================
# Summary Statistics
# =====================================================
print("\n=====================================================")
## [1] "\n====================================================="
print("SUMMARY OF ALL RESULTS")
## [1] "SUMMARY OF ALL RESULTS"
print("=====================================================")
## [1] "====================================================="
print("Question 1 Final Result:")
## [1] "Question 1 Final Result:"
final_result_q1 <- Desc_order
print(final_result_q1)
## mpg hp wt gear
## Toyota Corolla 33.9 65 1.835 4
## Fiat 128 32.4 66 2.200 4
## Honda Civic 30.4 52 1.615 4
## Lotus Europa 30.4 113 1.513 5
## Fiat X1-9 27.3 66 1.935 4
## Porsche 914-2 26.0 91 2.140 5
## Merc 240D 24.4 62 3.190 4
## Datsun 710 22.8 93 2.320 4
## Merc 230 22.8 95 3.150 4
## Toyota Corona 21.5 97 2.465 3
## Hornet 4 Drive 21.4 110 3.215 3
## Volvo 142E 21.4 109 2.780 4
## Mazda RX4 21.0 110 2.620 4
## Mazda RX4 Wag 21.0 110 2.875 4
print("\nQuestion 2 - Experienced Employees Average Salary:")
## [1] "\nQuestion 2 - Experienced Employees Average Salary:"
print(avg_salary)
## [1] 65000
print("\nQuestion 3 - Species Means:")
## [1] "\nQuestion 3 - Species Means:"
print(species_means)
## # A tibble: 3 × 5
## Species Mean_Sepal_Length Mean_Sepal_Width Mean_Petal_Length Mean_Petal_Width
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 setosa 5.01 3.43 1.46 0.246
## 2 versico… 5.94 2.77 4.26 1.33
## 3 virgini… 6.59 2.97 5.55 2.03
print("\nQuestion 4 - Final Cleaned Data:")
## [1] "\nQuestion 4 - Final Cleaned Data:"
print(clean_data)
## ID Test1_Score Test2_Score Group
## 1 1 85.00 90 A
## 2 2 85.75 87 B
## 4 4 78.00 82 C
## 5 5 85.75 91 B
print("\n=====================================================")
## [1] "\n====================================================="
print("END OF SOLUTIONS")
## [1] "END OF SOLUTIONS"
print("=====================================================")
## [1] "====================================================="