# =====================================================
# Statistics with R Question Paper Solutions
# Duration: 60 minutes, Total Marks: 20 (4 questions × 5 marks each)
# =====================================================
options(repos = c(CRAN = "https://cloud.r-project.org"))
# Load required libraries
install.packages("dplyr")
## Installing package into 'C:/Users/tapes/AppData/Local/R/win-library/4.5'
## (as 'lib' is unspecified)
## package 'dplyr' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'dplyr'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\tapes\AppData\Local\R\win-library\4.5\00LOCK\dplyr\libs\x64\dplyr.dll
## to C:\Users\tapes\AppData\Local\R\win-library\4.5\dplyr\libs\x64\dplyr.dll:
## Permission denied
## Warning: restored 'dplyr'
## 
## The downloaded binary packages are in
##  C:\Users\tapes\AppData\Local\Temp\RtmpGgHi6Z\downloaded_packages
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# =====================================================
# Question 1: Data Filtering and Selection (5 marks)
# =====================================================

# a) Load the dplyr package and display the first 6 rows of mtcars (1 mark)
print("Question 1a: First 6 rows of mtcars dataset")
## [1] "Question 1a: First 6 rows of mtcars dataset"
head(mtcars)
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
# b) Filter cars with mpg greater than 20 and hp less than 150 (2 marks)
print("Question 1b: Filtered cars (mpg > 20 and hp < 150)")
## [1] "Question 1b: Filtered cars (mpg > 20 and hp < 150)"
filtered_cars <- mtcars %>%
  filter(mpg > 20, hp < 150)
print(filtered_cars)
##                 mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4      21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag  21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710     22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## Merc 240D      24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## Merc 230       22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## Fiat 128       32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## Honda Civic    30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## Toyota Corolla 33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## Toyota Corona  21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
## Fiat X1-9      27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## Porsche 914-2  26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
## Lotus Europa   30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
## Volvo 142E     21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2
# c) Select only the columns: mpg, hp, wt, and gear from the filtered data (1 mark)
print("Question 1c: Selected columns from filtered data")
## [1] "Question 1c: Selected columns from filtered data"
selected_cols <- filtered_cars %>%
  select(mpg, hp, wt, gear)
print(selected_cols)
##                 mpg  hp    wt gear
## Mazda RX4      21.0 110 2.620    4
## Mazda RX4 Wag  21.0 110 2.875    4
## Datsun 710     22.8  93 2.320    4
## Hornet 4 Drive 21.4 110 3.215    3
## Merc 240D      24.4  62 3.190    4
## Merc 230       22.8  95 3.150    4
## Fiat 128       32.4  66 2.200    4
## Honda Civic    30.4  52 1.615    4
## Toyota Corolla 33.9  65 1.835    4
## Toyota Corona  21.5  97 2.465    3
## Fiat X1-9      27.3  66 1.935    4
## Porsche 914-2  26.0  91 2.140    5
## Lotus Europa   30.4 113 1.513    5
## Volvo 142E     21.4 109 2.780    4
# d) Arrange the results in descending order of mpg (1 mark)
print("Question 1d: Results arranged by descending mpg")
## [1] "Question 1d: Results arranged by descending mpg"
Desc_order <- selected_cols %>%
  arrange(desc(mpg))
print(Desc_order)
##                 mpg  hp    wt gear
## Toyota Corolla 33.9  65 1.835    4
## Fiat 128       32.4  66 2.200    4
## Honda Civic    30.4  52 1.615    4
## Lotus Europa   30.4 113 1.513    5
## Fiat X1-9      27.3  66 1.935    4
## Porsche 914-2  26.0  91 2.140    5
## Merc 240D      24.4  62 3.190    4
## Datsun 710     22.8  93 2.320    4
## Merc 230       22.8  95 3.150    4
## Toyota Corona  21.5  97 2.465    3
## Hornet 4 Drive 21.4 110 3.215    3
## Volvo 142E     21.4 109 2.780    4
## Mazda RX4      21.0 110 2.620    4
## Mazda RX4 Wag  21.0 110 2.875    4
# =====================================================
# Question 2: Data Transformation and New Variable Creation (5 marks)
# =====================================================

# Create the sample data frame
employee_data <- data.frame(
  name = c("John", "Alice", "Bob", "Carol"),
  salary = c(50000, 60000, 45000, 70000),
  experience = c(2, 5, 1, 8)
)

print("Question 2: Original employee data")
## [1] "Question 2: Original employee data"
print(employee_data)
##    name salary experience
## 1  John  50000          2
## 2 Alice  60000          5
## 3   Bob  45000          1
## 4 Carol  70000          8
# a) Add a new column called salary_grade (2 marks)
print("Question 2a: Adding salary_grade column")
## [1] "Question 2a: Adding salary_grade column"
employee_data <- employee_data %>%
  mutate(salary_grade = case_when(
    salary < 50000 ~ "Low",
    salary >= 50000 & salary <= 60000 ~ "Medium",
    salary > 60000 ~ "High"
  ))
print(employee_data)
##    name salary experience salary_grade
## 1  John  50000          2       Medium
## 2 Alice  60000          5       Medium
## 3   Bob  45000          1          Low
## 4 Carol  70000          8         High
# b) Create another column salary_per_exp (1 mark)
print("Question 2b: Adding salary_per_exp column")
## [1] "Question 2b: Adding salary_per_exp column"
employee_data <- employee_data %>%
  mutate(salary_per_exp = salary / experience)
print(employee_data)
##    name salary experience salary_grade salary_per_exp
## 1  John  50000          2       Medium          25000
## 2 Alice  60000          5       Medium          12000
## 3   Bob  45000          1          Low          45000
## 4 Carol  70000          8         High           8750
# c) Filter employees with more than 3 years of experience (1 mark)
print("Question 2c: Employees with more than 3 years experience")
## [1] "Question 2c: Employees with more than 3 years experience"
experienced_employees <- employee_data %>%
  filter(experience > 3)
print(experienced_employees)
##    name salary experience salary_grade salary_per_exp
## 1 Alice  60000          5       Medium          12000
## 2 Carol  70000          8         High           8750
# d) Calculate the average salary of filtered employees (1 mark)
print("Question 2d: Average salary of experienced employees")
## [1] "Question 2d: Average salary of experienced employees"
avg_salary <- mean(experienced_employees$salary)
print(paste("Average salary:", avg_salary))
## [1] "Average salary: 65000"
# =====================================================
# Question 3: Data Grouping and Summarization (5 marks)
# =====================================================



# a) Group by Species and calculate mean of all numeric variables (3 marks)
print("Question 3a: Mean of numeric variables by Species")
## [1] "Question 3a: Mean of numeric variables by Species"
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
species_means <- iris %>%
  group_by(Species) %>%
  summarise(
    Mean_Sepal_Length = mean(Sepal.Length),
    Mean_Sepal_Width = mean(Sepal.Width),
    Mean_Petal_Length = mean(Petal.Length),
    Mean_Petal_Width = mean(Petal.Width)
  )
print(species_means)
## # A tibble: 3 × 5
##   Species  Mean_Sepal_Length Mean_Sepal_Width Mean_Petal_Length Mean_Petal_Width
##   <fct>                <dbl>            <dbl>             <dbl>            <dbl>
## 1 setosa                5.01             3.43              1.46            0.246
## 2 versico…              5.94             2.77              4.26            1.33 
## 3 virgini…              6.59             2.97              5.55            2.03
# b) Find which species has the highest average Petal.Length (1 mark)
print("Question 3b: Species with highest average Petal.Length")
## [1] "Question 3b: Species with highest average Petal.Length"
highest_petal_length <- species_means %>%
  arrange(desc(Mean_Petal_Length)) %>%
  slice(1)
print(paste("Species with highest average Petal.Length:", highest_petal_length$Species))
## [1] "Species with highest average Petal.Length: virginica"
print(paste("Average Petal.Length:", highest_petal_length$Mean_Petal_Length))
## [1] "Average Petal.Length: 5.552"
# c) Create a summary showing count of observations for each species (1 mark)
print("Question 3c: Count of observations by Species")
## [1] "Question 3c: Count of observations by Species"
species_count <- iris %>%
  group_by(Species) %>%
  summarise(Count = n())
print(species_count)
## # A tibble: 3 × 2
##   Species    Count
##   <fct>      <int>
## 1 setosa        50
## 2 versicolor    50
## 3 virginica     50
# =====================================================
# Question 4: Data Cleaning and Manipulation (5 marks)
# =====================================================

# Create the messy dataset
messy_data <- data.frame(
  id = 1:6,
  score1 = c(85, NA, 92, 78, NA, 88),
  score2 = c(90, 87, NA, 82, 91, NA),
  category = c("A", "B", "A", "C", "B", "A")
)

print("Question 4: Original messy data")
## [1] "Question 4: Original messy data"
print(messy_data)
##   id score1 score2 category
## 1  1     85     90        A
## 2  2     NA     87        B
## 3  3     92     NA        A
## 4  4     78     82        C
## 5  5     NA     91        B
## 6  6     88     NA        A
# a) Identify rows with missing values (1 mark)
print("Question 4a: Rows with missing values")
## [1] "Question 4a: Rows with missing values"
rows_with_na <- which(rowSums(is.na(messy_data)) > 0)
print(paste("Rows with missing values:", paste(rows_with_na, collapse = ", ")))
## [1] "Rows with missing values: 2, 3, 5, 6"
# Show which columns have missing values
print("Missing values by column:")
## [1] "Missing values by column:"
print(colSums(is.na(messy_data)))
##       id   score1   score2 category 
##        0        2        2        0
# b) Replace missing values in score1 with the mean of available score1 values (2 marks)
print("Question 4b: Replacing missing values in score1 with mean")
## [1] "Question 4b: Replacing missing values in score1 with mean"
mean_score1 <- mean(messy_data$score1, na.rm = TRUE)
print(paste("Mean of score1 (excluding NA):", mean_score1))
## [1] "Mean of score1 (excluding NA): 85.75"
messy_data$score1[is.na(messy_data$score1)] <- mean_score1
print("Data after replacing score1 missing values:")
## [1] "Data after replacing score1 missing values:"
print(messy_data)
##   id score1 score2 category
## 1  1  85.00     90        A
## 2  2  85.75     87        B
## 3  3  92.00     NA        A
## 4  4  78.00     82        C
## 5  5  85.75     91        B
## 6  6  88.00     NA        A
# c) Remove rows where score2 has missing values (1 mark)
print("Question 4c: Removing rows with missing score2 values")
## [1] "Question 4c: Removing rows with missing score2 values"
clean_data <- messy_data[!is.na(messy_data$score2), ]
print("Data after removing rows with missing score2:")
## [1] "Data after removing rows with missing score2:"
print(clean_data)
##   id score1 score2 category
## 1  1  85.00     90        A
## 2  2  85.75     87        B
## 4  4  78.00     82        C
## 5  5  85.75     91        B
# d) Rename the columns to: ID, Test1_Score, Test2_Score, Group (1 mark)
print("Question 4d: Renaming columns")
## [1] "Question 4d: Renaming columns"
names(clean_data) <- c("ID", "Test1_Score", "Test2_Score", "Group")
print("Final cleaned data with renamed columns:")
## [1] "Final cleaned data with renamed columns:"
print(clean_data)
##   ID Test1_Score Test2_Score Group
## 1  1       85.00          90     A
## 2  2       85.75          87     B
## 4  4       78.00          82     C
## 5  5       85.75          91     B
# =====================================================
# Summary Statistics
# =====================================================
print("\n=====================================================")
## [1] "\n====================================================="
print("SUMMARY OF ALL RESULTS")
## [1] "SUMMARY OF ALL RESULTS"
print("=====================================================")
## [1] "====================================================="
print("Question 1 Final Result:")
## [1] "Question 1 Final Result:"
final_result_q1 <- Desc_order
print(final_result_q1)
##                 mpg  hp    wt gear
## Toyota Corolla 33.9  65 1.835    4
## Fiat 128       32.4  66 2.200    4
## Honda Civic    30.4  52 1.615    4
## Lotus Europa   30.4 113 1.513    5
## Fiat X1-9      27.3  66 1.935    4
## Porsche 914-2  26.0  91 2.140    5
## Merc 240D      24.4  62 3.190    4
## Datsun 710     22.8  93 2.320    4
## Merc 230       22.8  95 3.150    4
## Toyota Corona  21.5  97 2.465    3
## Hornet 4 Drive 21.4 110 3.215    3
## Volvo 142E     21.4 109 2.780    4
## Mazda RX4      21.0 110 2.620    4
## Mazda RX4 Wag  21.0 110 2.875    4
print("\nQuestion 2 - Experienced Employees Average Salary:")
## [1] "\nQuestion 2 - Experienced Employees Average Salary:"
print(avg_salary)
## [1] 65000
print("\nQuestion 3 - Species Means:")
## [1] "\nQuestion 3 - Species Means:"
print(species_means)
## # A tibble: 3 × 5
##   Species  Mean_Sepal_Length Mean_Sepal_Width Mean_Petal_Length Mean_Petal_Width
##   <fct>                <dbl>            <dbl>             <dbl>            <dbl>
## 1 setosa                5.01             3.43              1.46            0.246
## 2 versico…              5.94             2.77              4.26            1.33 
## 3 virgini…              6.59             2.97              5.55            2.03
print("\nQuestion 4 - Final Cleaned Data:")
## [1] "\nQuestion 4 - Final Cleaned Data:"
print(clean_data)
##   ID Test1_Score Test2_Score Group
## 1  1       85.00          90     A
## 2  2       85.75          87     B
## 4  4       78.00          82     C
## 5  5       85.75          91     B
print("\n=====================================================")
## [1] "\n====================================================="
print("END OF SOLUTIONS")
## [1] "END OF SOLUTIONS"
print("=====================================================")
## [1] "====================================================="