# Given data
hours <- c(2.5, 3, 0, 4, 1.5, 5, 0)
# (a) Total study hours for the week
total_hours <- sum(hours)
cat("Total study hours for the week:", total_hours, "hours\n")
## Total study hours for the week: 16 hours
# (b) Average hours studied per day
average_hours <- mean(hours)
cat("Average hours studied per day:", average_hours, "hours\n")
## Average hours studied per day: 2.285714 hours
# (c) Deviation from average
deviation_from_average <- hours - average_hours
# Create data frame
Student_Score <- data.frame(
Student = paste("Day", 1:7),
Week = rep(1, 7),
Hours_Studied = hours,
Deviation = deviation_from_average
)
print(Student_Score)
## Student Week Hours_Studied Deviation
## 1 Day 1 1 2.5 0.2142857
## 2 Day 2 1 3.0 0.7142857
## 3 Day 3 1 0.0 -2.2857143
## 4 Day 4 1 4.0 1.7142857
## 5 Day 5 1 1.5 -0.7857143
## 6 Day 6 1 5.0 2.7142857
## 7 Day 7 1 0.0 -2.2857143
Explanation of Results:
For example, Day 6 has a deviation of 2.71 hours, meaning the student studied about 2.71 hours more than their average on that day. Days 3 and 7 (with 0 hours) show significant negative deviations, suggesting inconsistent study patterns.
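As a quick check on that last observation, a one-line sketch using the hours vector defined above recovers the zero-hour days:
# Which days had no study time at all? Expect days 3 and 7.
paste("Day", which(hours == 0))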
# Given expense matrix
expenses <- matrix(
c(500, 520, 510, 530,
300, 290, 310, 305,
200, 220, 210, 215),
nrow = 3,
byrow = TRUE
)
# Label rows and columns for clarity
rownames(expenses) <- c("Groceries", "Transport", "Entertainment")
colnames(expenses) <- c("Month1", "Month2", "Month3", "Month4")
print("Expense Matrix:")
## [1] "Expense Matrix:"
print(expenses)
## Month1 Month2 Month3 Month4
## Groceries 500 520 510 530
## Transport 300 290 310 305
## Entertainment 200 220 210 215
# (a) Total spending per category
total_per_category <- rowSums(expenses)
cat("\nTotal spending per category (INR):\n")
##
## Total spending per category (INR):
print(total_per_category)
## Groceries Transport Entertainment
## 2060 1205 845
# (b) Average spending per month
average_per_month <- colMeans(expenses)
cat("\nAverage spending per month (INR):\n")
##
## Average spending per month (INR):
print(average_per_month)
## Month1 Month2 Month3 Month4
## 333.3333 343.3333 343.3333 350.0000
# (c) Category with highest total expenditure
highest_category <- names(which.max(total_per_category))
highest_amount <- max(total_per_category)
cat("\nCategory with highest total expenditure:", highest_category,
"with INR", highest_amount, "\n")
##
## Category with highest total expenditure: Groceries with INR 2060
Key Insights:
Groceries dominate the budget with a total of INR 2060, roughly half of all spending, followed by Transport (INR 1205) and Entertainment (INR 845). Average monthly spending also drifts slightly upward over the quarter, from about INR 333 in Month1 to INR 350 in Month4.
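Two quick one-liners back up these insights; a sketch using the objects computed above:
# Share of total spend contributed by each category (percent)
round(100 * total_per_category / sum(total_per_category), 1)
# Month-over-month change in the average spend
diff(average_per_month)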
# Given data
usage <- c(18, 25, 12, 30, 20)
household_size <- c(2, 4, 3, 5, 2)
# (a) Creating data frame
electricity_df <- data.frame(
Household = paste("HH", 1:5, sep = ""),
Usage_kWh = usage,
Household_Size = household_size
)
# (b) Computing electricity usage per person
electricity_df$Usage_Per_Person <- electricity_df$Usage_kWh / electricity_df$Household_Size
print(electricity_df)
## Household Usage_kWh Household_Size Usage_Per_Person
## 1 HH1 18 2 9.00
## 2 HH2 25 4 6.25
## 3 HH3 12 3 4.00
## 4 HH4 30 5 6.00
## 5 HH5 20 2 10.00
# (c) Identifying households with above average per person usage
avg_per_person <- mean(electricity_df$Usage_Per_Person)
cat("\nAverage per-person usage:", round(avg_per_person, 2), "kWh\n")
##
## Average per-person usage: 7.05 kWh
above_average <- electricity_df[electricity_df$Usage_Per_Person > avg_per_person, ]
cat("\nHouseholds with above-average per-person usage:\n")
##
## Households with above-average per-person usage:
print(above_average)
## Household Usage_kWh Household_Size Usage_Per_Person
## 1 HH1 18 2 9
## 5 HH5 20 2 10
Why Data Frames Are More Appropriate:
Data frames are superior to separate vectors in this context because:
- Each row represents a complete observation (one household), keeping related data together.
- We can easily add derived columns (like Usage_Per_Person) while maintaining the relationship with the original data.
- We can filter entire rows based on conditions (households above average) without managing multiple vector indices.
- Changes to one variable automatically stay aligned with the others, preventing index mismatches.
- The tabular structure is more intuitive and mirrors how we naturally think about structured data.
In contrast, separate vectors would require manual index management and make it harder to keep related information synchronized, as the sketch below illustrates.
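For contrast, a sketch of the same above-average filter done with separate vectors; the household labels must be rebuilt from positions instead of travelling with the data:
# Separate-vector version of part (c); indices carried along by hand
per_person <- usage / household_size
idx <- which(per_person > mean(per_person))
paste0("HH", idx)    # labels reconstructed from positions
usage[idx]           # each related vector must be subset separately
household_size[idx]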
# Given data
classes_held <- c(40, 40, 40, 40)
classes_attended <- c(38, 30, 35, 28)
# (a) Creating data frame
attendance_df <- data.frame(
Student = paste("Student", 1:4),
Classes_Held = classes_held,
Classes_Attended = classes_attended
)
# (b) Compute attendance percentage
attendance_df$Attendance_Percentage <- (attendance_df$Classes_Attended /
attendance_df$Classes_Held) * 100
print(attendance_df)
## Student Classes_Held Classes_Attended Attendance_Percentage
## 1 Student 1 40 38 95.0
## 2 Student 2 40 30 75.0
## 3 Student 3 40 35 87.5
## 4 Student 4 40 28 70.0
# (c) Identifying students with attendance below 75%
below_threshold <- attendance_df[attendance_df$Attendance_Percentage < 75, ]
cat("\nStudents with attendance below 75%:\n")
##
## Students with attendance below 75%:
print(below_threshold)
## Student Classes_Held Classes_Attended Attendance_Percentage
## 4 Student 4 40 28 70
How Column-wise Operations Simplify This Analysis:
Column-wise operations provide significant advantages:
- The division Classes_Attended / Classes_Held operates on entire columns simultaneously, eliminating the need for loops; this is both faster and more concise.
- R automatically aligns elements by row position, so each student's attended classes are divided by their held classes without manual indexing.
- We can compute all percentages in one line rather than iterating through each student individually.
- Logical comparisons work on entire columns, making subsetting straightforward (attendance_df$Attendance_Percentage < 75).
- If we add more students, the same code works without modification; it naturally scales to any number of rows (see the sketch below).
This vectorized approach is a core strength of R, making data frame operations elegant and efficient.
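As a small illustration of that scaling point, this sketch adds a hypothetical fifth student (not part of the given data) and reruns the identical column-wise computation:
# Hypothetical extra student to show the code scales unchanged
new_row <- data.frame(Student = "Student 5", Classes_Held = 40, Classes_Attended = 32)
extended_df <- rbind(attendance_df[, c("Student", "Classes_Held", "Classes_Attended")],
                     new_row)
extended_df$Attendance_Percentage <- (extended_df$Classes_Attended /
                                      extended_df$Classes_Held) * 100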
# Given data
rainfall <- c(0, 3, 12, 7, 0, 22, 15)
# (a) Function to classify a single rainfall value
classify_rainfall <- function(rain) {
if (rain < 5) {
return("Light")
} else if (rain >= 5 & rain <= 20) {
return("Moderate")
} else {
return("Heavy")
}
}
# Example of single value classification
single_value <- rainfall[3]
cat("Rainfall of", single_value, "mm is classified as:",
classify_rainfall(single_value), "\n\n")
## Rainfall of 12 mm is classified as: Moderate
# (b) Applying to all days
rainfall_category <- sapply(rainfall, classify_rainfall)
# Creating data frame
rainfall_df <- data.frame(
Day = paste("Day", 1:7),
Rainfall_mm = rainfall,
Category = rainfall_category
)
print(rainfall_df)
## Day Rainfall_mm Category
## 1 Day 1 0 Light
## 2 Day 2 3 Light
## 3 Day 3 12 Moderate
## 4 Day 4 7 Moderate
## 5 Day 5 0 Light
## 6 Day 6 22 Heavy
## 7 Day 7 15 Moderate
# Summary
cat("\nSummary of rainfall categories:\n")
##
## Summary of rainfall categories:
print(table(rainfall_df$Category))
##
## Heavy Light Moderate
## 1 3 3
Explanation:
The classification uses nested if/else logic:
- Light (< 5 mm): Days 1, 2, and 5, with minimal precipitation
- Moderate (5-20 mm): Days 3, 4, and 7, with moderate rainfall
- Heavy (> 20 mm): Day 6, with intense precipitation
The sapply() function applies the classification function to each element of the rainfall vector, showing how repetitive conditional logic can be automated across a dataset; a fully vectorized alternative is sketched below.
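For reference, the same categories can be produced without sapply(); a vectorized sketch with nested ifelse() that reproduces the result for this data:
# Vectorized alternative to the element-wise classifier
rainfall_category2 <- ifelse(rainfall < 5, "Light",
                             ifelse(rainfall <= 20, "Moderate", "Heavy"))
identical(rainfall_category2, rainfall_category)  # should be TRUE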
# (a) Function to calculate travel cost
calculate_travel_cost <- function(distance) {
# Base cost
cost <- 15
# Add cost for distance
if (distance <= 30) {
# All distance charged at 1.50 per km
cost <- cost + (distance * 1.50)
} else {
# First 30 km at 1.50, remainder at 1.00
cost <- cost + (30 * 1.50) + ((distance - 30) * 1.00)
}
return(cost)
}
# (b) Compute costs for given distances
distances <- c(10, 25, 30, 45, 60)
# Applying function to all distances
costs <- sapply(distances, calculate_travel_cost)
# Creating summary table
travel_cost_df <- data.frame(
Distance_km = distances,
Total_Cost_INR = costs
)
print(travel_cost_df)
## Distance_km Total_Cost_INR
## 1 10 30.0
## 2 25 52.5
## 3 30 60.0
## 4 45 75.0
## 5 60 90.0
# Detailed breakdown for clarity
cat("\nDetailed Cost Breakdown:\n")
##
## Detailed Cost Breakdown:
for (i in 1:length(distances)) {
cat(sprintf("Distance: %d km → Cost: ₹%.2f\n", distances[i], costs[i]))
}
## Distance: 10 km → Cost: ₹30.00
## Distance: 25 km → Cost: ₹52.50
## Distance: 30 km → Cost: ₹60.00
## Distance: 45 km → Cost: ₹75.00
## Distance: 60 km → Cost: ₹90.00
Logic Explanation:
The pricing structure implements a two-tier system: a flat base fare of ₹15, the first 30 km charged at ₹1.50 per km, and any distance beyond 30 km charged at a discounted ₹1.00 per km.
Examples:
- 10 km: ₹15 + (10 × ₹1.50) = ₹30.00
- 30 km: ₹15 + (30 × ₹1.50) = ₹60.00
- 45 km: ₹15 + (30 × ₹1.50) + (15 × ₹1.00) = ₹75.00
This tiered structure provides a volume discount, encouraging longer-distance travel while maintaining profitability on shorter trips.
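The same fare rule can also be written without any if/else; a vectorized sketch using pmin() and pmax():
# First tier capped at 30 km by pmin(); second tier floored at 0 by pmax()
vector_costs <- 15 + 1.50 * pmin(distances, 30) + 1.00 * pmax(distances - 30, 0)
all(vector_costs == costs)  # should be TRUE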
# Loading the dataset
data("USArrests")
# Display first few rows
cat("First 6 rows of USArrests dataset:\n")
## First 6 rows of USArrests dataset:
head(USArrests)
## Murder Assault UrbanPop Rape
## Alabama 13.2 236 58 21.2
## Alaska 10.0 263 48 44.5
## Arizona 8.1 294 80 31.0
## Arkansas 8.8 190 50 19.5
## California 9.0 276 91 40.6
## Colorado 7.9 204 78 38.7
# Get dataset information
cat("\nDataset structure:\n")
##
## Dataset structure:
str(USArrests)
## 'data.frame': 50 obs. of 4 variables:
## $ Murder : num 13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
## $ Assault : int 236 263 294 190 276 204 110 238 335 211 ...
## $ UrbanPop: int 58 48 80 50 91 78 77 72 80 60 ...
## $ Rape : num 21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
# Create a violent crime indicator
# We'll combine Murder, Assault, and Rape (all violent crimes).
# The three rates are all serious but sit on very different scales,
# so we compute both a simple sum and a normalized composite index.
# Simple approach: sum of all violent crime rates
USArrests$Total_Violent_Crime <- USArrests$Murder +
USArrests$Assault +
USArrests$Rape
# Alternative: weighted index (normalized to 0-100 scale)
# This creates a composite score
USArrests$Violent_Crime_Index <- (
(USArrests$Murder / max(USArrests$Murder)) * 33.33 +
(USArrests$Assault / max(USArrests$Assault)) * 33.33 +
(USArrests$Rape / max(USArrests$Rape)) * 33.33
)
cat("States with highest total violent crime:\n")
## States with highest total violent crime:
head(USArrests[order(-USArrests$Total_Violent_Crime),
c("Total_Violent_Crime", "Violent_Crime_Index")], 5)
## Total_Violent_Crime Violent_Crime_Index
## Florida 382.3 85.74479
## North Carolina 366.1 69.89722
## Maryland 339.1 71.45888
## Arizona 333.1 67.05442
## New Mexico 328.5 73.28253
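A quick sanity check on the composite index, as a sketch: each component contributes at most 33.33 points, so the index is bounded near 100, a ceiling only a hypothetical state leading the nation in all three crimes would reach:
# Index values should fall strictly between 0 and 99.99
range(USArrests$Violent_Crime_Index)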
# Creating reusable classification function
classify_share <- function(share_vector,
high_threshold = 3,
medium_low = 2,
medium_high = 3) {
# Initializing result vector
classification <- character(length(share_vector))
# Loop through and classify
for (i in 1:length(share_vector)) {
if (share_vector[i] > high_threshold) {
classification[i] <- "High"
} else if (share_vector[i] >= medium_low & share_vector[i] <= medium_high) {
classification[i] <- "Medium"
} else {
classification[i] <- "Low"
}
}
return(classification)
}
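For comparison, a minimal vectorized sketch of the same logic (same defaults, no explicit loop):
classify_share_vec <- function(x, high_threshold = 3, medium_low = 2, medium_high = 3) {
  # ifelse() processes the whole vector at once, mirroring the loop above
  ifelse(x > high_threshold, "High",
         ifelse(x >= medium_low & x <= medium_high, "Medium", "Low"))
}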
# Test the function on the USArrests Murder_Share column
# (Murder_Share and Murder_Classification are assumed to have been created
# in an earlier part of the assignment; they are not rebuilt in this section)
test_classification <- classify_share(USArrests$Murder_Share,
high_threshold = 3,
medium_low = 2,
medium_high = 3)
cat("Verification - Function produces same results:\n")
## Verification - Function produces same results:
print(all(test_classification == USArrests$Murder_Classification))
## [1] TRUE
# Apply to LifeCycleSavings dataset
data("LifeCycleSavings")
cat("\n\nLifeCycleSavings dataset - Savings Rate (sr) variable:\n")
##
##
## LifeCycleSavings dataset - Savings Rate (sr) variable:
head(LifeCycleSavings)
## sr pop15 pop75 dpi ddpi
## Australia 11.43 29.35 2.87 2329.68 2.87
## Austria 12.07 23.32 4.41 1507.99 3.93
## Belgium 13.17 23.80 4.43 2108.47 3.82
## Bolivia 5.75 41.89 1.67 189.13 0.22
## Brazil 12.88 42.19 0.83 728.47 4.56
## Canada 8.79 31.72 2.85 2982.88 2.43
# Classify savings rates with appropriate thresholds
# sr is aggregate personal savings divided by disposable income
# Typical thresholds for savings rates:
LifeCycleSavings$SR_Classification <- classify_share(
LifeCycleSavings$sr,
high_threshold = 12, # >12% is high savings
medium_low = 8, # 8-12% is medium savings
medium_high = 12
)
cat("\nSavings Rate Classification:\n")
##
## Savings Rate Classification:
print(LifeCycleSavings[, c("sr", "SR_Classification")])
## sr SR_Classification
## Australia 11.43 Medium
## Austria 12.07 High
## Belgium 13.17 High
## Bolivia 5.75 Low
## Brazil 12.88 High
## Canada 8.79 Medium
## Chile 0.60 Low
## China 11.90 Medium
## Colombia 4.98 Low
## Costa Rica 10.78 Medium
## Denmark 16.85 High
## Ecuador 3.59 Low
## Finland 11.24 Medium
## France 12.64 High
## Germany 12.55 High
## Greece 10.67 Medium
## Guatamala 3.01 Low
## Honduras 7.70 Low
## Iceland 1.27 Low
## India 9.00 Medium
## Ireland 11.34 Medium
## Italy 14.28 High
## Japan 21.10 High
## Korea 3.98 Low
## Luxembourg 10.35 Medium
## Malta 15.48 High
## Norway 10.25 Medium
## Netherlands 14.65 High
## New Zealand 10.67 Medium
## Nicaragua 7.30 Low
## Panama 4.44 Low
## Paraguay 2.02 Low
## Peru 12.70 High
## Philippines 12.78 High
## Portugal 12.49 High
## South Africa 11.14 Medium
## South Rhodesia 13.30 High
## Spain 11.77 Medium
## Sweden 6.86 Low
## Switzerland 14.13 High
## Turkey 5.13 Low
## Tunisia 2.81 Low
## United Kingdom 7.81 Low
## United States 7.56 Low
## Venezuela 9.22 Medium
## Zambia 18.56 High
## Jamaica 7.72 Low
## Uruguay 9.24 Medium
## Libya 8.89 Medium
## Malaysia 4.71 Low
cat("\nSummary of Savings Classifications:\n")
##
## Summary of Savings Classifications:
print(table(LifeCycleSavings$SR_Classification))
##
## High Low Medium
## 16 18 16
# Show countries in each category
cat("\nCountries by Savings Category:\n")
##
## Countries by Savings Category:
cat("\nHigh Savers (>12%):\n")
##
## High Savers (>12%):
print(rownames(LifeCycleSavings[LifeCycleSavings$SR_Classification == "High", ]))
## [1] "Austria" "Belgium" "Brazil" "Denmark"
## [5] "France" "Germany" "Italy" "Japan"
## [9] "Malta" "Netherlands" "Peru" "Philippines"
## [13] "Portugal" "South Rhodesia" "Switzerland" "Zambia"
cat("\nMedium Savers (8-12%):\n")
##
## Medium Savers (8-12%):
print(rownames(LifeCycleSavings[LifeCycleSavings$SR_Classification == "Medium", ]))
## [1] "Australia" "Canada" "China" "Costa Rica" "Finland"
## [6] "Greece" "India" "Ireland" "Luxembourg" "Norway"
## [11] "New Zealand" "South Africa" "Spain" "Venezuela" "Uruguay"
## [16] "Libya"
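For completeness, the same subsetting pattern lists the remaining group:
cat("\nLow Savers (<8%):\n")
print(rownames(LifeCycleSavings[LifeCycleSavings$SR_Classification == "Low", ]))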
# 1. Correlation Analysis between Crime Variables
cat("Correlation Matrix of Crime Variables:\n")
## Correlation Matrix of Crime Variables:
crime_vars <- USArrests[, c("Murder", "Assault", "Rape", "UrbanPop")]
correlation_matrix <- cor(crime_vars)
print(round(correlation_matrix, 3))
## Murder Assault Rape UrbanPop
## Murder 1.000 0.802 0.564 0.070
## Assault 0.802 1.000 0.665 0.259
## Rape 0.564 0.665 1.000 0.411
## UrbanPop 0.070 0.259 0.411 1.000
# 2. Urban vs Rural Crime Analysis
cat("\n\nUrban vs Rural Crime Patterns:\n")
##
##
## Urban vs Rural Crime Patterns:
USArrests$Urban_Category <- ifelse(USArrests$UrbanPop > median(USArrests$UrbanPop),
"Highly Urban", "More Rural")
urban_stats <- aggregate(. ~ Urban_Category,
data = USArrests[, c("Murder", "Assault", "Rape", "Urban_Category")],
FUN = mean)
print(urban_stats)
## Urban_Category Murder Assault Rape
## 1 Highly Urban 8.025000 198.3333 25.50833
## 2 More Rural 7.569231 145.3077 17.28462
# 3. Safety Score (inverse of crime)
# Lower crime = higher safety score
USArrests$Safety_Score <- 100 - USArrests$Violent_Crime_Index
cat("\n\nTop 10 Safest States:\n")
##
##
## Top 10 Safest States:
safest_states <- head(USArrests[order(-USArrests$Safety_Score),
c("Safety_Score", "Murder", "Assault", "Rape")], 10)
print(safest_states)
## Safety_Score Murder Assault Rape
## North Dakota 88.72767 0.8 45 7.3
## New Hampshire 83.45663 2.1 57 9.5
## Vermont 82.92343 2.2 48 11.2
## Maine 82.11693 2.1 83 7.8
## Iowa 82.05976 2.2 56 11.3
## Wisconsin 81.95254 2.6 53 10.8
## Minnesota 76.91113 2.7 72 14.9
## South Dakota 74.94102 3.8 86 12.8
## Connecticut 74.75689 3.3 110 11.1
## West Virginia 74.33203 5.7 81 9.3
cat("\nBottom 10 (Most Dangerous) States:\n")
##
## Bottom 10 (Most Dangerous) States:
dangerous_states <- head(USArrests[order(USArrests$Safety_Score),
c("Safety_Score", "Murder", "Assault", "Rape")], 10)
print(dangerous_states)
## Safety_Score Murder Assault Rape
## Florida 14.25521 15.4 335 31.9
## Nevada 18.37737 12.2 252 46.0
## Alaska 22.59043 10.0 263 44.5
## California 26.04602 9.0 276 40.6
## Michigan 26.16997 12.1 255 35.1
## New Mexico 26.71747 11.4 285 32.1
## Georgia 27.10788 17.4 211 25.8
## South Carolina 28.52015 14.4 279 22.5
## Maryland 28.54112 11.3 300 27.8
## Louisiana 29.78907 15.4 249 22.2
# 4. Regional Pattern Visualization Data
cat("\n\nCrime Statistics Summary:\n")
##
##
## Crime Statistics Summary:
summary_stats <- data.frame(
Metric = c("Mean Murder Rate", "Mean Assault Rate", "Mean Rape Rate",
"Mean Urban Pop %", "Mean Violent Crime"),
Value = c(mean(USArrests$Murder),
mean(USArrests$Assault),
mean(USArrests$Rape),
mean(USArrests$UrbanPop),
mean(USArrests$Total_Violent_Crime))
)
print(summary_stats)
## Metric Value
## 1 Mean Murder Rate 7.788
## 2 Mean Assault Rate 170.760
## 3 Mean Rape Rate 21.232
## 4 Mean Urban Pop % 65.540
## 5 Mean Violent Crime 199.780
# 5. Create risk categories combining multiple factors
USArrests$Risk_Level <- ifelse(
USArrests$Violent_Crime_Index > quantile(USArrests$Violent_Crime_Index, 0.75),
"Very High Risk",
ifelse(USArrests$Violent_Crime_Index > quantile(USArrests$Violent_Crime_Index, 0.5),
"High Risk",
ifelse(USArrests$Violent_Crime_Index > quantile(USArrests$Violent_Crime_Index, 0.25),
"Moderate Risk",
"Low Risk"))
)
cat("\n\nOverall Risk Level Distribution:\n")
##
##
## Overall Risk Level Distribution:
print(table(USArrests$Risk_Level))
##
## High Risk Low Risk Moderate Risk Very High Risk
## 12 13 12 13
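The same quartile-based labels can also be produced in one call; a sketch using cut() with quantile() breaks (category counts should match the table above, though the printed order differs):
risk_vec <- cut(USArrests$Violent_Crime_Index,
                breaks = quantile(USArrests$Violent_Crime_Index,
                                  probs = c(0, 0.25, 0.5, 0.75, 1)),
                labels = c("Low Risk", "Moderate Risk", "High Risk", "Very High Risk"),
                include.lowest = TRUE)
table(risk_vec)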
End of Assignment