Question 1: (Long-to-Wide Data format translation): It’s in a long format where 7 years of statewide data are shown on top of each other (long format), i.e., each state has 7 rows in the data (covering 2000-2006 data). Convert the original data from long to wide, and then back to long format. Note that in the wide format, each state will have a single row of data, with seven times the number of original columns, i.e., the original variables, HPI, UR, Region, Pop, and Percent will be tagged with the year, e.g., HPI2000, …, HPI2006.

# necessary libraries
library(dplyr)
library(tidyr)
library(rvest)
library(ggplot2)
library(plotly)
library(knitr)
library(DT)

# Web scraping to get the data
wiki_url <- read_html("https://wiki.socr.umich.edu/index.php/SOCR_Data_Dinov_010309_HousingPriceIndex")
pd_data <- html_table(html_nodes(wiki_url, "table")[[1]])

# Converting the long format data to wide format
wide_data <- pd_data %>%
  pivot_wider(names_from = Year, values_from = c(HPI, UR, Pop, Percent), names_sep = "")

# Displaying the first 5 rows of the wide format data
datatable(head(wide_data, 5))
# Converting the wide format back to long format
long_data <- wide_data %>%
  pivot_longer(cols = -c(State, Region), names_to = c(".value", "Year"), names_pattern = "([A-Za-z]+)(\\d+)")

# Displaying the first 5 rows of the long format data
datatable(head(long_data, 5))

Question 2: (Data stratification): Complete the following data-manipulation steps in R. These steps need not be concatenated (i.e., applied sequentially). 1. Extract the first 10 subjects 2. Find the cases for which L_caudate < 160. 3. Sort the subjects based on L_caudate values in descending and ascending order. 4. Generate frequency and probability tables for Age, FS_IQ, and Sex. 5. Compute the mean Age and the correlation between Age and FS_IQ. 6. Plot Histogram and density of R_fusiform_gyrus, and draw scatterplot L_fusiform_gyrus and L_insular_cortex.

# web scraping
wiki_url <- read_html("https://wiki.socr.umich.edu/index.php/SOCR_Data_Oct2009_ID_NI")
html_nodes(wiki_url, "#content")
## {xml_nodeset (1)}
## [1] <div id="content" class="mw-body" role="main">\n\t\t\t<a id="top"></a>\n\ ...
pd_data <- html_table(html_nodes(wiki_url, "table")[[1]])
#2.1 Extract the first 10 subjects
tensubs<- pd_data %>%
  slice(1:10)
datatable(head(tensubs,10))
#2.2 Find the cases for which L_caudate < 160.
# Filter the cases where L_caudate is less than 160
filtered_cases <- pd_data %>%
  filter(L_caudate < 160)
# Display the filtered cases
datatable(filtered_cases)
#2.3 Sort the subjects based on L_caudate values in descending and ascending order.
#descending 
sort_desc<- pd_data %>% 
  arrange(desc(L_caudate))
datatable(head(sort_desc,5))
#asscending
sort_asscen<- pd_data %>% 
  arrange((L_caudate))
datatable(head(sort_asscen,5))
##2.4 Generate frequency and probability tables for Age, FS_IQ, and Sex.
#frequency Tables
age_freq <- as.data.frame(table(pd_data$Age))
colnames(age_freq) <- c("Age", "Frequency")
datatable(head(age_freq,5))
fs_iq_freq <- as.data.frame(table(pd_data$FS_IQ))
colnames(fs_iq_freq) <- c("FS_IQ", "Frequency")
datatable(head(fs_iq_freq,5))
sex_freq <- as.data.frame(table(pd_data$Sex))
colnames(sex_freq) <- c("Sex", "Frequency")
datatable(head(sex_freq))
#Probability Tables
age_prob <- as.data.frame(prop.table(table(pd_data$Age)))
colnames(age_prob) <- c("Age", "Probability")
datatable(head(age_prob,5))
fs_iq_prob <- as.data.frame(prop.table(table(pd_data$FS_IQ)))
colnames(fs_iq_prob) <- c("FS_IQ", "Probability")
datatable(head(fs_iq_prob,5))
sex_prob <- as.data.frame(prop.table(table(pd_data$Sex)))
colnames(sex_prob) <- c("Sex", "Probability")
datatable(head(sex_prob))
#2.5 Compute the mean Age and the correlation between Age and FS_IQ.
mean_age <- mean(pd_data$Age, na.rm = TRUE)  # na.rm = TRUE removes missing values (NA)
print(mean_age)
## [1] 13.95238
correlation <- cor(pd_data$Age, pd_data$FS_IQ, use = "complete.obs")
print(correlation) 
## [1] 0.1961842
#2.6 Plot Histogram and density of R_fusiform_gyrus, and draw scatterplot        L_fusiform_gyrus and L_insular_cortex.

# Histogram for R_fusiform_gyrus
hist_plot <- plot_ly(x = pd_data$R_fusiform_gyrus, type = "histogram") %>%
  layout(
    title = "Histogram of R_fusiform_gyrus",  # Simplified title argument
    xaxis = list(title = "R_fusiform_gyrus"), 
    yaxis = list(title = "Count")
  )

# Density plot for R_fusiform_gyrus
density_data <- density(pd_data$R_fusiform_gyrus, na.rm = TRUE)

# Density plot
density_plot <- plot_ly(x = density_data$x, y = density_data$y, type = 'scatter', mode = 'lines') %>%
  layout(
    title = "Density Plot of R_fusiform_gyrus",  # Simplified title argument
    xaxis = list(title = "R_fusiform_gyrus"),
    yaxis = list(title = "Density")
  )

# Scatterplot between L_fusiform_gyrus and L_insular_cortex
scatter_plot <- plot_ly(
  data = pd_data, 
  x = ~L_fusiform_gyrus, 
  y = ~L_insular_cortex, 
  type = "scatter", 
  mode = "markers"
) %>%
  layout(
    title = "Scatterplot of L_fusiform_gyrus vs L_insular_cortex",  # Simplified title argument
    xaxis = list(title = "L_fusiform_gyrus"),
    yaxis = list(title = "L_insular_cortex")
  )

# Display the plots
hist_plot
density_plot
scatter_plot

Question 3: (Simulation) Generate 10,000 standard normal variables and another 5,000 student t-distributed random variables with df=5. Generate a quantile-quantile (Q-Q) probability plot of the two samples. Then, compare it with qqnorm() or plot_ly() of student t simulation and interpret the findings.

Solution 1.3 Step 1: To generate 10,000 standard normal variables. Step 2: To generate 5,000 Student’s t-distributed random variables with df=5. Step 3: To create a Q-Q plot to compare the two distributions. Step 4: To compare the t-distribution with the standard normal distribution using qqnorm().

##QUESTION3
# Set seed for reproducibility
set.seed(123)
# Step 1: Generating 10,000 standard normal variables
standard_normal <- rnorm(10000)
# Step 2: Generating 5,000 Student's t-distributed random variables with df = 5
student_t <- rt(5000, df = 5)
# Step 3: Q-Q plot of the two samples (Standard Normal vs Student t)
qqplot(standard_normal, student_t, main = "Q-Q Plot: Standard Normal vs Student's t (df = 5)", 
       xlab = "Standard Normal Quantiles", ylab = "Student t Quantiles", 
       col = "blue", pch = 19)
abline(0, 1, col = "red", lwd = 2)

Interpretation: This plot reinforces the previous finding. The central points lie close to the line (suggesting similarity in the center of both distributions), but the tails of the Student’s t-distribution diverge significantly from the normal distribution, highlighting the heavier tails. These differences are more pronounced at extreme values.

# Step 4: Comparing t-distribution with normal using qqnorm()
# Q-Q plot of the Student's t-distribution against the normal distribution
qqnorm(student_t, main = "Q-Q Plot of Student's t (df = 5) vs Normal", col = "blue", pch = 19)
qqline(student_t, col = "red", lwd = 2)

Interpretation: The plot shows that while the central portion of the t-distribution aligns reasonably well with the normal distribution (indicated by points near the red line), the tails deviate significantly. Specifically, the points corresponding to extreme quantiles (both in the left and right tails) deviate from the line, suggesting that the t-distribution has heavier tails than the normal distribution. This is a well-known property of the t-distribution, especially with a lower degree of freedom (df=5).

Question 4: (Define an R mode function) Define a new function myMode() that computes the sample mode(s). Test your function using the simulation data you generate in the last question (#1.3). Did you cover all possible situations for the input data? Does your function work with qualitative (numeric), qualitative (character), and tensor (array) inputs? Handle mixed type data-frame inputs gracefully.

#Question 4
myMode <- function(x) {
  # Handle NULL or empty inputs
  if (is.null(x) || length(x) == 0) {
    return(NULL)
  }
  
  # If x is a data frame, apply mode calculation to each column
  if (is.data.frame(x)) {
    return(lapply(x, myMode))
  }
  
  # If x is an array (tensor), flatten it into a vector
  if (is.array(x)) {
    x <- as.vector(x)
  }
  
  # If x is numeric and continuous, round the values
  if (is.numeric(x)) {
    x <- round(x, digits = 2)  # Adjust the digits as needed
  }
  
  # Calculate mode for numeric or character vectors
  unique_vals <- unique(x)  # Get unique values
  counts <- tabulate(match(x, unique_vals))  # Count occurrences of each unique value
  
  # Return the value(s) with the highest frequency
  mode_vals <- unique_vals[counts == max(counts)]
  
  return(mode_vals)
}


# Testing the function with numeric data
cat("Mode of standard normal data:\n")
## Mode of standard normal data:
print(myMode(standard_normal))
## [1] -0.13
cat("Mode of student t data:\n")
## Mode of student t data:
print(myMode(student_t))
## [1] 0.01
# Testing the function with character data
char_data <- c("apple", "banana", "apple", "banana", "apple", "cherry")
print(myMode(char_data))
## [1] "apple"
# Testing with array (tensor) data
array_data <- array(rnorm(100), dim = c(10, 10))
print(myMode(array_data))
## [1] -0.02
# Testing with mixed type data frame
df <- data.frame(
  nums = c(1, 2, 2, 3, 3, 3, 4, 4, 4, 4),
  chars = c("a", "b", "a", "b", "a", "b", "a", "b", "a", "b")
)
print(myMode(df))
## $nums
## [1] 4
## 
## $chars
## [1] "a" "b"