Load Required Libraries

library(lme4)

## Loading required package: Matrix

library(nlme)

## 
## Attaching package: 'nlme'

## The following object is masked from 'package:lme4':
## 
##     lmList

library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:nlme':
## 
##     collapse

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(lattice)
library(geepack)
library(readxl)
library(tinytex)
library(grid)  # For grid.text()

Working with a Dataset

# Raw observations used for the ranking / sorting / ordering exercise
data <- c(7, 4, 6, 8, 9, 1, 0, 3, 2, 5, 0)

# c) Index permutation that arranges the values in ascending order
data_order <- order(data)

# b) Values in ascending order; the vector has no NAs, so indexing by the
#    ordering permutation gives the same result as sort()
data_sorted <- data[data_order]

# a) Rank of every original value (the two zeros share rank 1.5)
data_rank <- rank(data)

# d) Put all four vectors side by side for inspection. Only Original_Data
#    and Rank line up row by row; Sorted_Data and Order are listed by
#    sorted position.
result <- data.frame(
  Original_Data = data,
  Rank = data_rank,
  Sorted_Data = data_sorted,
  Order = data_order
)

# Show the combined table
print(result)

##    Original_Data Rank Sorted_Data Order
## 1              7  9.0           0     7
## 2              4  6.0           0    11
## 3              6  8.0           1     6
## 4              8 10.0           2     9
## 5              9 11.0           3     8
## 6              1  3.0           4     2
## 7              0  1.5           5    10
## 8              3  5.0           6     3
## 9              2  4.0           7     1
## 10             5  7.0           8     4
## 11             0  1.5           9     5

Analysis of the Diamonds Dataset (from the ggplot2 package)

# Make the diamonds dataset available in the workspace
data(diamonds)

# a) Identify two nominal variables and two ratio variables
# NOTE(review): ggplot2 documents cut and color as ordered factors, so they
# may be better described as ordinal than nominal -- confirm against the brief.
nominal_vars <- c("cut", "color")  # Nominal variables
ratio_vars <- c("carat", "price") # Ratio variables

# Print the variables. Each message is built from the vectors above so the
# printed text cannot drift away from the stored variable names.
print(paste("Nominal variables:", paste(nominal_vars, collapse = ", ")))

## [1] "Nominal variables: cut, color"

print(paste("Ratio variables:", paste(ratio_vars, collapse = ", ")))

## [1] "Ratio variables: carat, price"

Plot a histogram of price values

# Histogram of diamond prices, requesting roughly 50 bins
hist(
  x = diamonds[["price"]],
  breaks = 50,
  col = "pink",
  border = "red",
  xlab = "Price (in US dollars)",
  main = "Histogram of Diamond Prices"
)

# Write a short interpretation of the plot into the bottom margin
mtext(
  text = "Comment: The price distribution is right-skewed, with most diamonds priced below $5000.",
  side = 1,          # 1 = bottom margin
  line = 4,          # margin line, counted outwards from the plot edge
  col = "darkgreen"  # text colour
)

Determine the mean of the “depth” variable

# Arithmetic mean of the depth column, printed with a label
mean_depth <- mean(diamonds[["depth"]])
print(paste("Mean depth:", mean_depth))

## [1] "Mean depth: 61.749404894327"

# Histogram of diamond depths, requesting roughly 30 bins
hist(
  x = diamonds[["depth"]],
  breaks = 30,
  col = "yellow",
  border = "black",
  xlab = "Depth (%)",
  main = "Histogram of Diamond Depths"
)

# Write a short interpretation of the plot into the bottom margin
mtext(
  text = "Comment:The depth distribution is bell-shaped, with most values between 57% and 65%.",
  side = 1,          # 1 = bottom margin
  line = 4,          # margin line, counted outwards from the plot edge
  col = "darkgreen"  # text colour
)

Scatter plot of carat vs price to check the relationship

# Scatter plot of price against carat, drawn with small solid blue points
plot(
  diamonds$carat, diamonds$price,
  main = "Scatter Plot of Carat vs Price",
  xlab = "Carat",
  ylab = "Price (in US dollars)",
  col = "blue",
  pch = 16,
  cex = 0.5
)

# Add the fitted line from a simple linear regression of price on carat
abline(lm(price ~ carat, data = diamonds), col = "black", lwd = 2)

# Calculate and print the correlation coefficient
correlation <- cor(diamonds$carat, diamonds$price)
print(paste("Correlation between carat and price:", correlation))

## [1] "Correlation between carat and price: 0.921591301193477"

# Bottom-margin caption. The coefficient is taken from `correlation` (two
# decimals) instead of being typed in by hand, so the caption stays in sync
# with the value computed above.
mtext(
  sprintf(
    "The correlation coefficient (%.2f) indicates a positive relationship between carat and price.",
    correlation
  ),
  side = 1, # Bottom margin
  line = 4, # Distance from the axis
  col = "darkgreen" # Text color
)

Telco Dataset Analysis

# Load customer churn dataset
# The workbook location can be overridden through the TELCO_DATA_PATH
# environment variable; when that variable is unset, the original location is
# used, so existing setups keep working unchanged.
file_path <- Sys.getenv(
  "TELCO_DATA_PATH",
  unset = "C:/Users/USER/Downloads/School/Stastitical Computing sta1040/telco data.xlsx"
)

# Fail early with a message that names the path being looked up
if (!file.exists(file_path)) {
  stop(
    "Telco workbook not found at '", file_path,
    "'. Set the TELCO_DATA_PATH environment variable to its location.",
    call. = FALSE
  )
}

# Read the "telco" worksheet of the workbook
data <- read_excel(file_path, sheet = "telco")

Relationship between age and income

# Correlation between age and income, computed on complete observations only
correlation_age_income <- with(data, cor(age, income, use = "complete.obs"))
print(paste("Correlation between age and income:", round(correlation_age_income, 2)))

## [1] "Correlation between age and income: 0.33"

# Age-vs-income scatter plot; the rounded coefficient is shown in the title
ggplot(data = data, mapping = aes(x = age, y = income)) +
  geom_point(alpha = 0.6, color = "blue") +
  labs(
    x = "Age",
    y = "Income",
    title = paste("Scatter Plot of Age vs Income\nCorrelation Coefficient:", round(correlation_age_income, 2)),
    caption = "There is a weak correlation between age and income as indicated by the correlation coefficient.."
  ) +
  theme_minimal() +
  theme(
    # Right-aligned caption in dark green
    plot.caption = element_text(hjust = 1, color = "darkgreen")
  )

Boxplot of tenure by gender

# One tenure box per gender code; the same factor drives the x axis and fill
ggplot(
  data = data,
  mapping = aes(x = factor(gender), y = tenure, fill = factor(gender))
) +
  geom_boxplot() +
  # Fixed fill colours, assigned to the factor levels in order
  scale_fill_manual(values = c("skyblue", "pink")) +
  labs(
    x = "Gender (0: Male, 1: Female)",
    y = "Tenure",
    title = "Boxplot of Tenure by Gender",
    caption = "The boxplot shows gender-wise differences in tenure, which can be analyzed further for significance."
  ) +
  theme_minimal() +
  theme(
    # Slightly larger dark green caption, placed towards the left
    plot.caption = element_text(size = 11, hjust = 0.1, color = "darkgreen")
  )

Percentage of married people residing in region 3

# Filter data for Region 3
region3_data <- data %>% filter(region == 3)
total_region3 <- nrow(region3_data)

# Count rows whose marital code equals 1 (presumably 1 = married -- confirm
# against the data dictionary). na.rm = TRUE keeps a missing marital value
# from turning the whole count, and therefore the percentage, into NA.
married_region3 <- sum(region3_data$marital == 1, na.rm = TRUE)

# Calculate and print percentage (share of all Region 3 rows)
married_percentage <- (married_region3 / total_region3) * 100
print(paste("Percentage of married people in Region 3:", round(married_percentage, 2), "%"))

## [1] "Percentage of married people in Region 3: 46.22 %"

Orthodontic Data Analysis: The Orthodont dataset provides information on dental growth measurements.

# Keep a working copy of the Orthodont data under the name PRlong
PRlong <- Orthodont

# Preview the first six records
head(PRlong, n = 6L)

Open the data in a separate viewer

# Open the spreadsheet-style viewer only in interactive sessions: View() needs
# a data viewer / GUI, so calling it while knitting or under Rscript would
# fail or block instead of showing anything useful.
if (interactive()) {
  View(PRlong)
}

Further analysis of the dataset

# Compact overview of the columns and attributes of PRlong
utils::str(PRlong)

## Classes 'nfnGroupedData', 'nfGroupedData', 'groupedData' and 'data.frame':   108 obs. of  4 variables:
##  $ distance: num  26 25 29 31 21.5 22.5 23 26.5 23 22.5 ...
##  $ age     : num  8 10 12 14 8 10 12 14 8 10 ...
##  $ Subject : Ord.factor w/ 27 levels "M16"<"M05"<"M02"<..: 15 15 15 15 3 3 3 3 7 7 ...
##  $ Sex     : Factor w/ 2 levels "Male","Female": 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "outer")=Class 'formula'  language ~Sex
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##  - attr(*, "formula")=Class 'formula'  language distance ~ age | Subject
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##  - attr(*, "labels")=List of 2
##   ..$ x: chr "Age"
##   ..$ y: chr "Distance from pituitary to pterygomaxillary fissure"
##  - attr(*, "units")=List of 2
##   ..$ x: chr "(yr)"
##   ..$ y: chr "(mm)"
##  - attr(*, "FUN")=function (x)  
##   ..- attr(*, "source")= chr "function (x) max(x, na.rm = TRUE)"
##  - attr(*, "order.groups")= logi TRUE

# Number of rows followed by number of columns
c(nrow(PRlong), ncol(PRlong))

## [1] 108   4

# Comment: The dataset contains orthodontic measurements, including age, gender, and distance values for analysis.

Gender column created to allow for simplified grouping and analysis based on sex.

# Derive a one-letter gender code from Sex: "M" for "Male", "F" otherwise
PRlong$gender <- ifelse(
  test = PRlong[["Sex"]] == "Male",
  yes = "M",
  no = "F"
)

# Show the new column on its own as a one-column data frame
data.frame(PRlong.gender = PRlong[["gender"]])

Display the last 20 rows of selected columns

# Last 20 records, restricted to the identifying and measurement columns
tail(
  x = PRlong[, c("Subject", "age", "distance", "gender", "Sex")],
  n = 20
)

Comment: The data is arranged by Subject for better visualization of individual records.

# Sort the records by Subject so each child's measurements sit together
PRlong <- arrange(PRlong, Subject)

# Display the sorted data
PRlong

Variables transformed for analysis, including scaling age and converting key fields to factors.

# Perform transformations for further analysis
# Both derived columns are computed here, while `age` is still numeric; these
# two lines must stay above the factor conversion below, because arithmetic on
# a factor would not give numeric results.
PRlong$Sage <- 2 * PRlong$age + 6 
# Age shifted by 8, so that a value of 0 corresponds to age 8
PRlong$Sage.c.8 <- PRlong$age - 8

# Convert variables to factors for categorical analysis
# From this point on `age` is a factor, not a number.
# NOTE(review): the plots further down use `age` on the x axis -- confirm that
# a categorical age is what those plots are meant to show.
PRlong$age <- as.factor(PRlong$age)
# Factor version of the "M"/"F" gender code, stored as a new column
PRlong$Sgender <- as.factor(PRlong$gender)
# NOTE(review): the structure listing shows Subject is already a factor, so
# this conversion presumably leaves it unchanged -- confirm.
PRlong$Subject <- as.factor(PRlong$Subject)

Creating a line plot of distance by age, grouped by gender

# Distance against age with one panel per gender code; each Subject is drawn
# as its own series of connected points
xyplot(
  x = distance ~ age | gender,
  data = PRlong,
  groups = Subject,
  type = "b",   # both points and lines
  pch = 19,
  cex = 1.2,
  lwd = 2
)

# Caption drawn on top of the lattice plot, centred near the bottom edge
grid.text(
  label = "This plot shows individual dental growth trajectories by age, grouped by gender.",
  x = 0.5,
  y = 0.02,
  gp = gpar(fontsize = 12, col = "darkgreen")
)

Grouped data plot for dental growth

# Grouped-data object: distance by age within Subject, with gender as the
# outer (between-subject) factor
dental.grouped <- groupedData(
  formula = distance ~ age | Subject,
  data = PRlong,
  outer = ~ gender
)

# Plot the grouped data with panels arranged by the outer factor
plot(
  dental.grouped,
  main = "Potthoff & Roy (1964) Orthodontic Measurements on Children",
  xlab = "Age",
  ylab = "Dental Growth (mm)",
  display = "subject",
  outer = TRUE,
  key = FALSE,
  aspect = 1,
  pch = 19,
  cex = 0.8
)

# Italic caption drawn on top of the plot, centred near the bottom edge
grid.text(
  label = "This grouped data plot highlights dental growth trends across subjects and gender.",
  x = 0.5,
  y = 0.02,
  gp = gpar(fontface = "italic", fontsize = 12, col = "darkgreen")
)

Data Analysis With R

Mark Mayana

2024-12-10

Load Required Libraries