# Import the raw table. The file carries no header row, so read.table()
# assigns the default column names V1..V9.
data <- read.table(file = "T12-4.DAT", header = FALSE)
# Echo the imported table for a quick visual check
data
## V1 V2 V3 V4 V5 V6 V7 V8 V9
## 1 1.06 9.2 151 54.4 1.6 9077 0.0 0.628 Arizona
## 2 0.89 10.3 202 57.9 2.2 5088 25.3 1.555 Boston
## 3 1.43 15.4 113 53.0 3.4 9212 0.0 1.058 Central
## 4 1.02 11.2 168 56.0 0.3 6423 34.3 0.700 Common
## 5 1.49 8.8 192 51.2 1.0 3300 15.6 2.044 Consolid
## 6 1.32 13.5 111 60.0 -2.2 11127 22.5 1.241 Florida
## 7 1.22 12.2 175 67.6 2.2 7642 0.0 1.652 Hawaiian
## 8 1.10 9.2 245 57.0 3.3 13082 0.0 0.309 Idaho
## 9 1.34 13.0 168 60.4 7.2 8406 0.0 0.862 Kentucky
## 10 1.12 12.4 197 53.0 2.7 6455 39.2 0.623 Madison
## 11 0.75 7.5 173 51.5 6.5 17441 0.0 0.768 Nevada
## 12 1.13 10.9 178 62.0 3.7 6154 0.0 1.897 NewEngla
## 13 1.15 12.7 199 53.7 6.4 7179 50.2 0.527 Northern
## 14 1.09 12.0 96 49.8 1.4 9673 0.0 0.588 Oklahoma
## 15 0.96 7.6 164 62.2 -0.1 6468 0.9 1.400 Pacific
## 16 1.16 9.9 252 56.0 9.2 15991 0.0 0.620 Puget
## 17 0.76 6.4 136 61.9 9.0 5714 8.3 1.920 SanDiego
## 18 1.05 12.6 150 56.7 2.7 10140 0.0 1.108 Southern
## 19 1.16 11.7 104 54.0 -2.1 13507 0.0 0.636 Texas
## 20 1.20 11.8 148 59.9 3.5 7287 41.1 0.702 Wisconsi
## 21 1.04 8.6 204 61.0 3.5 6650 0.0 2.116 United
## 22 1.07 9.3 174 54.3 5.9 10093 26.6 1.306 Virginia
# Load required libraries
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Replace the default V1..V9 names with descriptive ones: eight numeric
# measurements followed by the company name in the last column.
names(data) <- c(
  "FixedChargeCoverageRatio",
  "RateOfReturnOnCapital",
  "CostPerKWCapacityInPlace",
  "AnnualLoadFactor",
  "PeakKWhDemandGrowthFrom1974To1975",
  "SalesKWhPerYear",
  "PercentNuclear",
  "TotalFuelCostsCentsPerKWh",
  "Company"
)
# Plot the raw value of every variable against the observation index, with
# one colour per variable and one panel per company.
# seq_len() is used instead of 1:nrow() so an empty data frame yields an
# empty index rather than c(1, 0).
# NOTE(review): all eight variables share a single y-axis, so the large
# SalesKWhPerYear values compress the remaining variables towards zero.
ggplot(data, aes(x = seq_len(nrow(data)))) +
  geom_point(
    aes(y = FixedChargeCoverageRatio, color = "Fixed-charge coverage ratio"),
    size = 3
  ) +
  geom_point(
    aes(y = RateOfReturnOnCapital, color = "Rate of return on capital"),
    size = 3
  ) +
  geom_point(
    aes(y = CostPerKWCapacityInPlace, color = "Cost per KW capacity in place"),
    size = 3
  ) +
  geom_point(
    aes(y = AnnualLoadFactor, color = "Annual load factor"),
    size = 3
  ) +
  geom_point(
    aes(
      y = PeakKWhDemandGrowthFrom1974To1975,
      color = "Peak kWh demand growth from 1974 to 1975"
    ),
    size = 3
  ) +
  geom_point(
    aes(y = SalesKWhPerYear, color = "Sales (kWh use per year)"),
    size = 3
  ) +
  geom_point(
    aes(y = PercentNuclear, color = "Percent nuclear"),
    size = 3
  ) +
  geom_point(
    aes(
      y = TotalFuelCostsCentsPerKWh,
      color = "Total fuel costs (cents per kWh)"
    ),
    size = 3
  ) +
  labs(
    title = "Raw Data Plot",
    x = "Observation",
    y = "Value",
    color = "Variable",
    subtitle = "Company Names"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom", # Legend below the panels for readability
    axis.text.x = element_blank() # The observation index itself is not informative
  ) +
  facet_wrap(~Company, ncol = 2) # One panel per company (names are unique per row)
The plot shows the value of each variable for each company (there is one
observation per company), allowing you to observe patterns, trends, and
differences across the different variables and companies.
# Assumes the libraries above are loaded and `data` has been read and renamed.
# Descriptive statistics for the fixed-charge coverage ratio only:
# mean, median, standard deviation, and the first and third quartiles.
summary_data <- summarise(
  data,
  Mean = mean(FixedChargeCoverageRatio),
  Median = median(FixedChargeCoverageRatio),
  SD = sd(FixedChargeCoverageRatio),
  Q1 = quantile(FixedChargeCoverageRatio, probs = 0.25),
  Q3 = quantile(FixedChargeCoverageRatio, probs = 0.75)
)
# Show the one-row summary table
print(summary_data)
## Mean Median SD Q1 Q3
## 1 1.114091 1.11 0.1845112 1.0425 1.19
# Box plot of the fixed-charge coverage ratio across all companies.
# Each company contributes a single observation, so grouping the boxes by
# Company would draw one degenerate, single-point "box" per company and hide
# the distribution. A single box over all companies shows the median,
# quartiles, and any outliers of the variable.
ggplot(data, aes(x = "", y = FixedChargeCoverageRatio)) +
  geom_boxplot(fill = "red", color = "black") +
  labs(
    title = "Box Plot - Fixed Charge Coverage Ratio",
    x = "All companies",
    y = "Fixed Charge Coverage Ratio"
  )
This code calculates summary measures (Mean, Median, Standard Deviation,
Q1, and Q3) for the “FixedChargeCoverageRatio” variable and then creates
a box plot to visualize its distribution across different companies.
The same approach can be extended to all variables in your dataset: calculating the summary measures (Mean, Median, Standard Deviation, Q1, and Q3) for each variable and creating a set of box plots, each representing a different variable, would provide insights into the distribution, central tendency, and variability of each variable across different companies.
# Assumes `data` has been read and renamed as above.
# Check whether the sample variance-covariance matrix of the eight numeric
# columns (columns 1-8; column 9 holds the company name) is positive definite.
cov_matrix <- cov(data[, 1:8])
# A covariance matrix is symmetric, and only its eigenvalues are needed here
eigenvalues <- eigen(cov_matrix, symmetric = TRUE, only.values = TRUE)$values
# Floating-point eigenvalues of a singular matrix can come out as tiny
# positive numbers, so compare against a tolerance scaled to the largest
# eigenvalue instead of testing strictly against zero.
pd_tolerance <- length(eigenvalues) * max(abs(eigenvalues)) *
  .Machine$double.eps
is_positive_definite <- all(eigenvalues > pd_tolerance)
# Print the result
if (is_positive_definite) {
  print("The variance-covariance matrix is positive definite.")
} else {
  print("The variance-covariance matrix is not positive definite.")
}
## [1] "The variance-covariance matrix is positive definite."
This code calculates the variance-covariance matrix of the numeric columns of the dataset and then checks whether all of its eigenvalues are positive. If all eigenvalues are positive, it prints “The variance-covariance matrix is positive definite.” Otherwise, it prints “The variance-covariance matrix is not positive definite.”
# Assumes `data` has been read and renamed as above.
# Covariance of the eight numeric measurements (columns 1-8; the ninth
# column is the company name and is left out).
cov_matrix <- cov(data[1:8])
# Eigen decomposition of the covariance matrix
eigen_decomp <- eigen(cov_matrix)
# Pull the two components out of the result for reporting
eigenvalues <- eigen_decomp[["values"]]
eigenvectors <- eigen_decomp[["vectors"]]
# Report the results: the eigenvalues first, then the matrix of eigenvectors.
# The lines starting with "##" appear to be console output captured from a
# previous run of the statement directly above them.
cat("Eigenvalues:\n")
## Eigenvalues:
# In the captured run the first eigenvalue is several orders of magnitude
# larger than the others.
print(eigenvalues)
## [1] 1.260243e+07 1.703141e+03 2.400068e+02 1.600796e+01 7.743577e+00
## [6] 3.906911e+00 1.225821e-01 1.499089e-02
cat("\nEigenvectors:\n")
##
## Eigenvectors:
# Per the eigen() documentation, each column of the matrix is the eigenvector
# for the eigenvalue at the same position.
print(eigenvectors)
## [,1] [,2] [,3] [,4] [,5]
## [1,] -7.883140e-06 0.0004460932 -0.0001146357 0.0057978329 -0.0198566131
## [2,] -6.081397e-06 0.0186257078 -0.0412535878 -0.0292444838 -0.2028309715
## [3,] 3.247724e-04 -0.9974928360 0.0566502956 0.0179103135 -0.0355836487
## [4,] -3.618357e-04 -0.0111104272 0.0964680806 -0.9930009368 -0.0495177973
## [5,] 1.549616e-04 -0.0326730808 0.0038575008 -0.0544730799 0.9768581322
## [6,] 9.999983e-01 0.0002209801 -0.0017377455 -0.0005270008 -0.0001471164
## [7,] -1.767632e-03 -0.0589056695 -0.9927317841 -0.0949073699 0.0057261758
## [8,] -8.780470e-05 -0.0001659524 0.0157634569 -0.0276496391 0.0215054038
## [,6] [,7] [,8]
## [1,] 0.0583722527 1.002990e-01 9.930280e-01
## [2,] 0.9735822744 5.984233e-02 -6.717166e-02
## [3,] 0.0144563569 9.986723e-04 -1.312104e-03
## [4,] -0.0333700701 -2.930752e-02 9.745357e-03
## [5,] 0.2038187554 -8.898790e-03 8.784363e-03
## [6,] -0.0001237088 9.721241e-05 5.226863e-06
## [7,] -0.0430954352 1.043775e-02 2.059461e-03
## [8,] -0.0633116915 9.926283e-01 -9.594372e-02
# Load the required library for the Shapiro-Wilk test
library(mvnormtest)
# Assumes `data` has been read and renamed as above.
# Test the eight numeric measurements (columns 1-8) for multivariate normality.
numeric_data <- data[, 1:8]
# The mvnormtest package loaded above provides mshapiro.test(), not
# mvShapiroTest(). It expects a numeric matrix with variables in rows and
# observations in columns, hence the transpose.
shapiro_test <- mvnormtest::mshapiro.test(t(as.matrix(numeric_data)))
# Print the test results
print(shapiro_test)
# Extract the p-value of the test
p_value <- shapiro_test$p.value
# Set the significance level (alpha) for the test
alpha <- 0.05
# Take necessary steps if the data is non-normal
if (p_value < alpha) {
  print("The data is not multivariate normal.")
  # If the data is non-normal, consider applying data transformations
  # or using non-parametric methods for further analysis.
} else {
  # Failing to reject normality is not proof of it, but the usual normality
  # assumptions (e.g. linear regression or MANOVA) are then not contradicted.
  print("The data is multivariate normal.")
}
# Assumes `data` has been read and renamed as above.
# Cluster the companies on their eight numeric measurements (columns 1-8).
numeric_data <- data[1:8]
# Standardise every column (mean 0, standard deviation 1) so that variables
# measured on large scales do not dominate the distance computation.
scaled_data <- scale(numeric_data)
# Number of clusters requested from k-means; adjust to suit the analysis
k <- 3
# Fix the random seed so the random initial centres are reproducible
set.seed(123)
kmeans_result <- kmeans(x = scaled_data, centers = k)
# Store each company's cluster label as a factor column on the original data
data[["Cluster"]] <- factor(kmeans_result[["cluster"]])
# Visualize the grouping information by a plot
library(ggplot2)