# This analysis examines the relationship between variables in our
# National Park visitor survey data.
# Load required packages
library(ggplot2)
library(dplyr)
# Read the survey data. The path is relative, so it is resolved against the
# current working directory.
survey_data <- read.csv("cloud1.csv")
# Display the column names. The header alone prints nothing useful, so the
# names are printed explicitly right after it.
cat("Available columns in the dataset:\n")
print(names(survey_data))
# Inspect the data before modelling: first rows, per-column summaries, and
# the column types. Explicit print() keeps the output visible when the script
# is run with source(), where top-level values are not auto-printed.
print(head(survey_data))
print(summary(survey_data))
str(survey_data)
## Available columns in the dataset:
## [1] "Knowledge" "Visited" "Oldest_Park" "ID"
## Knowledge Visited Oldest_Park ID
## 1 6 1 3 1
## 2 5 1 3 2
## 3 5 2 4 3
## 4 5 1 3 4
## 5 8 1 2 5
## 6 6 1 1,3 6
## Knowledge Visited Oldest_Park ID
## Min. :3.00 Min. :1.0 Length:20 Min. : 1.00
## 1st Qu.:5.00 1st Qu.:1.0 Class :character 1st Qu.: 5.75
## Median :6.00 Median :1.0 Mode :character Median :10.50
## Mean :5.65 Mean :1.3 Mean :10.50
## 3rd Qu.:6.00 3rd Qu.:2.0 3rd Qu.:15.25
## Max. :8.00 Max. :2.0 Max. :20.00
## 'data.frame': 20 obs. of 4 variables:
## $ Knowledge : int 6 5 5 5 8 6 5 5 6 6 ...
## $ Visited : int 1 1 2 1 1 1 1 1 2 2 ...
## $ Oldest_Park: chr "3" "3" "4" "3" ...
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
# Find numeric columns for our analysis.
# vapply() always yields a logical vector, even for a zero-column data frame,
# where sapply() would return an empty list that cannot be used as an index.
numeric_cols <- vapply(survey_data, is.numeric, logical(1))
# drop = FALSE keeps the result a data frame even when exactly one column is
# numeric; without it the subset collapses to a bare vector and a later
# ncol() call returns NULL.
numeric_data <- survey_data[, numeric_cols, drop = FALSE]
# Show available numeric columns
cat("Available numeric columns for regression:\n")
print(names(numeric_data))
## Available numeric columns for regression:
## [1] "Knowledge" "Visited" "ID"
# Choose the regression variables by position among the numeric columns.
# If we have at least 2 numeric columns, we can proceed.
# NCOL() is used instead of ncol() because it treats a bare vector as a single
# column; ncol() would return NULL for a vector and make the condition fail
# with "argument is of length zero".
# NOTE: the choice is purely positional, so any numeric column (including an
# identifier such as ID) is picked if it happens to come first or second.
if (NCOL(numeric_data) >= 2) {
  # Use the first column as our dependent variable (what we want to predict)
  dependent_var <- names(numeric_data)[1]
  # Use the second column as our independent variable (predictor)
  independent_var <- names(numeric_data)[2]
  cat("\nSelected for analysis:\n")
  cat("- Dependent variable (outcome):", dependent_var, "\n")
  cat("- Independent variable (predictor):", independent_var, "\n")
} else {
  # dependent_var / independent_var are intentionally left undefined here;
  # the later steps test for their existence before running.
  cat("Not enough numeric columns found for regression analysis.\n")
}
##
## Selected for analysis:
## - Dependent variable (outcome): Knowledge
## - Independent variable (predictor): Visited
# Fit a simple linear regression of the selected outcome on the selected
# predictor and report the results.
# Only run the regression if we identified suitable variables
if (exists("dependent_var") && exists("independent_var")) {
  # Create a formula string using the identified column names
  formula_str <- paste(dependent_var, "~", independent_var)
  cat("Regression formula:", formula_str, "\n\n")
  # Run the regression model
  model <- lm(as.formula(formula_str), data = survey_data)
  # Display regression results. The explicit print() is required: inside a
  # braced block only the value of the last expression can be auto-printed,
  # so a bare summary(model) here would be silently discarded.
  print(summary(model))
  # Print a clear interpretation: element 1 of the coefficient vector is the
  # intercept, element 2 is the slope for the predictor.
  coefficients <- coef(model)
  cat("\nInterpretation:\n")
  cat(
    "- When", independent_var, "is zero,", dependent_var,
    "is estimated to be", round(coefficients[[1]], 2), "\n"
  )
  cat(
    "- For each one-unit increase in", independent_var, ",", dependent_var,
    "changes by", round(coefficients[[2]], 2), "\n"
  )
}
## Regression formula: Knowledge ~ Visited
##
##
## Interpretation:
## - When Visited is zero, Knowledge is estimated to be 4.69
## - For each one-unit increase in Visited , Knowledge changes by 0.74
# Scatter plot of the selected predictor (x) against the selected outcome (y)
# with a fitted linear trend line and its confidence band.
# Create a visualization if we have the variables
if (exists("dependent_var") && exists("independent_var")) {
  # The column names are held in strings, so they are looked up through the
  # .data pronoun; aes_string() is deprecated in current ggplot2 releases.
  relationship_plot <- ggplot(
    survey_data,
    aes(x = .data[[independent_var]], y = .data[[dependent_var]])
  ) +
    geom_point(color = "#3498db", size = 3, alpha = 0.7) +
    # formula is stated explicitly (it is the default for method = "lm") so
    # geom_smooth() does not emit its "using formula" message.
    geom_smooth(
      method = "lm",
      formula = y ~ x,
      color = "#e74c3c",
      fill = "#e74c3c",
      alpha = 0.2
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(size = 16, face = "bold"),
      axis.title = element_text(size = 12, face = "bold")
    ) +
    labs(
      title = paste(
        "Relationship Between", independent_var, "and", dependent_var
      ),
      x = independent_var,
      y = dependent_var,
      caption = "Source: National Park Visitor Survey 2025"
    )
  # Print explicitly so the plot is drawn even when the script is run through
  # source(), where values are not auto-printed.
  print(relationship_plot)
}