Introduction

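This analysis uses simple linear regression to examine the relationship between two numeric variables in our National Park visitor survey data: `Knowledge` (the outcome) and `Visited` (the predictor).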

Loading and Exploring the Data

# Load required packages
library(ggplot2)
library(dplyr)

# Import the visitor survey responses; the relative path is resolved against
# the current working directory.
survey_data <- read.csv(file = "cloud1.csv")

# List the columns that were read in
cat("Available columns in the dataset:\n")
## Available columns in the dataset:
colnames(survey_data)
## [1] "Knowledge"   "Visited"     "Oldest_Park" "ID"

# Preview the first six rows.
# Oldest_Park contains entries such as "1,3" (row 6), which is why R reads the
# whole column as character rather than numeric.
head(survey_data, n = 6L)
##   Knowledge Visited Oldest_Park ID
## 1         6       1           3  1
## 2         5       1           3  2
## 3         5       2           4  3
## 4         5       1           3  4
## 5         8       1           2  5
## 6         6       1         1,3  6

# Per-column summary statistics
summary(survey_data)
##    Knowledge       Visited    Oldest_Park              ID       
##  Min.   :3.00   Min.   :1.0   Length:20          Min.   : 1.00  
##  1st Qu.:5.00   1st Qu.:1.0   Class :character   1st Qu.: 5.75  
##  Median :6.00   Median :1.0   Mode  :character   Median :10.50  
##  Mean   :5.65   Mean   :1.3                      Mean   :10.50  
##  3rd Qu.:6.00   3rd Qu.:2.0                      3rd Qu.:15.25  
##  Max.   :8.00   Max.   :2.0                      Max.   :20.00

Identifying Variables for Analysis

# Check the structure of the data
str(survey_data)
## 'data.frame':    20 obs. of  4 variables:
##  $ Knowledge  : int  6 5 5 5 8 6 5 5 6 6 ...
##  $ Visited    : int  1 1 2 1 1 1 1 1 2 2 ...
##  $ Oldest_Park: chr  "3" "3" "4" "3" ...
##  $ ID         : int  1 2 3 4 5 6 7 8 9 10 ...

# Flag the numeric columns. vapply() always returns a logical vector (one
# TRUE/FALSE per column), whereas sapply() changes its return type on
# empty input.
numeric_cols <- vapply(survey_data, is.numeric, logical(1))

# drop = FALSE keeps the result a data frame even when only ONE column is
# numeric. Without it the subset collapses to a plain vector, ncol() returns
# NULL, and the `if` below fails with "argument is of length zero" instead of
# reaching the "not enough columns" branch.
numeric_data <- survey_data[, numeric_cols, drop = FALSE]

# Show available numeric columns
cat("Available numeric columns for regression:\n")
## Available numeric columns for regression:
names(numeric_data)
## [1] "Knowledge" "Visited"   "ID"

# If we have at least 2 numeric columns, we can proceed.
# NOTE(review): the variables are picked purely by column position. "ID" is
# numeric too, so a different column order in the CSV would make the row
# identifier part of the model -- confirm this positional choice is intended.
if (ncol(numeric_data) >= 2) {
  # First numeric column: dependent variable (what we want to predict)
  dependent_var <- names(numeric_data)[1]

  # Second numeric column: independent variable (predictor)
  independent_var <- names(numeric_data)[2]

  cat("\nSelected for analysis:\n")
  cat("- Dependent variable (outcome):", dependent_var, "\n")
  cat("- Independent variable (predictor):", independent_var, "\n")
} else {
  cat("Not enough numeric columns found for regression analysis.\n")
}
## 
## Selected for analysis:
## - Dependent variable (outcome): Knowledge 
## - Independent variable (predictor): Visited

Running the Regression Analysis

# Fit and report a simple linear regression of dependent_var on
# independent_var. The chunk is skipped entirely when the variable-selection
# step did not define both names.
if (exists("dependent_var") && exists("independent_var")) {
  # Build the model formula as text from the identified column names
  formula_str <- paste(dependent_var, "~", independent_var)

  cat("Regression formula:", formula_str, "\n\n")

  # Run the regression model
  model <- lm(as.formula(formula_str), data = survey_data)

  # Display regression results. Inside a braced block R only auto-prints the
  # value of the LAST expression, so a bare summary(model) here is silently
  # discarded; print() makes the coefficient table actually appear.
  print(summary(model))

  # Print a plain-language interpretation of intercept and slope.
  # NOTE(review): the intercept describes the predictor at zero; the summary
  # output shows Visited only takes values from 1 to 2, so that statement is
  # an extrapolation outside the observed data.
  coefficients <- coef(model)
  cat("\nInterpretation:\n")
  cat(
    "- When", independent_var, "is zero,", dependent_var,
    "is estimated to be", round(coefficients[1], 2), "\n"
  )
  cat(
    "- For each one-unit increase in", independent_var, ",", dependent_var,
    "changes by", round(coefficients[2], 2), "\n"
  )
}
## Regression formula: Knowledge ~ Visited 
## 
## 
## Interpretation:
## - When Visited is zero, Knowledge is estimated to be 4.69 
## - For each one-unit increase in Visited , Knowledge changes by 0.74

Visualizing the Relationship

# Scatter plot of the two selected variables with a fitted linear trend line
# and its confidence band. Skipped when the variable-selection step did not
# define both names.
if (exists("dependent_var") && exists("independent_var")) {
  # aes_string() is deprecated in ggplot2; the .data pronoun is the supported
  # way to map columns whose names are held in character variables.
  relationship_plot <- ggplot(
    survey_data,
    aes(x = .data[[independent_var]], y = .data[[dependent_var]])
  ) +
    geom_point(color = "#3498db", size = 3, alpha = 0.7) +
    # An explicit formula avoids the "using formula = 'y ~ x'" message that
    # geom_smooth() emits when it has to guess.
    geom_smooth(
      method = "lm",
      formula = y ~ x,
      color = "#e74c3c",
      fill = "#e74c3c",
      alpha = 0.2
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(size = 16, face = "bold"),
      axis.title = element_text(size = 12, face = "bold")
    ) +
    labs(
      title = paste("Relationship Between", independent_var, "and", dependent_var),
      # Axis titles are set explicitly; otherwise they would default to the
      # literal mapping expressions.
      x = independent_var,
      y = dependent_var,
      caption = "Source: National Park Visitor Survey 2025"
    )

  # Print explicitly so the plot is drawn regardless of how the code is run
  # (e.g. via source(), where auto-printing does not happen).
  print(relationship_plot)
}