library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(readr)
# Load the lunar crater database from the working directory; the first row of
# the CSV supplies the column names.
lunarcrater <- read.csv(
  file = "lunar_crater_database_robbins_2018.csv",
  header = TRUE
)

Works Cited:

Direct dataset: https://dwjtvz5c9xobz.cloudfront.net/moon_lro.kaguya_multi_craterdatabase_robbins_2018/data/lunar_crater_database_robbins_2018.csv

Website: https://www.data-is-plural.com/archive/2023-11-22-edition/

Question: How do crater diameter and latitude predict the crater rim?

Crater Rim = Diameter+Latitude

Introduction: The dataset I chose describes the dimensions and locations of lunar impact craters. The original dataset has 1296796 cases and 21 columns; each case is one crater. The variables used for this project are LAT_CIRC_IMG, PTS_RIM_IMG, and DIAM_CIRC_IMG. LAT_CIRC_IMG stands for the latitude of the crater, PTS_RIM_IMG represents the crater rim, and DIAM_CIRC_IMG represents the crater diameter. I chose this topic because I personally enjoy learning about space and the planets. It is interesting to get some insight into these craters, to see how big they are, and to find out whether these variables can predict the crater rim.

#1 Data exploration

# Number of rows (craters) and columns in the raw data.
dim(lunarcrater)
## [1] 1296796      21
# List every column name available in the raw data.
colnames(lunarcrater)
##  [1] "CRATER_ID"              "LAT_CIRC_IMG"           "LON_CIRC_IMG"          
##  [4] "LAT_ELLI_IMG"           "LON_ELLI_IMG"           "DIAM_CIRC_IMG"         
##  [7] "DIAM_CIRC_SD_IMG"       "DIAM_ELLI_MAJOR_IMG"    "DIAM_ELLI_MINOR_IMG"   
## [10] "DIAM_ELLI_ECCEN_IMG"    "DIAM_ELLI_ELLIP_IMG"    "DIAM_ELLI_ANGLE_IMG"   
## [13] "LAT_ELLI_SD_IMG"        "LON_ELLI_SD_IMG"        "DIAM_ELLI_MAJOR_SD_IMG"
## [16] "DIAM_ELLI_MINOR_SD_IMG" "DIAM_ELLI_ANGLE_SD_IMG" "DIAM_ELLI_ECCEN_SD_IMG"
## [19] "DIAM_ELLI_ELLIP_SD_IMG" "ARC_IMG"                "PTS_RIM_IMG"
# Count the missing values in each column.
lunarcrater %>%
  is.na() %>%
  colSums()
##              CRATER_ID           LAT_CIRC_IMG           LON_CIRC_IMG 
##                      0                      0                      0 
##           LAT_ELLI_IMG           LON_ELLI_IMG          DIAM_CIRC_IMG 
##                     38                     38                      0 
##       DIAM_CIRC_SD_IMG    DIAM_ELLI_MAJOR_IMG    DIAM_ELLI_MINOR_IMG 
##                      0                     38                     38 
##    DIAM_ELLI_ECCEN_IMG    DIAM_ELLI_ELLIP_IMG    DIAM_ELLI_ANGLE_IMG 
##                     38                     38                     38 
##        LAT_ELLI_SD_IMG        LON_ELLI_SD_IMG DIAM_ELLI_MAJOR_SD_IMG 
##                      0                      0                     38 
## DIAM_ELLI_MINOR_SD_IMG DIAM_ELLI_ANGLE_SD_IMG DIAM_ELLI_ECCEN_SD_IMG 
##                     38                     38                     38 
## DIAM_ELLI_ELLIP_SD_IMG                ARC_IMG            PTS_RIM_IMG 
##                     38                      0                      0

#2 Cleaning the data

# Restrict the data to the three modelling variables and keep only the rows
# where all three are present.
lunarcrater_2 <- lunarcrater %>%
  select(LAT_CIRC_IMG, PTS_RIM_IMG, DIAM_CIRC_IMG) %>%
  filter(
    !is.na(LAT_CIRC_IMG),
    !is.na(PTS_RIM_IMG),
    !is.na(DIAM_CIRC_IMG)
  )
dim(lunarcrater_2)
## [1] 1296796       3
# Preview the first rows of the cleaned three-column data.
head(lunarcrater_2)
##   LAT_CIRC_IMG PTS_RIM_IMG DIAM_CIRC_IMG
## 1    -19.83040        8088       940.960
## 2     44.77630        2785       249.840
## 3     57.08660        5199       599.778
## 4      1.96124        4341       558.762
## 5    -49.14960        5933       654.332
## 6    -35.60240        4433       505.312

#3 Summarization

library(dplyr)

# One-row summary of the cleaned data: the number of craters and the mean of
# each of the three modelling variables.
summary_mean <- summarise(
  lunarcrater_2,
  count = n(),
  mean_lat = mean(LAT_CIRC_IMG),
  mean_rim = mean(PTS_RIM_IMG),
  mean_diam = mean(DIAM_CIRC_IMG)
)
print(summary_mean)
##     count  mean_lat mean_rim mean_diam
## 1 1296796 -1.317424 18.37816  2.436963

Data Analysis: In the first chunk, we explore the data. The original “lunarcrater” dataset has 1296796 craters (cases) and 21 columns. We listed all 21 column names and then used colSums to check every column for missing values; the three columns needed for the model, “PTS_RIM_IMG ~ DIAM_CIRC_IMG + LAT_CIRC_IMG”, had 0 missing values. In the next chunk, we created a cleaned copy of the dataset (lunarcrater_2): we used select to keep only the three columns we need and, as a standard EDA procedure, used filter to remove any NAs from those columns (none of the three columns had any NAs, so no rows were dropped). Lastly, we summarized the data using summarize and got four columns: count, mean_lat, mean_rim and mean_diam. On average, craters have a latitude of -1.32, a rim value of 18.38 and a diameter of 2.44.

#Multiple Linear Regression (Fitting and Interpretation)

# Regress PTS_RIM_IMG on DIAM_CIRC_IMG and LAT_CIRC_IMG with a linear model.
# The formula is written inline so the call recorded in the model summary
# shows the actual variables.
multiple_model <- lm(
  formula = PTS_RIM_IMG ~ DIAM_CIRC_IMG + LAT_CIRC_IMG,
  data = lunarcrater_2
)

# Residual summary, coefficient table and overall fit statistics.
summary(multiple_model)
## 
## Call:
## lm(formula = PTS_RIM_IMG ~ DIAM_CIRC_IMG + LAT_CIRC_IMG, data = lunarcrater_2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8811.9    -3.8    -2.2     0.3  4276.8 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   8.5463429  0.0212666  401.87   <2e-16 ***
## DIAM_CIRC_IMG 4.0415467  0.0035238 1146.93   <2e-16 ***
## LAT_CIRC_IMG  0.0131176  0.0004464   29.38   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.15 on 1296793 degrees of freedom
## Multiple R-squared:  0.5037, Adjusted R-squared:  0.5037 
## F-statistic: 6.58e+05 on 2 and 1296793 DF,  p-value: < 2.2e-16

#3 Checking Assumptions (Homoscedasticity and Normality)

# Show the model's diagnostic plots in a 2x2 grid. par() returns the previous
# settings when it changes them, so keep that value and restore it afterwards
# rather than assuming the earlier layout was 1x1.
old_par <- par(mfrow = c(2, 2))
plot(multiple_model)
par(old_par)

#3 Checking Multicollinearity

library(corrplot)
## corrplot 0.95 loaded
 # Pairwise correlation between the two predictors, used as the
 # multicollinearity check.
 lunarcrater_2 %>%
   select(DIAM_CIRC_IMG, LAT_CIRC_IMG) %>%
   cor(use = "complete.obs")
##               DIAM_CIRC_IMG LAT_CIRC_IMG
## DIAM_CIRC_IMG   1.000000000 -0.004646066
## LAT_CIRC_IMG   -0.004646066  1.000000000

Statistical Analysis: The method I selected was multiple linear regression. I selected this statistical approach because my dependent variable (crater rim) is continuous and I wanted to find out how two independent variables (crater diameter and latitude) predict the crater rim. By opting for multiple linear regression, I was able to see how much effect each variable had on crater rim while isolating that effect from the effect of the other variable. Also, this approach worked well with my large dataset: it effectively provided the statistical significance of each predictor and let me understand the meaning of the coefficients in relation to this question, among many other insightful statistical results that helped me answer my question.

Conclusion and Future Directions: The key finding of my analysis was that crater diameter and latitude do predict the crater rim. The results essentially showed that larger craters tended to have higher rims. We found that diameter had the most effect on the prediction of crater rim and latitude had the least. In regard to potential avenues for future research, it would be very helpful to look at the impact of other variables, such as crater depth, to get a more accurate prediction, because with only two variables we are limited in what we can explain. Researchers could also look at how much factors like erosion or tectonics play into the prediction of crater rim.