library(readxl)
# Read the "LAND USE" sheet of the workbook a single time; reading and
# assigning it twice produces the same objects and only repeats the file I/O.
DATA_SET_2 <- read_excel(
  "C:/Users/micha/Desktop/SCHOOL/Data/DATA SET 2.xlsx",
  sheet = "LAND USE"
)
# LAND_USE is the name the Wilcoxon test and the linear model below refer to.
LAND_USE <- DATA_SET_2
# Load the ACRES workbook (no sheet given, so readxl's default sheet is used).
ACRES <- read_excel("C:/Users/micha/Desktop/SCHOOL/Data/ACRES.xlsx")
# Force Value to numeric; anything that does not parse as a number becomes NA.
ACRES[["Value"]] <- as.numeric(ACRES[["Value"]])
## Warning: NAs introduced by coercion
# Convert the three descriptor columns to factors in one pass.
ACRES[c("Geo.Level", "Data.Item", "Domain")] <-
  lapply(ACRES[c("Geo.Level", "Data.Item", "Domain")], as.factor)
# List the rows whose Value ended up NA after the coercion.
ACRES[is.na(ACRES[["Value"]]), ]
## # A tibble: 2 × 6
## Year Geo.Level Data.Item Domain Value CV.PERCENT
## <dbl> <fct> <fct> <fct> <dbl> <dbl>
## 1 2017 NATIONAL AG LAND, OWNED, IN FARMS - ACRES PRODU… NA 506030406
## 2 2017 NATIONAL AG LAND, RENTED FROM OTHERS, IN FARMS… PRODU… NA 341202221
# Keep a working copy of ACRES, then derive the modelling table from it by
# dropping every row that contains an NA in any column.
ACRES_sub <- ACRES
ACRES_sub_clean <- na.omit(ACRES_sub)
This R Markdown document summarizes the code and process output for the original data collected to research the decline of U.S. agricultural land. The data tested were collected from the USDA Cropland Data Layer (CDL) and the U.S. Census of Agriculture.
## T-Test of Datasets
The following processes were executed in R to evaluate if there is a significant difference between variables within the CDL and U.S. Census of Agriculture Data sets.
# Print a column-by-column summary of CROPLAND_ALLDATA.
# NOTE(review): CROPLAND_ALLDATA is not loaded anywhere in the code shown
# here — confirm where it is created.
summary(CROPLAND_ALLDATA)
## Value Category Pixel.count Area..acres.
## Min. : 1.00 Length:97 Min. :1.000e+00 Min. : 0
## 1st Qu.: 35.00 Class :character 1st Qu.:7.309e+03 1st Qu.: 1626
## Median : 66.00 Mode :character Median :1.348e+05 Median : 29974
## Mean : 98.82 Mean :8.060e+07 Mean : 17924349
## 3rd Qu.:176.00 3rd Qu.:1.219e+07 3rd Qu.: 2711993
## Max. :250.00 Max. :3.084e+09 Max. :685820981
# Two-sample t-test of the mean of Pixel.count against the mean of
# Area..acres.; with no further arguments t.test() runs the unpaired Welch
# variant, as the output header shows.
# NOTE(review): both vectors are columns of the same data frame, so the
# observations are row-matched — confirm an unpaired test is what is intended.
t.test(CROPLAND_ALLDATA$Pixel.count,CROPLAND_ALLDATA$Area..acres.)
##
## Welch Two Sample t-test
##
## data: CROPLAND_ALLDATA$Pixel.count and CROPLAND_ALLDATA$Area..acres.
## t = 1.6375, df = 105.47, p-value = 0.1045
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -13212415 138557529
## sample estimates:
## mean of x mean of y
## 80596906 17924349
The p-value of 0.1045 is greater than 0.05, so the test found no statistically significant difference between the means of the two variables.
## Wilcoxon Test

Wilcoxon test on CENSUS_DATA and CDL CROPLAND_ALLDATA, comparing acre variables.
# Convert Value to numeric by way of character; entries that cannot be parsed
# as numbers become NA, which is what triggers the coercion warning below.
CENSUS_DATA$Value <- as.numeric(as.character(CENSUS_DATA$Value))
## Warning: NAs introduced by coercion
# Unpaired Wilcoxon rank-sum test of census Value against the cropland
# Area..acres. column.
wilcox.test(CENSUS_DATA$Value,CROPLAND_ALLDATA$Area..acres.,paired = FALSE)
##
## Wilcoxon rank sum test with continuity correction
##
## data: CENSUS_DATA$Value and CROPLAND_ALLDATA$Area..acres.
## W = 131143, p-value = 9.695e-05
## alternative hypothesis: true location shift is not equal to 0
## Wilcoxon Test on U.S. Census LAND_USE and CENSUS_DATA Variables

Wilcoxon test on LAND_USE and CENSUS_DATA, comparing the TOTAL and acre (Value) variables.
# Unpaired Wilcoxon rank-sum test of LAND_USE TOTAL against census Value.
wilcox.test(LAND_USE$TOTAL, CENSUS_DATA$Value, paired = FALSE)
##
## Wilcoxon rank sum test with continuity correction
##
## data: LAND_USE$TOTAL and CENSUS_DATA$Value
## W = 43165, p-value = 0.0002141
## alternative hypothesis: true location shift is not equal to 0
## Summary

The p-value of 0.0002141 is smaller than 0.05, which indicates a statistically significant difference between the two variables.
The following is the summary for the linear model for the LAND_USE Data set
# Least-squares linear model of TOTAL on two LAND_USE columns. The backticks
# are required because the predictor names start with a digit.
model_LAND_USE <- lm(`TOTAL` ~ `1.TO.9` + `2000.OR.MORE`, data = LAND_USE)
# Print the residual summary, coefficient table and overall fit statistics.
summary(model_LAND_USE)
##
## Call:
## lm(formula = TOTAL ~ `1.TO.9` + `2000.OR.MORE`, data = LAND_USE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -59449822 -10742131 1293920 4476144 55430198
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.192e+06 6.157e+06 -0.681 0.50219
## `1.TO.9` 1.996e+02 6.690e+01 2.984 0.00628 **
## `2000.OR.MORE` 1.283e+00 1.059e-01 12.109 5.91e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23710000 on 25 degrees of freedom
## Multiple R-squared: 0.9578, Adjusted R-squared: 0.9544
## F-statistic: 283.7 on 2 and 25 DF, p-value: < 2.2e-16
Adjusted R-squared = 0.9544: suggests that the predictors are meaningful and that the model is not overfitted.
Multiple R-squared = 0.9578: the R-squared value suggests that the model is a good fit.
Adjusted R-squared = 0.9544: the adjusted value is close to the multiple R-squared, which also suggests a good fit.
p-value < 2.2e-16: the F-test indicates that the model is statistically significant; together with the R-squared values, this shows that a large portion of the variance can be explained by the linear model.
## Random Forest

The following is a summary of the random forest model fitted on the ACRES data set.
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
# Fix the RNG seed before fitting: a random forest draws bootstrap samples and
# random candidate variables at each split, so without a seed the reported
# error, % Var explained and variable importances differ from run to run.
set.seed(42)
# Regress Value on every other column of the NA-free ACRES table and keep the
# importance measures so they can be plotted afterwards.
rf_model <- randomForest(Value ~ ., data = ACRES_sub_clean, importance = TRUE)
# Show the fitted forest's call, settings and out-of-bag error summary.
print(rf_model)
##
## Call:
## randomForest(formula = Value ~ ., data = ACRES_sub_clean, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 4.57948e+15
## % Var explained: 48.95
Variable importance plot for the random forest model:
# Plot the variable-importance measures stored in rf_model (the forest was
# fitted with importance = TRUE).
varImpPlot(rf_model)
## Model Plots
# Draw the diagnostic plots for the fitted linear model model_LAND_USE.
plot(model_LAND_USE)