library(readxl)
# Load the "LAND USE" sheet of the workbook once. The identical read_excel()
# call and assignment were previously executed twice, which only repeated the
# file I/O and produced the same objects.
# NOTE(review): the path is an absolute, machine-specific location; the script
# will only run where this file exists at exactly this path.
DATA_SET_2 <- read_excel("C:/Users/micha/Desktop/SCHOOL/Data/DATA SET 2.xlsx", sheet = "LAND USE")
# LAND_USE is the name the later tests and the linear model refer to.
LAND_USE <- DATA_SET_2
# Read the ACRES workbook, then coerce its columns to the types used later.
ACRES <- read_excel("C:/Users/micha/Desktop/SCHOOL/Data/ACRES.xlsx")
# Value becomes numeric; entries that cannot be parsed turn into NA.
ACRES[["Value"]] <- as.numeric(ACRES[["Value"]])
## Warning: NAs introduced by coercion
# The descriptive columns are converted to factors.
ACRES[["Geo.Level"]] <- as.factor(ACRES[["Geo.Level"]])
ACRES[["Data.Item"]] <- as.factor(ACRES[["Data.Item"]])
ACRES[["Domain"]] <- as.factor(ACRES[["Domain"]])
# Display the rows whose Value is NA after the coercion.
ACRES[is.na(ACRES[["Value"]]), ]
## # A tibble: 2 × 6
##    Year Geo.Level Data.Item                              Domain Value CV.PERCENT
##   <dbl> <fct>     <fct>                                  <fct>  <dbl>      <dbl>
## 1  2017 NATIONAL  AG LAND, OWNED, IN FARMS - ACRES       PRODU…    NA  506030406
## 2  2017 NATIONAL  AG LAND, RENTED FROM OTHERS, IN FARMS… PRODU…    NA  341202221
# Keep a working copy of ACRES, then build a version of it in which every row
# containing an NA has been dropped.
ACRES_sub <- ACRES
ACRES_sub_clean <- na.omit(ACRES_sub)

Introduction

This R Markdown document summarizes the code and process output for the original data collected to research the decline of U.S. agricultural land. The data tested were collected from the USDA Cropland Data Layer (CDL) and the U.S. Census of Agriculture.

## T-Test of Datasets

The following processes were executed in R to evaluate if there is a significant difference between variables within the CDL and U.S. Census of Agriculture Data sets.

# Print per-column summary statistics for CROPLAND_ALLDATA.
# NOTE(review): CROPLAND_ALLDATA is not created in the visible part of this
# document; it must already exist in the session before this line runs.
summary(CROPLAND_ALLDATA)
##      Value          Category          Pixel.count         Area..acres.      
##  Min.   :  1.00   Length:97          Min.   :1.000e+00   Min.   :        0  
##  1st Qu.: 35.00   Class :character   1st Qu.:7.309e+03   1st Qu.:     1626  
##  Median : 66.00   Mode  :character   Median :1.348e+05   Median :    29974  
##  Mean   : 98.82                      Mean   :8.060e+07   Mean   : 17924349  
##  3rd Qu.:176.00                      3rd Qu.:1.219e+07   3rd Qu.:  2711993  
##  Max.   :250.00                      Max.   :3.084e+09   Max.   :685820981
# Two-sample t-test comparing the mean of Pixel.count with the mean of
# Area..acres.; with these default arguments R runs the unpaired Welch test.
# NOTE(review): both columns come from the same rows of CROPLAND_ALLDATA and
# appear to be in different units (pixels vs. acres) - confirm that an unpaired
# comparison of their means is the intended analysis.
t.test(CROPLAND_ALLDATA$Pixel.count,CROPLAND_ALLDATA$Area..acres.)
## 
##  Welch Two Sample t-test
## 
## data:  CROPLAND_ALLDATA$Pixel.count and CROPLAND_ALLDATA$Area..acres.
## t = 1.6375, df = 105.47, p-value = 0.1045
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -13212415 138557529
## sample estimates:
## mean of x mean of y 
##  80596906  17924349

Summary of t-test

The p-value of 0.1045 is greater than 0.05, so the difference in means between the two variables is not statistically significant.

## Wilcoxon Test

Wilcoxon rank-sum test on CENSUS_DATA and CDL CROPLAND_ALLDATA, comparing acre variables

# Convert Value to numeric by way of character; entries that cannot be parsed
# as numbers become NA (hence the coercion warning).
CENSUS_DATA[["Value"]] <- as.numeric(as.character(CENSUS_DATA[["Value"]]))
## Warning: NAs introduced by coercion
# Unpaired (paired = FALSE) Wilcoxon rank-sum test comparing CENSUS_DATA$Value
# with the acreage column of CROPLAND_ALLDATA.
wilcox.test(CENSUS_DATA$Value,CROPLAND_ALLDATA$Area..acres.,paired = FALSE)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  CENSUS_DATA$Value and CROPLAND_ALLDATA$Area..acres.
## W = 131143, p-value = 9.695e-05
## alternative hypothesis: true location shift is not equal to 0

The p-value of 9.695e-05 is smaller than 0.05, which indicates a statistically significant difference between the variables.

## Wilcoxon Test on U.S. Census LAND_USE and CENSUS_DATA Variables

Wilcoxon rank-sum test on LAND_USE and CENSUS_DATA, comparing the total and acre variables

# Unpaired (paired = FALSE) Wilcoxon rank-sum test comparing LAND_USE$TOTAL
# with CENSUS_DATA$Value.
wilcox.test(LAND_USE$TOTAL, CENSUS_DATA$Value, paired = FALSE)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  LAND_USE$TOTAL and CENSUS_DATA$Value
## W = 43165, p-value = 0.0002141
## alternative hypothesis: true location shift is not equal to 0

## Summary

The p-value of 0.0002141 is smaller than 0.05, which indicates a statistically significant difference between the variables.

Model for Land Use

The following is the summary for the linear model for the LAND_USE Data set

# Fit a linear model of TOTAL on the `1.TO.9` and `2000.OR.MORE` columns of
# LAND_USE, then print the coefficient table and fit statistics.
model_LAND_USE <- lm(
  TOTAL ~ `1.TO.9` + `2000.OR.MORE`,
  data = LAND_USE
)
summary(model_LAND_USE)
## 
## Call:
## lm(formula = TOTAL ~ `1.TO.9` + `2000.OR.MORE`, data = LAND_USE)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -59449822 -10742131   1293920   4476144  55430198 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -4.192e+06  6.157e+06  -0.681  0.50219    
## `1.TO.9`        1.996e+02  6.690e+01   2.984  0.00628 ** 
## `2000.OR.MORE`  1.283e+00  1.059e-01  12.109 5.91e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23710000 on 25 degrees of freedom
## Multiple R-squared:  0.9578, Adjusted R-squared:  0.9544 
## F-statistic: 283.7 on 2 and 25 DF,  p-value: < 2.2e-16

Summary of Linear Model Values

Adjusted R Squared = 0.9544: Suggests that the predictors are meaningful and not over fitted.

Multiple R Squared = 0.9578: The R squared value suggests model is a good fit.

Adjusted R Squared = 0.9544: After adjusting for the number of predictors, the value still suggests a good fit.

The p-value (< 2.2e-16) indicates statistical significance and that a large portion of the variance can be explained by the linear model.

## Random Forest

The following is a summary of the random forest model on the ACRES data set

library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
# Fit a regression random forest that predicts Value from every other column
# of the NA-free ACRES data. importance = TRUE makes the model keep the
# variable-importance measures that varImpPlot() draws later.
# The seed is fixed first because the fit depends on random resampling; without
# it, the error and "% Var explained" figures change on every run and the
# report cannot be reproduced.
set.seed(42)
rf_model <- randomForest(Value ~ ., data = ACRES_sub_clean, importance = TRUE)
print(rf_model)
## 
## Call:
##  randomForest(formula = Value ~ ., data = ACRES_sub_clean, importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 4.57948e+15
##                     % Var explained: 48.95

Model Plots

Random Forest Plot of variables

# Draw the variable-importance plot for the fitted random forest (rf_model was
# fitted with importance = TRUE).
varImpPlot(rf_model)

## Model Plots

# Draw the standard diagnostic plots for the fitted linear model
# (model_LAND_USE is an lm object, so plot() dispatches to the lm method).
plot(model_LAND_USE)