project03

Author

rachael berghahn

## Read the clouds dataset; stringsAsFactors = TRUE stores text columns as factors.
clouds <- read.csv(file = "clouds.csv", stringsAsFactors = TRUE)

library(dplyr)

Warning: package 'dplyr' was built under R version 4.3.3


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

library(ggplot2)
## Mean, standard deviation and number of observations of rainfall
## within each seeding group.
print(
  summarise(
    group_by(clouds, seeding),
    mean_rainfall = mean(rainfall),
    sd_rainfall = sd(rainfall),
    n = n()
  )
)

# A tibble: 2 × 4
  seeding mean_rainfall sd_rainfall     n
  <fct>           <dbl>       <dbl> <int>
1 no               4.17        3.52    12
2 yes              4.63        2.78    12

## Grouping by seeding status (seeded vs. not seeded) to compare the mean and standard deviation of rainfall

## Side-by-side boxplots of rainfall, one box per seeding group.
ggplot(
  data = clouds,
  mapping = aes(x = seeding, y = rainfall, fill = seeding)
) +
  geom_boxplot() +
  ggtitle("Difference in seeding and nonseeding") +
  xlab("Seeding") +
  ylab("Rainfall")

## Visualizing the data (rainfall distribution for seeded vs. not seeded clouds)

## Welch two-sample t-test (two-sided, unequal variances) comparing mean
## rainfall between the two seeding groups. Both options are the defaults
## of t.test(); they are spelled out here to make the choice explicit.
t.test(
  rainfall ~ seeding,
  data = clouds,
  alternative = "two.sided",
  var.equal = FALSE
)


    Welch Two Sample t-test

data:  rainfall by seeding
t = -0.3574, df = 20.871, p-value = 0.7244
alternative hypothesis: true difference in means between group no and group yes is not equal to 0
95 percent confidence interval:
 -3.154691  2.229691
sample estimates:
 mean in group no mean in group yes 
         4.171667          4.634167

## The p-value (0.7244) is above 0.05, so we fail to reject the null hypothesis: there is no statistically significant difference in mean rainfall between seeded and not seeded clouds

## Make sure the two categorical predictors (seeding and echo motion) are
## stored as factors; as.factor() returns a column unchanged if it already
## is a factor.
clouds[c("seeding", "echomotion")] <-
  lapply(clouds[c("seeding", "echomotion")], as.factor)

## Multiple linear regression to see how seeding, cloud cover, pre-wetness,
## echo motion and the suitability criterion (sne) affect rainfall.
model1 <- lm(
  formula = rainfall ~ seeding + cloudcover + prewetness + echomotion + sne,
  data = clouds
)
## No predictor is significant at the 0.05 level; sne has the smallest
## p-value (0.0771), which is significant only at the 0.1 level.
summary(model1)


Call:
lm(formula = rainfall ~ seeding + cloudcover + prewetness + echomotion + 
    sne, data = clouds)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.1158 -1.7078 -0.2422  1.3368  6.4827 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)  
(Intercept)           6.37680    2.43432   2.620   0.0174 *
seedingyes            1.12011    1.20725   0.928   0.3658  
cloudcover            0.01821    0.11508   0.158   0.8761  
prewetness            2.55109    2.70090   0.945   0.3574  
echomotionstationary  2.59855    1.54090   1.686   0.1090  
sne                  -1.27530    0.68015  -1.875   0.0771 .
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.855 on 18 degrees of freedom
Multiple R-squared:  0.3403,    Adjusted R-squared:  0.157 
F-statistic: 1.857 on 5 and 18 DF,  p-value: 0.1524

## Analysis of variance table for model1. The table is sequential: each term
## is tested after the terms listed above it, so the p-values depend on the
## order of the terms in the formula.
## No term is significant at the 0.05 level; echomotion (p = 0.071) and
## sne (p = 0.077) have the smallest p-values, which is suggestive at most
## and does not by itself show that they influence rainfall the most.
anova(model1)

Analysis of Variance Table

Response: rainfall
           Df  Sum Sq Mean Sq F value  Pr(>F)  
seeding     1   1.283  1.2834  0.1575 0.69613  
cloudcover  1  15.738 15.7377  1.9313 0.18157  
prewetness  1   0.003  0.0027  0.0003 0.98557  
echomotion  1  29.985 29.9853  3.6798 0.07108 .
sne         1  28.649 28.6491  3.5158 0.07711 .
Residuals  18 146.677  8.1487                  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

 ##filtering data to make seeded and not seeded data sets
clouds_seeded <- clouds %>% filter(seeding == "yes")
clouds_notseeded <- clouds %>% filter(seeding == "no")

## Fit a separate simple linear regression of rainfall on sne for the
## seeded and the not seeded subsets.
model_seeded <- lm(formula = rainfall ~ sne, data = clouds_seeded)
model_notseeded <- lm(formula = rainfall ~ sne, data = clouds_notseeded)

## Print the seeded model's summary so its sne coefficient can be compared
## with the one from the not seeded model.
summary(model_seeded)


Call:
lm(formula = rainfall ~ sne, data = clouds_seeded)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.0134 -1.3297 -0.3276  0.6171  4.3867 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept)  12.0202     2.9774   4.037  0.00237 **
sne          -2.2180     0.8722  -2.543  0.02921 * 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.27 on 10 degrees of freedom
Multiple R-squared:  0.3927,    Adjusted R-squared:  0.332 
F-statistic: 6.467 on 1 and 10 DF,  p-value: 0.02921

## Print the not seeded model's summary for comparison with the seeded model.
summary(model_notseeded)


Call:
lm(formula = rainfall ~ sne, data = clouds_notseeded)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.4892 -2.1762  0.2958  1.4902  7.3616 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)  
(Intercept)    7.319      3.160   2.317    0.043 *
sne           -1.046      0.995  -1.052    0.318  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 3.502 on 10 degrees of freedom
Multiple R-squared:  0.09957,   Adjusted R-squared:  0.009528 
F-statistic: 1.106 on 1 and 10 DF,  p-value: 0.3177

## The sne coefficient is -2.218 in the seeded model and -1.046 in the not seeded model, so the fitted rainfall decreases more steeply with increasing sne in the seeded model

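## The sne p-value is significant in the seeded model (0.029) but not in the not seeded model (0.318): there is evidence of a relationship between higher sne and lower rainfall in seeded clouds, while no significant relationship is detected in not seeded clouds (with only 12 observations per group, this does not show that none exists)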

## Scatter plot of rainfall against sne, coloured by seeding group, with a
## least-squares line (no confidence band) drawn for each group.
ggplot(
  data = clouds,
  mapping = aes(x = sne, y = rainfall, colour = seeding)
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  ggtitle("rainfall vs sne for seeded or not seeded")

`geom_smooth()` using formula = 'y ~ x'

## Both fitted lines slope downward: rainfall decreases as sne increases, and it decreases at a faster rate for the seeded data than for the not seeded data