W2FinalOutcome

"Question 4.1
Describe a situation or problem from your job, everyday life, current events, etc., for which a clustering model would be appropriate. List some (up to 5) predictors that you might use."

## [1] "Question 4.1\nDescribe a situation or problem from your job, everyday life, current events, etc., for which a clustering model would be appropriate. List some (up to 5) predictors that you might use."

"Police officers raided a house and found 100 illegally bagged finds of sharks. Categorize them based on the skin color, length, width, weight, age of victims"

## [1] "Police officers raided a house and found 100 illegally bagged finds of sharks. Categorize them based on the skin color, length, width, weight, age of victims"

"Now, let's load the dataset."

## [1] "Now, let's load the dataset."

# Load the datasets library
library(datasets)

# Load the iris dataset
data(iris)

#Now, let's make sure it has been successfully loaded. 

head(iris)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

#Now, let's have a look at the structure of the data. 

str(iris)

## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

#Now, let's have a look at the summary of the data. 
summary(iris)

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
##

#Next, let's have a look at the data visually in a scatterplot to find out what categories are grouped together best. 

pairs(iris[, 1:4])

"The conlusion of the graph is that Petal Length vs Petal Width have a very clean gap between groups. 
Sepal Length vs Petal Length also shows moderate separation, but we see some kind of an overlap too. 
"

## [1] "The conlusion of the graph is that Petal Length vs Petal Width have a very clean gap between groups. \nSepal Length vs Petal Length also shows moderate separation, but we see some kind of an overlap too. \n"

"Now, let's find out K Means with different center points."

## [1] "Now, let's find out K Means with different center points."

# Run k-means clustering using Petal Length and Petal Width
kmeans_result1 <- kmeans(iris[, 3:4], centers = 1)

kmeans_result1

## K-means clustering with 1 clusters of sizes 150
## 
## Cluster means:
##   Petal.Length Petal.Width
## 1        3.758    1.199333
## 
## Clustering vector:
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1
## 
## Within cluster sum of squares by cluster:
## [1] 550.8953
##  (between_SS / total_SS =  -0.0 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

# I just realized putting 1 as the center means everything is put together as one cluster/group!
#We really don't want that!

kmeans_result2 <- kmeans(iris[, 3:4], centers = 2)

kmeans_result2

## K-means clustering with 2 clusters of sizes 99, 51
## 
## Cluster means:
##   Petal.Length Petal.Width
## 1     4.925253   1.6818182
## 2     1.492157   0.2627451
## 
## Clustering vector:
##   [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [38] 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1
## 
## Within cluster sum of squares by cluster:
## [1] 81.334141  5.056078
##  (between_SS / total_SS =  84.3 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

"Putting centers as 2 reveals that there is a good separation between the data. 
84.3% of the total spread in the data is explained by the separation between clusters which is a good value."

## [1] "Putting centers as 2 reveals that there is a good separation between the data. \n84.3% of the total spread in the data is explained by the separation between clusters which is a good value."

kmeans_result3 <- kmeans(iris[, 3:4], centers = 3)

kmeans_result3

## K-means clustering with 3 clusters of sizes 46, 54, 50
## 
## Cluster means:
##   Petal.Length Petal.Width
## 1     5.626087    2.047826
## 2     4.292593    1.359259
## 3     1.462000    0.246000
## 
## Clustering vector:
##   [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [38] 3 3 3 3 3 3 3 3 3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [75] 2 2 2 1 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 2 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 2 1 1 1 2 1 1 2 2 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1
## [149] 1 1
## 
## Within cluster sum of squares by cluster:
## [1] 15.16348 14.22741  2.02200
##  (between_SS / total_SS =  94.3 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

"Choosing centers as 3 means 94.3 % of data are well separated from each other which is even better than 2."

## [1] "Choosing centers as 3 means 94.3 % of data are well separated from each other which is even better than 2."

kmeans_result4 <- kmeans(iris[, 3:4], centers = 4)

kmeans_result4

## K-means clustering with 4 clusters of sizes 33, 52, 48, 17
## 
## Cluster means:
##   Petal.Length Petal.Width
## 1     1.372727   0.2060606
## 2     4.269231   1.3423077
## 3     5.595833   2.0375000
## 4     1.635294   0.3235294
## 
## Clustering vector:
##   [1] 1 1 1 1 1 4 1 1 1 1 1 4 1 1 1 4 1 1 4 4 4 4 1 4 4 4 4 1 1 4 4 4 1 1 1 1 1
##  [38] 1 1 1 1 1 1 4 4 1 4 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [75] 2 2 2 3 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 2 3 3 3 3
## [112] 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3
## [149] 3 3
## 
## Within cluster sum of squares by cluster:
## [1]  0.6042424 13.0576923 16.2916667  0.4894118
##  (between_SS / total_SS =  94.5 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

"Choosing centers as 4 means 96.5 % of data are well separated from each other which is even better than 3."

## [1] "Choosing centers as 4 means 96.5 % of data are well separated from each other which is even better than 3."

"Now, you might ask how to interpret the difference between 2,3, and 4?

A percentage below 60% shows Poor clustering.

A percentage btw 60% - 75% is regarded as decent clustering.

A value btw 75% - 85%Good clustering

And, anything above 85% reflect an excellent clustering."

## [1] "Now, you might ask how to interpret the difference between 2,3, and 4?\n\nA percentage below 60% shows Poor clustering.\n\nA percentage btw 60% - 75% is regarded as decent clustering.\n\nA value btw 75% - 85%Good clustering\n\nAnd, anything above 85% reflect an excellent clustering."

#So, in our case, inputting centers as 3 and 4 result in great separation/clustering of our data.

#Now, let's have a look at how well our species in the data are clustered.

# To do that, we need to create a table that shows kmean results based on each species. 

table(kmeans_result3$cluster, iris$Species)

##    
##     setosa versicolor virginica
##   1      0          2        44
##   2      0         48         6
##   3     50          0         0

#The above table shows the result with centers as 3. 
#It means we have 5 setosa, 48 versicolor and 46 virginica. 

table(kmeans_result4$cluster, iris$Species)

##    
##     setosa versicolor virginica
##   1     33          0         0
##   2      0         48         4
##   3      0          2        46
##   4     17          0         0

"Interpreting the difference between centers being 3 and 4.

Our dataset has three types of flowers. So, putting centers as four forces our data to be 

clustered into four categories while we only have three categories.


So, we should always customize our centers based on the actual types of our data.

Even though 4 gives a higher percentage, it wrongfully categorizes our flowers into four types when there are only three types."

## [1] "Interpreting the difference between centers being 3 and 4.\n\nOur dataset has three types of flowers. So, putting centers as four forces our data to be \n\nclustered into four categories while we only have three categories.\n\n\nSo, we should always customize our centers based on the actual types of our data.\n\nEven though 4 gives a higher percentage, it wrongfully categorizes our flowers into four types when there are only three types."

# Adding 50+48+46 means that 146 of our items were grouped correctly while 6 of them were misidentified. 

"Now, let's say we had 5k datapoints and 600 of them were misclassified. 

The outcome would be different if our datapoints were from medical field. 

It means a huge redflag in that situation. So, we need to evaluate & interpret the outcome based on the field we are running the kmeans for."

## [1] "Now, let's say we had 5k datapoints and 600 of them were misclassified. \n\nThe outcome would be different if our datapoints were from medical field. \n\nIt means a huge redflag in that situation. So, we need to evaluate & interpret the outcome based on the field we are running the kmeans for."

"Now, let's move to the next part of the assignment."

## [1] "Now, let's move to the next part of the assignment."

"Using crime data from the file uscrime.txt, 
test to see whether there are any outliers in the last column (number of crimes per 100,000 people).
Use the grubbs.test function in the outliers package in R."

## [1] "Using crime data from the file uscrime.txt, \ntest to see whether there are any outliers in the last column (number of crimes per 100,000 people).\nUse the grubbs.test function in the outliers package in R."

#Let's load the data. 

uscrime <- read.table("C:/Users/moham/Downloads/Georgia Tech University/Week 2/week 2 hw-summer2026/week 2 data-summer/data 5.1/uscrime.txt", header = TRUE)

#Data was loaded after a fixing a few errors. 

#Now, let's look at the rows and dims. 

head(uscrime)

##      M So   Ed  Po1  Po2    LF   M.F Pop   NW    U1  U2 Wealth Ineq     Prob
## 1 15.1  1  9.1  5.8  5.6 0.510  95.0  33 30.1 0.108 4.1   3940 26.1 0.084602
## 2 14.3  0 11.3 10.3  9.5 0.583 101.2  13 10.2 0.096 3.6   5570 19.4 0.029599
## 3 14.2  1  8.9  4.5  4.4 0.533  96.9  18 21.9 0.094 3.3   3180 25.0 0.083401
## 4 13.6  0 12.1 14.9 14.1 0.577  99.4 157  8.0 0.102 3.9   6730 16.7 0.015801
## 5 14.1  0 12.1 10.9 10.1 0.591  98.5  18  3.0 0.091 2.0   5780 17.4 0.041399
## 6 12.1  0 11.0 11.8 11.5 0.547  96.4  25  4.4 0.084 2.9   6890 12.6 0.034201
##      Time Crime
## 1 26.2011   791
## 2 25.2999  1635
## 3 24.3006   578
## 4 29.9012  1969
## 5 21.2998  1234
## 6 20.9995   682

dim(uscrime)

## [1] 47 16

str(uscrime)

## 'data.frame':    47 obs. of  16 variables:
##  $ M     : num  15.1 14.3 14.2 13.6 14.1 12.1 12.7 13.1 15.7 14 ...
##  $ So    : int  1 0 1 0 0 0 1 1 1 0 ...
##  $ Ed    : num  9.1 11.3 8.9 12.1 12.1 11 11.1 10.9 9 11.8 ...
##  $ Po1   : num  5.8 10.3 4.5 14.9 10.9 11.8 8.2 11.5 6.5 7.1 ...
##  $ Po2   : num  5.6 9.5 4.4 14.1 10.1 11.5 7.9 10.9 6.2 6.8 ...
##  $ LF    : num  0.51 0.583 0.533 0.577 0.591 0.547 0.519 0.542 0.553 0.632 ...
##  $ M.F   : num  95 101.2 96.9 99.4 98.5 ...
##  $ Pop   : int  33 13 18 157 18 25 4 50 39 7 ...
##  $ NW    : num  30.1 10.2 21.9 8 3 4.4 13.9 17.9 28.6 1.5 ...
##  $ U1    : num  0.108 0.096 0.094 0.102 0.091 0.084 0.097 0.079 0.081 0.1 ...
##  $ U2    : num  4.1 3.6 3.3 3.9 2 2.9 3.8 3.5 2.8 2.4 ...
##  $ Wealth: int  3940 5570 3180 6730 5780 6890 6200 4720 4210 5260 ...
##  $ Ineq  : num  26.1 19.4 25 16.7 17.4 12.6 16.8 20.6 23.9 17.4 ...
##  $ Prob  : num  0.0846 0.0296 0.0834 0.0158 0.0414 ...
##  $ Time  : num  26.2 25.3 24.3 29.9 21.3 ...
##  $ Crime : int  791 1635 578 1969 1234 682 963 1555 856 705 ...

#the above line helps us to look at the structure of the data. 

#Now, let's look at the summary of our data. 

summary(uscrime)

##        M               So               Ed             Po1       
##  Min.   :11.90   Min.   :0.0000   Min.   : 8.70   Min.   : 4.50  
##  1st Qu.:13.00   1st Qu.:0.0000   1st Qu.: 9.75   1st Qu.: 6.25  
##  Median :13.60   Median :0.0000   Median :10.80   Median : 7.80  
##  Mean   :13.86   Mean   :0.3404   Mean   :10.56   Mean   : 8.50  
##  3rd Qu.:14.60   3rd Qu.:1.0000   3rd Qu.:11.45   3rd Qu.:10.45  
##  Max.   :17.70   Max.   :1.0000   Max.   :12.20   Max.   :16.60  
##       Po2               LF              M.F              Pop        
##  Min.   : 4.100   Min.   :0.4800   Min.   : 93.40   Min.   :  3.00  
##  1st Qu.: 5.850   1st Qu.:0.5305   1st Qu.: 96.45   1st Qu.: 10.00  
##  Median : 7.300   Median :0.5600   Median : 97.70   Median : 25.00  
##  Mean   : 8.023   Mean   :0.5612   Mean   : 98.30   Mean   : 36.62  
##  3rd Qu.: 9.700   3rd Qu.:0.5930   3rd Qu.: 99.20   3rd Qu.: 41.50  
##  Max.   :15.700   Max.   :0.6410   Max.   :107.10   Max.   :168.00  
##        NW              U1                U2            Wealth    
##  Min.   : 0.20   Min.   :0.07000   Min.   :2.000   Min.   :2880  
##  1st Qu.: 2.40   1st Qu.:0.08050   1st Qu.:2.750   1st Qu.:4595  
##  Median : 7.60   Median :0.09200   Median :3.400   Median :5370  
##  Mean   :10.11   Mean   :0.09547   Mean   :3.398   Mean   :5254  
##  3rd Qu.:13.25   3rd Qu.:0.10400   3rd Qu.:3.850   3rd Qu.:5915  
##  Max.   :42.30   Max.   :0.14200   Max.   :5.800   Max.   :6890  
##       Ineq            Prob              Time           Crime       
##  Min.   :12.60   Min.   :0.00690   Min.   :12.20   Min.   : 342.0  
##  1st Qu.:16.55   1st Qu.:0.03270   1st Qu.:21.60   1st Qu.: 658.5  
##  Median :17.60   Median :0.04210   Median :25.80   Median : 831.0  
##  Mean   :19.40   Mean   :0.04709   Mean   :26.60   Mean   : 905.1  
##  3rd Qu.:22.75   3rd Qu.:0.05445   3rd Qu.:30.45   3rd Qu.:1057.5  
##  Max.   :27.60   Max.   :0.11980   Max.   :44.00   Max.   :1993.0

#Now, let's have a look at the very last column.

uscrime$Crime

##  [1]  791 1635  578 1969 1234  682  963 1555  856  705 1674  849  511  664  798
## [16]  946  539  929  750 1225  742  439 1216  968  523 1993  342 1216 1043  696
## [31]  373  754 1072  923  653 1272  831  566  826 1151  880  542  823 1030  455
## [46]  508  849

#Now, let's have a look at its outliers by creating a boxplot.

boxplot(uscrime$Crime, main = "Crime Rate per 100,000 people",  ylab = "Crime Rate",
        xlab = "Crime Data")

#It is a bit hard to see the values of median and percentiles. Let's make the chart more detailed. 

# Add the actual data points
stripchart(uscrime$Crime, 
           vertical = TRUE, 
           method = "jitter",
           add = TRUE, 
           pch = 20, 
           col = "blue")

#Let's experiment with it a bit more. 

# Boxplot with custom axis intervals
boxplot(uscrime$Crime, 
        main = "Crime Rate per 100,000 people",
        ylab = "Crime Rate",
        yaxt = "n")  # turns off the default y axis

# Add custom y axis with intervals of 300
axis(side = 2, 
     at = seq(0, 2100, by = 300))

# The boxplot confirms that some states had a suspiciously high crime rate compared to the rest.

# We would ideally need to investigate and find out why that is the case to reduce crime. 

#Question 6.2
"Describe a situation or problem from your job, everyday life, current events, etc., for which a 
Change Detection model would be appropriate. Applying the CUSUM technique, how would you choose the
critical value and the threshold?"

## [1] "Describe a situation or problem from your job, everyday life, current events, etc., for which a \nChange Detection model would be appropriate. Applying the CUSUM technique, how would you choose the\ncritical value and the threshold?"

" There is a meat store chain across my city. They sell not only meet, but they also offer dairy prodocuts such as milk. 
They need to make sure that their refrigerators' temprature stays within a certain range. 
So, we need to have a change detection method in place."

## [1] " There is a meat store chain across my city. They sell not only meet, but they also offer dairy prodocuts such as milk. \nThey need to make sure that their refrigerators' temprature stays within a certain range. \nSo, we need to have a change detection method in place."

#Q6.2.1
"Using July through October daily-high-temperature data for Atlanta for 1996 through 2015,
use a CUSUM approach to identify when unofficial summer ends (i.e., when the weather 
starts cooling off) each year. You can get the data that you need from the file temps.txt 
or online, for example at http://www.iweathernet.com/atlanta-weather-records or
https://www.wunderground.com/history/airport/KFTY/2015/7/1/CustomHistory.html.
You can use R if you’d like, but it’s straightforward enough that an Excel spreadsheet can easily do the job too."

## [1] "Using July through October daily-high-temperature data for Atlanta for 1996 through 2015,\nuse a CUSUM approach to identify when unofficial summer ends (i.e., when the weather \nstarts cooling off) each year. You can get the data that you need from the file temps.txt \nor online, for example at http://www.iweathernet.com/atlanta-weather-records or\nhttps://www.wunderground.com/history/airport/KFTY/2015/7/1/CustomHistory.html.\nYou can use R if you’d like, but it’s straightforward enough that an Excel spreadsheet can easily do the job too."

#6.2.1 Answer: The unofficial summer ends around late September. Please see the uploaded file in my account for further information.  


#6.2.2Use a CUSUM approach to make a judgment of whether Atlanta’s summer climate has gotten warmer in that time (and if so, when).

#6.2.2 Answer: The weather has become consistently warmer over the years. Please see the attached Excel file for detailed information. 


# Now, we are going to answer the above questions using RSTudio by Claude. 

temps <- read.table("C:/Users/moham/Downloads/Georgia Tech University/Week 2/week 2 hw-summer2026/week 2 data-summer/data 6.2/temps.txt", header = TRUE)

#Now, let's look at the rows and dims. 

head(temps)

##     DAY X1996 X1997 X1998 X1999 X2000 X2001 X2002 X2003 X2004 X2005 X2006 X2007
## 1 1-Jul    98    86    91    84    89    84    90    73    82    91    93    95
## 2 2-Jul    97    90    88    82    91    87    90    81    81    89    93    85
## 3 3-Jul    97    93    91    87    93    87    87    87    86    86    93    82
## 4 4-Jul    90    91    91    88    95    84    89    86    88    86    91    86
## 5 5-Jul    89    84    91    90    96    86    93    80    90    89    90    88
## 6 6-Jul    93    84    89    91    96    87    93    84    90    82    81    87
##   X2008 X2009 X2010 X2011 X2012 X2013 X2014 X2015
## 1    85    95    87    92   105    82    90    85
## 2    87    90    84    94    93    85    93    87
## 3    91    89    83    95    99    76    87    79
## 4    90    91    85    92    98    77    84    85
## 5    88    80    88    90   100    83    86    84
## 6    82    87    89    90    98    83    87    84

dim(temps)

## [1] 123  21

str(temps)

## 'data.frame':    123 obs. of  21 variables:
##  $ DAY  : chr  "1-Jul" "2-Jul" "3-Jul" "4-Jul" ...
##  $ X1996: int  98 97 97 90 89 93 93 91 93 93 ...
##  $ X1997: int  86 90 93 91 84 84 75 87 84 87 ...
##  $ X1998: int  91 88 91 91 91 89 93 95 95 91 ...
##  $ X1999: int  84 82 87 88 90 91 82 86 87 87 ...
##  $ X2000: int  89 91 93 95 96 96 96 91 96 99 ...
##  $ X2001: int  84 87 87 84 86 87 87 89 91 87 ...
##  $ X2002: int  90 90 87 89 93 93 89 89 90 91 ...
##  $ X2003: int  73 81 87 86 80 84 87 90 89 84 ...
##  $ X2004: int  82 81 86 88 90 90 89 87 88 89 ...
##  $ X2005: int  91 89 86 86 89 82 76 88 89 78 ...
##  $ X2006: int  93 93 93 91 90 81 80 82 84 84 ...
##  $ X2007: int  95 85 82 86 88 87 82 82 89 86 ...
##  $ X2008: int  85 87 91 90 88 82 88 90 89 87 ...
##  $ X2009: int  95 90 89 91 80 87 86 82 84 84 ...
##  $ X2010: int  87 84 83 85 88 89 94 97 96 90 ...
##  $ X2011: int  92 94 95 92 90 90 94 94 91 92 ...
##  $ X2012: int  105 93 99 98 100 98 93 95 97 95 ...
##  $ X2013: int  82 85 76 77 83 83 79 88 88 87 ...
##  $ X2014: int  90 93 87 84 86 87 89 90 90 87 ...
##  $ X2015: int  85 87 79 85 84 84 90 90 91 93 ...

#the above line helps us to look at the structure of the data. 

#Now, let's look at the summary of our data. 

summary(uscrime)

##        M               So               Ed             Po1       
##  Min.   :11.90   Min.   :0.0000   Min.   : 8.70   Min.   : 4.50  
##  1st Qu.:13.00   1st Qu.:0.0000   1st Qu.: 9.75   1st Qu.: 6.25  
##  Median :13.60   Median :0.0000   Median :10.80   Median : 7.80  
##  Mean   :13.86   Mean   :0.3404   Mean   :10.56   Mean   : 8.50  
##  3rd Qu.:14.60   3rd Qu.:1.0000   3rd Qu.:11.45   3rd Qu.:10.45  
##  Max.   :17.70   Max.   :1.0000   Max.   :12.20   Max.   :16.60  
##       Po2               LF              M.F              Pop        
##  Min.   : 4.100   Min.   :0.4800   Min.   : 93.40   Min.   :  3.00  
##  1st Qu.: 5.850   1st Qu.:0.5305   1st Qu.: 96.45   1st Qu.: 10.00  
##  Median : 7.300   Median :0.5600   Median : 97.70   Median : 25.00  
##  Mean   : 8.023   Mean   :0.5612   Mean   : 98.30   Mean   : 36.62  
##  3rd Qu.: 9.700   3rd Qu.:0.5930   3rd Qu.: 99.20   3rd Qu.: 41.50  
##  Max.   :15.700   Max.   :0.6410   Max.   :107.10   Max.   :168.00  
##        NW              U1                U2            Wealth    
##  Min.   : 0.20   Min.   :0.07000   Min.   :2.000   Min.   :2880  
##  1st Qu.: 2.40   1st Qu.:0.08050   1st Qu.:2.750   1st Qu.:4595  
##  Median : 7.60   Median :0.09200   Median :3.400   Median :5370  
##  Mean   :10.11   Mean   :0.09547   Mean   :3.398   Mean   :5254  
##  3rd Qu.:13.25   3rd Qu.:0.10400   3rd Qu.:3.850   3rd Qu.:5915  
##  Max.   :42.30   Max.   :0.14200   Max.   :5.800   Max.   :6890  
##       Ineq            Prob              Time           Crime       
##  Min.   :12.60   Min.   :0.00690   Min.   :12.20   Min.   : 342.0  
##  1st Qu.:16.55   1st Qu.:0.03270   1st Qu.:21.60   1st Qu.: 658.5  
##  Median :17.60   Median :0.04210   Median :25.80   Median : 831.0  
##  Mean   :19.40   Mean   :0.04709   Mean   :26.60   Mean   : 905.1  
##  3rd Qu.:22.75   3rd Qu.:0.05445   3rd Qu.:30.45   3rd Qu.:1057.5  
##  Max.   :27.60   Max.   :0.11980   Max.   :44.00   Max.   :1993.0

#Next step is to extract the data from the summer months. 
summer_temps <- temps[1:62, 2:21]

#Step 2: Calculate The Mean For Each Year
#Calculate yearly averages for July and August only
yearly_avg <- colMeans(summer_temps)
yearly_avg

##    X1996    X1997    X1998    X1999    X2000    X2001    X2002    X2003 
## 89.61290 86.53226 88.24194 89.62903 91.40323 86.74194 89.20968 86.22581 
##    X2004    X2005    X2006    X2007    X2008    X2009    X2010    X2011 
## 86.50000 86.98387 89.96774 91.20968 87.70968 87.11290 91.30645 92.70968 
##    X2012    X2013    X2014    X2015 
## 90.87097 84.83871 87.43548 89.41935

# Overall mean of yearly averages
overall_mean <- mean(yearly_avg)

# Standard deviation
std_dev <- sd(yearly_avg)

# K value
k <- std_dev / 2

# H threshold
h <- 5 * std_dev

# Print all values
overall_mean

## [1] 88.68306

## [1] 1.075585

## [1] 10.75585

# Initialize CUSUM
cusum <- rep(0, 20)

# Calculate CUSUM for each year
for(i in 2:20) {
  cusum[i] <- max(0, cusum[i-1] + (yearly_avg[i] - overall_mean - k))
}

# Print results
cusum

##  [1] 0.0000000 0.0000000 0.0000000 0.0000000 1.6445761 0.0000000 0.0000000
##  [8] 0.0000000 0.0000000 0.0000000 0.2090923 1.6601200 0.0000000 0.0000000
## [15] 1.5478019 4.4988297 5.6111478 0.6912078 0.0000000 0.0000000

cusum <- rep(0, 20)
for(i in 2:20) {
  cusum[i] <- max(0, cusum[i-1] + (yearly_avg[i] - overall_mean - k))
}
cusum

##  [1] 0.0000000 0.0000000 0.0000000 0.0000000 1.6445761 0.0000000 0.0000000
##  [8] 0.0000000 0.0000000 0.0000000 0.2090923 1.6601200 0.0000000 0.0000000
## [15] 1.5478019 4.4988297 5.6111478 0.6912078 0.0000000 0.0000000

#Observation:

#I compared the results from my own work in Excel and what Claude helped me to do in R Studio. 
#The two approaches give different conclusions because of the mean calculation difference.
#It shows how sensitive CUSUM is to your choice of baseline!