#1- Loading Data, Data Summary, Mean, and Median

#Importing Dataset

# Load the readr package
library(readr)

# Raw GitHub location of nuclear.csv, pinned to one specific commit
url <- "https://raw.githubusercontent.com/sleepysloth12/JJim_2023RSummerBridge/e3aeac38e50b05827d573b1393bef38c65369279/Week2/nuclear.csv"

# Download the file and parse it with readr
nuclear <- read_csv(url)
## New names:
## Rows: 32 Columns: 12
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," dbl
## (12): ...1, cost, date, t1, t2, cap, pr, ne, ct, bw, cum.n, pt
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# View the data (the tibble print method shows only the first 10 of 32 rows)
print(nuclear)
## # A tibble: 32 × 12
##     ...1  cost  date    t1    t2   cap    pr    ne    ct    bw cum.n    pt
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1     1  460.  68.6    14    46   687     0     1     0     0    14     0
##  2     2  453.  67.3    10    73  1065     0     0     1     0     1     0
##  3     3  443.  67.3    10    85  1065     1     0     1     0     1     0
##  4     4  652.  68      11    67  1065     0     1     1     0    12     0
##  5     5  642.  68      11    78  1065     1     1     1     0    12     0
##  6     6  345.  67.9    13    51   514     0     1     1     0     3     0
##  7     7  272.  68.2    12    50   822     0     0     0     0     5     0
##  8     8  317.  68.4    14    59   457     0     0     0     0     1     0
##  9     9  457.  68.4    15    55   822     1     0     0     0     5     0
## 10    10  690.  68.3    12    71   792     0     1     1     1     2     0
## # ℹ 22 more rows
# Per-column minimum, quartiles, median, mean and maximum
summary(nuclear)
##       ...1            cost            date             t1       
##  Min.   : 1.00   Min.   :207.5   Min.   :67.17   Min.   : 7.00  
##  1st Qu.: 8.75   1st Qu.:310.3   1st Qu.:67.90   1st Qu.:11.75  
##  Median :16.50   Median :448.1   Median :68.42   Median :13.00  
##  Mean   :16.50   Mean   :461.6   Mean   :68.58   Mean   :13.75  
##  3rd Qu.:24.25   3rd Qu.:612.0   3rd Qu.:68.92   3rd Qu.:15.25  
##  Max.   :32.00   Max.   :881.2   Max.   :71.08   Max.   :22.00  
##        t2             cap               pr               ne      
##  Min.   :44.00   Min.   : 457.0   Min.   :0.0000   Min.   :0.00  
##  1st Qu.:56.50   1st Qu.: 745.0   1st Qu.:0.0000   1st Qu.:0.00  
##  Median :62.50   Median : 822.0   Median :0.0000   Median :0.00  
##  Mean   :62.38   Mean   : 825.4   Mean   :0.3125   Mean   :0.25  
##  3rd Qu.:70.25   3rd Qu.: 947.2   3rd Qu.:1.0000   3rd Qu.:0.25  
##  Max.   :85.00   Max.   :1130.0   Max.   :1.0000   Max.   :1.00  
##        ct               bw             cum.n              pt        
##  Min.   :0.0000   Min.   :0.0000   Min.   : 1.000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.: 3.000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median : 7.500   Median :0.0000  
##  Mean   :0.4062   Mean   :0.1875   Mean   : 8.531   Mean   :0.1875  
##  3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:12.500   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :21.000   Max.   :1.0000

I chose the data set nuclear, which has different variables regarding the construction of nuclear power plants in the United States. As seen from the printed tibble, the data set has 32 rows and 12 columns. The summary function displays the minimum, first quartile, median, mean, third quartile, and maximum of each column; the IQR is the difference between the third and first quartiles.

Displaying the mean and median Net Power Capacity of the nuclear power plants:

# Mean and median of the net power capacity column (`cap`)
meancap <- mean(nuclear$cap)
medcap <- median(nuclear$cap)

# paste() converts numbers to text itself, so no explicit cast is needed
print(paste("The mean net power capacity is", meancap))
## [1] "The mean net power capacity is 825.375"
print(paste("The median net power capacity is", medcap))
## [1] "The median net power capacity is 822"

Displaying the mean and median Net Cost of Construction Per Nuclear Power Plant in Millions of Dollars:

# Mean and median of the construction cost column (`cost`)
meancost <- mean(nuclear$cost)
medcost <- median(nuclear$cost)

# paste() converts numbers to text itself, so no explicit cast is needed
print(paste(
  "The mean net cost of construction for each power plant is $",
  meancost,
  "million dollars"
))
## [1] "The mean net cost of construction for each power plant is $ 461.5603125 million dollars"
print(paste(
  "The Median net cost of construction for each power plant is $",
  medcost,
  "million dollars"
))
## [1] "The Median net cost of construction for each power plant is $ 448.105 million dollars"

#2- Creating New Data Frame with subset Rows and Columns

To answer this question, I will create a new data frame named ‘north_east’. ‘north_east’ will contain all nuclear power plants that are located in the north east and also have a cooling tower.

# Keep the plants flagged as north east (ne == 1) that also have a cooling
# tower (ct == 1). Both indicator columns are constant after the filter, so
# they are dropped in the same call.
north_east <- subset(nuclear, ne == 1 & ct == 1, select = -c(ne, ct))

print(north_east)
## # A tibble: 4 × 10
##    ...1  cost  date    t1    t2   cap    pr    bw cum.n    pt
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1     4  652.  68      11    67  1065     0     0    12     0
## 2     5  642.  68      11    78  1065     1     0    12     0
## 3     6  345.  67.9    13    51   514     0     0     3     0
## 4    10  690.  68.3    12    71   792     0     1     2     0

#3- Creating a New Column for North East

The ‘north_east’ data frame has a column named ‘date’, which is the number of years between January 1st, 1900 and the approval of the construction permit for the nuclear power plant (the recorded values of about 67–71 correspond to permits issued in 1967–1971).

I will create a new column named ‘time_occupied’, which will hold the number of years that have passed since the approval of the construction permit, measured up to the current date on the computer running the code.

# Create new column
# time_occupied is the number of years between the issue of each plant's
# construction permit and the day this code is run.

# `date` stores the permit date as years after an origin. The values in this
# data set lie between roughly 67 and 71, so the origin must be 1 January 1900
# (permits issued 1967-1971); counting from 1990 would put every permit in the
# future.
now <- Sys.Date()
orig_str <- "01-01-1900"
orig_date <- as.Date(orig_str, format = "%m-%d-%Y")

# The gap between the two dates is taken in days and converted to years using
# the average year length of 365.25 days.
time_since <- as.numeric(difftime(now, orig_date, units = "days"))
time_since_yrs <- time_since / 365.25

# Years since the permit = (origin -> today) - (origin -> permit). The two
# offsets must be subtracted; adding them does not measure an elapsed time.
north_east$time_occupied <- time_since_yrs - north_east$date

# NOTE: the values depend on the run date, and any output rendered before this
# correction (about 102 years) is stale until the document is knitted again.
print(north_east)
## # A tibble: 4 × 11
##    ...1  cost  date    t1    t2   cap    pr    bw cum.n    pt time_occupied
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>         <dbl>
## 1     4  652.  68      11    67  1065     0     0    12     0          102.
## 2     5  642.  68      11    78  1065     1     0    12     0          102.
## 3     6  345.  67.9    13    51   514     0     0     3     0          101.
## 4    10  690.  68.3    12    71   792     0     1     2     0          102.

I actually misinterpreted the question.

Here is my fix. Renaming all columns:

# Show the current column names before replacing them
column_names <- names(north_east)
print(column_names)
##  [1] "...1"          "cost"          "date"          "t1"           
##  [5] "t2"            "cap"           "pr"            "bw"           
##  [9] "cum.n"         "pt"            "time_occupied"
# Descriptive replacements, listed in the same order as the columns above
new_names <- c(
  "no",
  "cost of construction",
  "date since permit",
  "time for regulatory approval",
  "time of inspection",
  "power capacity",
  "prior plant at sight",
  "Babcock-Wilcox Steam Supply",
  "cumul Num of powerplants built by each architect engineer",
  "partial turnkey",
  "time occupied"
)
# Rename a copy so that north_east keeps its original column names
north_east_2 <- setNames(north_east, new_names)
print(north_east_2)
## # A tibble: 4 × 11
##      no `cost of construction` `date since permit` time for regulatory approva…¹
##   <dbl>                  <dbl>               <dbl>                         <dbl>
## 1     4                   652.                68                              11
## 2     5                   642.                68                              11
## 3     6                   345.                67.9                            13
## 4    10                   690.                68.3                            12
## # ℹ abbreviated name: ¹​`time for regulatory approval`
## # ℹ 7 more variables: `time of inspection` <dbl>, `power capacity` <dbl>,
## #   `prior plant at sight` <dbl>, `Babcock-Wilcox Steam Supply` <dbl>,
## #   `cumul Num of powerplants built by each architect engineer` <dbl>,
## #   `partial turnkey` <dbl>, `time occupied` <dbl>

#4- Overview and Comparison of NorthEast

# Per-column minimum, quartiles, median, mean and maximum for the subset
summary(north_east)
##       ...1            cost            date             t1       
##  Min.   : 4.00   Min.   :345.4   Min.   :67.92   Min.   :11.00  
##  1st Qu.: 4.75   1st Qu.:568.0   1st Qu.:67.98   1st Qu.:11.00  
##  Median : 5.50   Median :647.3   Median :68.00   Median :11.50  
##  Mean   : 6.25   Mean   :582.5   Mean   :68.06   Mean   :11.75  
##  3rd Qu.: 7.00   3rd Qu.:661.8   3rd Qu.:68.08   3rd Qu.:12.25  
##  Max.   :10.00   Max.   :690.2   Max.   :68.33   Max.   :13.00  
##        t2             cap               pr             bw           cum.n      
##  Min.   :51.00   Min.   : 514.0   Min.   :0.00   Min.   :0.00   Min.   : 2.00  
##  1st Qu.:63.00   1st Qu.: 722.5   1st Qu.:0.00   1st Qu.:0.00   1st Qu.: 2.75  
##  Median :69.00   Median : 928.5   Median :0.00   Median :0.00   Median : 7.50  
##  Mean   :66.75   Mean   : 859.0   Mean   :0.25   Mean   :0.25   Mean   : 7.25  
##  3rd Qu.:72.75   3rd Qu.:1065.0   3rd Qu.:0.25   3rd Qu.:0.25   3rd Qu.:12.00  
##  Max.   :78.00   Max.   :1065.0   Max.   :1.00   Max.   :1.00   Max.   :12.00  
##        pt    time_occupied  
##  Min.   :0   Min.   :101.5  
##  1st Qu.:0   1st Qu.:101.5  
##  Median :0   Median :101.5  
##  Mean   :0   Mean   :101.6  
##  3rd Qu.:0   3rd Qu.:101.6  
##  Max.   :0   Max.   :101.9
# Mean and median capacity of the nuclear power plants in the north east
mean_ne_cap <- mean(north_east$cap)
med_ne_cap <- median(north_east$cap)

# Compared with all plants, the north-east mean capacity is slightly higher
# and the north-east median is roughly 100 MWe higher.
# Standard deviation of capacity: north-east subset first, then all plants.
sd(north_east$cap)
## [1] 263.5564
sd(nuclear$cap)
## [1] 189.3591
# The north-east standard deviation is the larger of the two, so capacity
# varies more among north-east plants than among plants nationwide.

print(paste("The mean net power capacity is", mean_ne_cap, "MWe"))
## [1] "The mean net power capacity is 859 MWe"
print(paste("The median net power capacity is", med_ne_cap, "MWe"))
## [1] "The median net power capacity is 928.5 MWe"

The mean capacity of north eastern power plants was slightly higher than the mean capacity of power plants nationwide.

The median capacity of north eastern power plants was ~100 MWe higher than the nationwide median.

North Eastern nuclear power plants show more variability in capacity (a larger standard deviation) than power plants nationwide.

Mean and Median Cost of Nuclear Power Plants in the North East:

# Mean and median construction cost for the north-east subset
mean_ne_cost <- mean(north_east$cost)
med_ne_cost <- median(north_east$cost)
# Gap between the north-east mean and the mean over all plants
avg_dif <- mean_ne_cost - meancost

print(paste(
  "The mean net cost of construction for each power plant is $",
  mean_ne_cost,
  "million dollars"
))
## [1] "The mean net cost of construction for each power plant is $ 582.5325 million dollars"
print(paste(
  "The Median net cost of construction for each power plant is $",
  med_ne_cost,
  "million dollars"
))
## [1] "The Median net cost of construction for each power plant is $ 647.275 million dollars"
print(paste(
  "Constructing a Nuclear power plant in the north east is more expensive with an average difference of $",
  avg_dif,
  "million dollars"
))
## [1] "Constructing a Nuclear power plant in the north east is more expensive with an average difference of $ 120.9721875 million dollars"

The mean and median costs of nuclear power plants in the North East are both higher than the mean and median costs across the whole country.

Constructing a nuclear power plant in the north east is more expensive, with an average difference of about $121 million.

#5- Replace a Column’s values with something else

cum.n is a column in nuclear that represents the cumulative number of nuclear power plants already built by the architect and engineer of each power plant.

Let's make 3 categories:

0-5 will be replaced with ‘no xp’

6-10 will be replaced with ‘adv xp’

11 and above will be replaced with ‘xpert’

I am making a new data frame named nuclear2 to demonstrate the replacement without changing anything I already have.

# Work on a copy so that `nuclear` itself is left untouched
nuclear2 <- nuclear

# Recode the plant counts into experience labels. The three ranges do not
# overlap, so the order in which they are tested does not affect the result;
# anything outside them (or missing) becomes NA.
nuclear2$cum.n <- with(
  nuclear2,
  ifelse(
    cum.n >= 11, "xpert",
    ifelse(
      cum.n >= 6 & cum.n <= 10, "adv xp",
      ifelse(cum.n >= 0 & cum.n <= 5, "no xp", NA_character_)
    )
  )
)
print(nuclear2)
## # A tibble: 32 × 12
##     ...1  cost  date    t1    t2   cap    pr    ne    ct    bw cum.n    pt
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl>
##  1     1  460.  68.6    14    46   687     0     1     0     0 xpert     0
##  2     2  453.  67.3    10    73  1065     0     0     1     0 no xp     0
##  3     3  443.  67.3    10    85  1065     1     0     1     0 no xp     0
##  4     4  652.  68      11    67  1065     0     1     1     0 xpert     0
##  5     5  642.  68      11    78  1065     1     1     1     0 xpert     0
##  6     6  345.  67.9    13    51   514     0     1     1     0 no xp     0
##  7     7  272.  68.2    12    50   822     0     0     0     0 no xp     0
##  8     8  317.  68.4    14    59   457     0     0     0     0 no xp     0
##  9     9  457.  68.4    15    55   822     1     0     0     0 no xp     0
## 10    10  690.  68.3    12    71   792     0     1     1     1 no xp     0
## # ℹ 22 more rows

As you can see now, the cum.n column has been replaced with the above three values.

# Show the recoded column for every row (the tibble print above is truncated)
nuclear2$cum.n
##  [1] "xpert"  "no xp"  "no xp"  "xpert"  "xpert"  "no xp"  "no xp"  "no xp" 
##  [9] "no xp"  "no xp"  "no xp"  "adv xp" "no xp"  "adv xp" "xpert"  "no xp" 
## [17] "xpert"  "no xp"  "no xp"  "adv xp" "xpert"  "xpert"  "xpert"  "no xp" 
## [25] "xpert"  "xpert"  "adv xp" "adv xp" "xpert"  "xpert"  "adv xp" "xpert"