#1- Loading Data, Data Summary, Mean, and Median
#Importing Dataset
# Load the readr package
library(readr)
# Address of the raw CSV on GitHub; the path contains a commit hash, so it
# refers to one fixed revision of the file
url <- "https://raw.githubusercontent.com/sleepysloth12/JJim_2023RSummerBridge/e3aeac38e50b05827d573b1393bef38c65369279/Week2/nuclear.csv"
# Download and parse the CSV from that address into `nuclear`
nuclear <- read_csv(file = url)
## New names:
## Rows: 32 Columns: 12
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," dbl
## (12): ...1, cost, date, t1, t2, cap, pr, ne, ct, bw, cum.n, pt
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# View the data: printing shows the dimensions, the column types, and the
# first rows of the table
print(nuclear)
## # A tibble: 32 × 12
## ...1 cost date t1 t2 cap pr ne ct bw cum.n pt
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 460. 68.6 14 46 687 0 1 0 0 14 0
## 2 2 453. 67.3 10 73 1065 0 0 1 0 1 0
## 3 3 443. 67.3 10 85 1065 1 0 1 0 1 0
## 4 4 652. 68 11 67 1065 0 1 1 0 12 0
## 5 5 642. 68 11 78 1065 1 1 1 0 12 0
## 6 6 345. 67.9 13 51 514 0 1 1 0 3 0
## 7 7 272. 68.2 12 50 822 0 0 0 0 5 0
## 8 8 317. 68.4 14 59 457 0 0 0 0 1 0
## 9 9 457. 68.4 15 55 822 1 0 0 0 5 0
## 10 10 690. 68.3 12 71 792 0 1 1 1 2 0
## # ℹ 22 more rows
# Per-column summary: minimum, quartiles, median, mean, and maximum
summary(nuclear)
## ...1 cost date t1
## Min. : 1.00 Min. :207.5 Min. :67.17 Min. : 7.00
## 1st Qu.: 8.75 1st Qu.:310.3 1st Qu.:67.90 1st Qu.:11.75
## Median :16.50 Median :448.1 Median :68.42 Median :13.00
## Mean :16.50 Mean :461.6 Mean :68.58 Mean :13.75
## 3rd Qu.:24.25 3rd Qu.:612.0 3rd Qu.:68.92 3rd Qu.:15.25
## Max. :32.00 Max. :881.2 Max. :71.08 Max. :22.00
## t2 cap pr ne
## Min. :44.00 Min. : 457.0 Min. :0.0000 Min. :0.00
## 1st Qu.:56.50 1st Qu.: 745.0 1st Qu.:0.0000 1st Qu.:0.00
## Median :62.50 Median : 822.0 Median :0.0000 Median :0.00
## Mean :62.38 Mean : 825.4 Mean :0.3125 Mean :0.25
## 3rd Qu.:70.25 3rd Qu.: 947.2 3rd Qu.:1.0000 3rd Qu.:0.25
## Max. :85.00 Max. :1130.0 Max. :1.0000 Max. :1.00
## ct bw cum.n pt
## Min. :0.0000 Min. :0.0000 Min. : 1.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 3.000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median : 7.500 Median :0.0000
## Mean :0.4062 Mean :0.1875 Mean : 8.531 Mean :0.1875
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:12.500 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :21.000 Max. :1.0000
I chose the nuclear data set, which records variables relating to the construction of nuclear power plants in the United States. As the printed tibble shows, the data set has 32 rows and 12 columns. The summary function displays the minimum, first quartile, median, mean, third quartile, and maximum of each column.
Displaying the mean and median Net Power Capacity of the nuclear power plants:
# Mean and median of the `cap` column over every plant in the data set
meancap <- mean(nuclear[["cap"]])
medcap <- median(nuclear[["cap"]])
# paste() converts the numbers to text itself, so no explicit cast is needed
print(paste("The mean net power capacity is", meancap))
## [1] "The mean net power capacity is 825.375"
print(paste("The median net power capacity is", medcap))
## [1] "The median net power capacity is 822"
Displaying the mean and median Net Cost of Construction Per Nuclear Power Plant in Millions of Dollars:
# Mean and median of the `cost` column over every plant in the data set
meancost <- mean(nuclear[["cost"]])
medcost <- median(nuclear[["cost"]])
# paste() converts the numbers to text itself, so no explicit cast is needed
print(paste("The mean net cost of construction for each power plant is $", meancost, "million dollars"))
## [1] "The mean net cost of construction for each power plant is $ 461.5603125 million dollars"
print(paste("The Median net cost of construction for each power plant is $", medcost, "million dollars"))
## [1] "The Median net cost of construction for each power plant is $ 448.105 million dollars"
#2- Creating New Data Frame with subset Rows and Columns
To answer this question, I will create a new data frame named ‘north_east’. ‘north_east’ will contain all nuclear power plants that are located in the north east and that also have a cooling tower.
# Keep only the rows where both flags are set (ne == 1 and ct == 1). Every
# remaining row has the same value in those two columns, so they are dropped.
north_east <- subset(nuclear, ne == 1 & ct == 1, select = -c(ne, ct))
print(north_east)
## # A tibble: 4 × 10
## ...1 cost date t1 t2 cap pr bw cum.n pt
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 4 652. 68 11 67 1065 0 0 12 0
## 2 5 642. 68 11 78 1065 1 0 12 0
## 3 6 345. 67.9 13 51 514 0 0 3 0
## 4 10 690. 68.3 12 71 792 0 1 2 0
#3- Creating a New Column for North East
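The north_east data frame has a column named ‘date’, which records when the construction permit for each nuclear power plant was approved, expressed as a number of years counted from an origin date. The data set documentation gives that origin as January 1st, 1990, but values around 68 only make sense as permits issued around 1968, i.e. counted from January 1st, 1900.
I will create a new column named ‘time_occupied’, which is intended to hold the number of years that have passed since the approval of the construction permit, measured up to the current date on the computer running the code.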
# Create new column
# `time_occupied` is the number of years that have passed between the issue of
# each plant's construction permit and the day this code is run.
#
# `date` stores the permit date as fractional years counted from an origin
# date, so the elapsed time is
#   (years from the origin until today) - date
# The two quantities must be subtracted, not added: both are measured from the
# same origin.
#
# The origin used here is 1 January 1900: values near 68 then correspond to
# permits issued around 1968. NOTE(review): the data set documentation prints
# the origin as 1990, which would place every permit in the future; confirm
# against the original data source.
now <- Sys.Date()
orig_str <- "01-01-1900"
orig_date <- as.Date(orig_str, format = "%m-%d-%Y")
# Subtracting two Dates gives a difference in days; keep it as a plain number
time_since <- as.numeric(now - orig_date)
# 365.25 days per year averages out leap years
time_since_yrs <- time_since / 365.25
north_east$time_occupied <- time_since_yrs - north_east$date
# The printed values change with the run date because they depend on Sys.Date()
print(north_east)
## # A tibble: 4 × 11
## ...1 cost date t1 t2 cap pr bw cum.n pt time_occupied
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 4 652. 68 11 67 1065 0 0 12 0 102.
## 2 5 642. 68 11 78 1065 1 0 12 0 102.
## 3 6 345. 67.9 13 51 514 0 0 3 0 101.
## 4 10 690. 68.3 12 71 792 0 1 2 0 102.
I actually misinterpreted the question.
Here is my fix. Renaming all columns:
# Rename on a copy so `north_east` keeps its short column names
north_east_2 <- north_east
# Show the current names; the replacements below are listed in the same order
column_names <- names(north_east_2)
print(column_names)
## [1] "...1" "cost" "date" "t1"
## [5] "t2" "cap" "pr" "bw"
## [9] "cum.n" "pt" "time_occupied"
# Descriptive replacement names, one per existing column
new_names <- c(
  "no",
  "cost of construction",
  "date since permit",
  "time for regulatory approval",
  "time of inspection",
  "power capacity",
  "prior plant at sight",
  "Babcock-Wilcox Steam Supply",
  "cumul Num of powerplants built by each architect engineer",
  "partial turnkey",
  "time occupied"
)
names(north_east_2) <- new_names
print(north_east_2)
## # A tibble: 4 × 11
## no `cost of construction` `date since permit` time for regulatory approva…¹
## <dbl> <dbl> <dbl> <dbl>
## 1 4 652. 68 11
## 2 5 642. 68 11
## 3 6 345. 67.9 13
## 4 10 690. 68.3 12
## # ℹ abbreviated name: ¹`time for regulatory approval`
## # ℹ 7 more variables: `time of inspection` <dbl>, `power capacity` <dbl>,
## # `prior plant at sight` <dbl>, `Babcock-Wilcox Steam Supply` <dbl>,
## # `cumul Num of powerplants built by each architect engineer` <dbl>,
## # `partial turnkey` <dbl>, `time occupied` <dbl>
#4- Overview and Comparison of NorthEast
# Per-column summary of the north-east subset (minimum, quartiles, median,
# mean, maximum), for comparison with the summary of the full data set
summary(north_east)
## ...1 cost date t1
## Min. : 4.00 Min. :345.4 Min. :67.92 Min. :11.00
## 1st Qu.: 4.75 1st Qu.:568.0 1st Qu.:67.98 1st Qu.:11.00
## Median : 5.50 Median :647.3 Median :68.00 Median :11.50
## Mean : 6.25 Mean :582.5 Mean :68.06 Mean :11.75
## 3rd Qu.: 7.00 3rd Qu.:661.8 3rd Qu.:68.08 3rd Qu.:12.25
## Max. :10.00 Max. :690.2 Max. :68.33 Max. :13.00
## t2 cap pr bw cum.n
## Min. :51.00 Min. : 514.0 Min. :0.00 Min. :0.00 Min. : 2.00
## 1st Qu.:63.00 1st Qu.: 722.5 1st Qu.:0.00 1st Qu.:0.00 1st Qu.: 2.75
## Median :69.00 Median : 928.5 Median :0.00 Median :0.00 Median : 7.50
## Mean :66.75 Mean : 859.0 Mean :0.25 Mean :0.25 Mean : 7.25
## 3rd Qu.:72.75 3rd Qu.:1065.0 3rd Qu.:0.25 3rd Qu.:0.25 3rd Qu.:12.00
## Max. :78.00 Max. :1065.0 Max. :1.00 Max. :1.00 Max. :12.00
## pt time_occupied
## Min. :0 Min. :101.5
## 1st Qu.:0 1st Qu.:101.5
## Median :0 Median :101.5
## Mean :0 Mean :101.6
## 3rd Qu.:0 3rd Qu.:101.6
## Max. :0 Max. :101.9
# Mean and median capacity (`cap`) of the plants in the north-east subset
mean_ne_cap <- mean(north_east[["cap"]])
med_ne_cap <- median(north_east[["cap"]])
# The north-east mean (859) is slightly above the nationwide mean (825.375)
# The north-east median (928.5) is about 100 MWe above the nationwide median (822)
sd(north_east[["cap"]])
## [1] 263.5564
sd(nuclear[["cap"]])
## [1] 189.3591
# The standard deviation of capacity is larger for the north-east subset than for the full data set
print(paste("The mean net power capacity is", mean_ne_cap, "MWe"))
## [1] "The mean net power capacity is 859 MWe"
print(paste("The median net power capacity is", med_ne_cap, "MWe"))
## [1] "The median net power capacity is 928.5 MWe"
The mean capacity of north eastern power plants was slightly higher than that of power plants nationwide.
The median capacity of north eastern power plants was ~100 MWe higher than the nationwide median.
North Eastern nuclear power plants show greater variability in capacity (a larger standard deviation) than power plants nationwide.
Mean and Median Cost of Nuclear Power Plants in the North East:
# Mean and median construction cost (`cost`) within the north-east subset
mean_ne_cost <- mean(north_east[["cost"]])
med_ne_cost <- median(north_east[["cost"]])
# How far the north-east mean sits above the nationwide mean (`meancost`)
avg_dif <- mean_ne_cost - meancost
print(paste("The mean net cost of construction for each power plant is $", mean_ne_cost, "million dollars"))
## [1] "The mean net cost of construction for each power plant is $ 582.5325 million dollars"
print(paste("The Median net cost of construction for each power plant is $", med_ne_cost, "million dollars"))
## [1] "The Median net cost of construction for each power plant is $ 647.275 million dollars"
print(paste("Constructing a Nuclear power plant in the north east is more expensive with an average difference of $", avg_dif, "million dollars"))
## [1] "Constructing a Nuclear power plant in the north east is more expensive with an average difference of $ 120.9721875 million dollars"
The mean and median cost of nuclear power plants in the North East are both higher than the mean and median for the whole country.
Constructing a nuclear power plant in the north east is more expensive, with an average difference of about $121 million.
#5- Replace a Column’s values with something else
cum.n is a column in nuclear that represents the cumulative number of nuclear power plants already built by the architect and engineer of each power plant.
Let's make 3 categories:
0-5 will be replaced with ‘no xp’
6-10 will be replaced with ‘adv xp’
11 or more will be replaced with ‘xpert’
I am making a new data frame named nuclear2 to demonstrate the function without changing anything I already have.
# Recode on a copy so the numeric `cum.n` in `nuclear` is left untouched
nuclear2 <- nuclear
nuclear2$cum.n <- local({
  plants_built <- nuclear2$cum.n
  # One logical mask per experience band
  is_novice <- plants_built >= 0 & plants_built <= 5
  is_advanced <- plants_built >= 6 & plants_built <= 10
  is_expert <- plants_built >= 11
  # Values that fall in none of the bands become NA
  ifelse(
    is_novice, "no xp",
    ifelse(is_advanced, "adv xp", ifelse(is_expert, "xpert", NA_character_))
  )
})
print(nuclear2)
## # A tibble: 32 × 12
## ...1 cost date t1 t2 cap pr ne ct bw cum.n pt
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 1 460. 68.6 14 46 687 0 1 0 0 xpert 0
## 2 2 453. 67.3 10 73 1065 0 0 1 0 no xp 0
## 3 3 443. 67.3 10 85 1065 1 0 1 0 no xp 0
## 4 4 652. 68 11 67 1065 0 1 1 0 xpert 0
## 5 5 642. 68 11 78 1065 1 1 1 0 xpert 0
## 6 6 345. 67.9 13 51 514 0 1 1 0 no xp 0
## 7 7 272. 68.2 12 50 822 0 0 0 0 no xp 0
## 8 8 317. 68.4 14 59 457 0 0 0 0 no xp 0
## 9 9 457. 68.4 15 55 822 1 0 0 0 no xp 0
## 10 10 690. 68.3 12 71 792 0 1 1 1 no xp 0
## # ℹ 22 more rows
As you can see now, the cum.n column has been replaced with the above three values.
# Show the whole recoded column so the label of every row can be checked
nuclear2$cum.n
## [1] "xpert" "no xp" "no xp" "xpert" "xpert" "no xp" "no xp" "no xp"
## [9] "no xp" "no xp" "no xp" "adv xp" "no xp" "adv xp" "xpert" "no xp"
## [17] "xpert" "no xp" "no xp" "adv xp" "xpert" "xpert" "xpert" "no xp"
## [25] "xpert" "xpert" "adv xp" "adv xp" "xpert" "xpert" "adv xp" "xpert"