###Start of assignment###
library(ggplot2) #load ggplot library
# Point R at the folder that holds the data file.
# NOTE(review): this absolute path is specific to one machine; setwd() will
# error on any machine where the directory does not exist, so adjust it
# before running elsewhere.
setwd("C:/Users/magnu/Documents/R")

# Import the nuclear power station construction data.
nukedata <- read.csv("nuclear.csv")

# Explore data characteristics.
summary(nukedata)
#Initial data conclusions:
#(1) 'cost': cost of construction in hundreds of millions
#(2) 'date': months of construction
#(3) 'cap': power generation capacity in MW
#(4) 'pr', 'ne', 'ct', 'bw', 'pt': are all either 0 or 1, thus they likely
#    mark that the station does or doesn't have a certain characteristic
#Meaningful Question: what is the relationship between cost, date, and/or capacity for nuclear power station construction?
# Create a subset of the useful data: the cap, date and cost columns, taken
# from rows 1-32 of the imported data.
# The descriptive column names contain spaces and symbols, so
# check.names = FALSE is required to stop data.frame() from rewriting them.
# Later references must quote the names (backticks or [["..."]]).
nuke_df <- data.frame(
  "Capacity (MW)" = nukedata$cap[1:32],
  "Construction Duration (mos)" = nukedata$date[1:32],
  "Cost ($ millions)" = nukedata$cost[1:32],
  check.names = FALSE
)

# Review relevant data for the new data frame prior to plotting.
summary(nuke_df)
# Scatter plot of the construction duration column against row index.
# Min: 67.17, Max: 71.08, Mean: 68.58
# NOTE(review): a 67-71 range looks more like a year than a duration in
# months; in the `nuclear` data set shipped with the boot package, `date` is
# the permit issue date. Confirm what this column actually measures.
plot(
  nuke_df[["Construction Duration (mos)"]],
  ylab = "Construction Duration (mos)"
)

# Histogram of the cost of construction (in millions of dollars).
# 200-300: 8, 300-400: 4, 400-500: 10, 500-600: 1, 600-700: 7,
# 700-800: 1, 800-900: 1
hist(
  nuke_df[["Cost ($ millions)"]],
  main = "Histogram of Cost ($ millions)",
  xlab = "Cost ($ millions)"
)

# Boxplot of the capacity for generation after construction.
# min: 457, 1st Q: 745, Median: 822, 3rd Q: 947, Max: 1130
boxplot(nuke_df[["Capacity (MW)"]], ylab = "Capacity (MW)")
# Apply ggplot2 to this data set and interpret the results.
# ggplot() + geom_point() is used instead of qplot(), which is deprecated as
# of ggplot2 3.4.0. The column names are non-syntactic, so they must be
# wrapped in backticks inside aes().

# There is not a clear relationship between duration and capacity.
ggplot(
  nuke_df,
  aes(x = `Construction Duration (mos)`, y = `Capacity (MW)`)
) +
  geom_point()

# There seems to be a correlation between duration of construction and cost.
ggplot(
  nuke_df,
  aes(x = `Construction Duration (mos)`, y = `Cost ($ millions)`)
) +
  geom_point()

# There seems to be a correlation between capacity and cost.
ggplot(
  nuke_df,
  aes(x = `Capacity (MW)`, y = `Cost ($ millions)`)
) +
  geom_point()
#Answer: there is a relationship between the chosen subset variables.
#While there is NOT a clear relationship between the duration and capacity,
#there is a relationship between the duration and cost of construction AND
#the capacity and cost of construction.
#Longer construction projects (i.e. greater duration) and larger construction
#projects (i.e. greater capacity) can be directly tied to a greater cost.
#Thus if we're trying to lower the cost we can focus on one or both of these
#variables to do so.
#TODO (unfinished in the original): This can be useful [LEFT OFF HERE] From
#this direct relationship, we can indirectly tie the duration of the project
#to the ...
###BONUS: read the .csv file from github###
library(RCurl)

# Download the raw CSV as one character string, then parse that string
# in memory instead of reading from a local file.
x <- getURL(
  "https://raw.githubusercontent.com/Magnus-PS/CUNY-Bridge/master/nuclear.csv"
)
y <- read.csv(text = x)

# Explore the downloaded copy of the data.
summary(y)
###End of assignment###