filename tempurl TEMP;
proc http
url= "https://s3.amazonaws.com/talent-assets.datacamp.com/product_sales.csv"
method="get" out=tempurl;
run;
/* Import the data from the URL into a SAS dataset */
options errors=0; /* Suppress invalid data log entries whe REVENUE values are "NA" */
data work.sales;
infile tempurl dlm =',' dsd firstobs=2 truncover ; /* Set delimiter and skip header row */
/* Supplay informats on the input statement to set variable lengths */
input week:best32. sales_method:$12. customer_id:$36. nb_sold:best32. revenue:best32.
years_as_customer:best32. nb_site_visits:best32. state:$13. ;
run;
options errors=20; /* Restore default setting */
proc means data=mydata NMISS N; run; proc contents data = sales; run;
‘proc means’ with NMISS N show number of missing in variables: revenue has 1074 NA.
‘proc contents’ creat output with details as names of 8 variales in which 4 char type and 4 numeric type.
proc univariate data = sales; run;
‘proc univariate’ creat output with summary of 4 numeric variables in which ‘revenue’ has 1074 missing, 7.16%, as ‘.’
proc freq data=sales;
tables week sales_method state / out=sales_char(keep=var:);
run;
‘proc freq’ creat output with summary of 4 character variables in which some incorrected values of ‘sales_method’ are shown as ‘em + cal’, ‘email’.
These incorrected values should be replaced with the corrected values in next steps.
What are top 10 states in which customers bought products for the company.
proc freq data=sales2;
tables State / out=freq_table outpct noprint;
run;
proc sort data=freq_table;
by descending Count;
run;
proc print data=freq_table (obs=10);
var State Count Percent;
title 'Top 10 States of Customers';
run;
data sales2;
set sales;
if sales_method = 'email' then sales_method = 'Email';
else if sales_method = 'em + call' then sales_method = 'Email + Call' ;
else sales_method = sales_method;
run;
check if the incorrected are fixed
proc freq data=sales2;
tables sales_method / nocum out = sales_m;
run;
The ‘sales_method’ is now correct with 3 values as ‘Email’, ‘Email+’ and ‘Call’.
data sales_m; set sales2; if revenue = '.' then revenue = 93.93; run;
Check the output, see the dataset with ‘.’ is replaced with 93.93.
Two rows have value of 47 and 63, these incorrected values are replaced with the mean of 5 years.
data sales2; set sales2; if years_as_customer > 40 then years_as_customer = 5; run;
data sales2;
set sales;
if sales_method = 'email' then sales_method = 'Email';
else if sales_method = 'em + cal' then sales_method = 'Email + Call' ;
else sales_method = sales_method;
if years_as_customer > 40 then years_as_customer = 5;
if revenue = '.' then revenue = 93.93;
run;
proc sql ;
CREATE TABLE sales2 AS
select week,
CASE
WHEN sales_method = 'em + call' THEN 'Email + Call'
WHEN sales_method = 'email' THEN 'Email'
ELSE sales_method
END AS sales_method,
customer_id,
nb_sold,
CASE
WHEN years_as_customer > 40 THEN 5
ELSE years_as_customer
END AS years_as_customer,
CASE
WHEN revenue IS NULL THEN 93.93
ELSE revenue
END AS revenue,
nb_site_visits,
state
from sales
;
quit;
title "Number of Customers By Sale Methods";
proc sgplot data=sales2;
vbar sales_method / datalabel ;
xaxis ;
yaxis grid ;
run;
title;
BarPlot
proc sort data=sales2;
by sales_method;
run;
data sales_re;
set sales2(keep=sales_method revenue);
by sales_method;
if first.sales_method then sum_revenue = 0;
sum_revenue + revenue;
if last.sales_method;
run;
1 Call 244564.82
2 Email 723415.75
3 Email + 441038
BarPlot