from datetime import datetime
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as mticker

%matplotlib inline

%reload_ext nb_black


# set option to display all columns
pd.set_option("display.max_columns", None)

# set option to display more rows
pd.set_option("display.min_rows", None)
pd.set_option("display.max_rows", 300)

# set options for plots
pd.set_option("display.width", 4500)
plt.rcParams["figure.figsize"] = (15, 7)

# Format numbers
pd.options.display.float_format = "{:,.0f}".format

# Set the plot background to white
plt.rcParams["axes.facecolor"] = "white"
plt.rcParams["figure.facecolor"] = "white"

# Set plot size
plt.rcParams["figure.figsize"] = (10, 5)
plt.rcParams["axes.axisbelow"] = True

styles = [
    {"selector": "th", "props": [("text-align", "left")]},
    {
        "selector": ".row_heading",
        "props": [
            ("text-align", "left"),
            ("font-size", "100%"),
            ("color", "black"),
        ],
    },
    {
        "selector": "caption",
        "props": [
            ("text-align", "left"),
            ("font-size", "120%"),
            ("color", "black"),
        ],
    },
]


def tbl_format(
    df, table_name, hide_index=True, col_types="formatted", col_names="titlecase"
):

    # create function to change column names to title case
    if col_names == "titlecase":

        def titlecase_col(string):
            def capitalize_words(word):
                exception = ["a", "an", "the", "or", "of", "and"]
                if word not in exception:
                    return word.capitalize()
                return word

            words_list = string.replace(" ", "_")
            words_list = string.split("_")
            words_formatted = map(capitalize_words, words_list)
            return " ".join(words_formatted)

        df = df.rename(columns=titlecase_col)

        if df.index.name is not None:
            df.index.name = titlecase_col(df.index.name)

    string_cols = list(df.select_dtypes(include="string"))
    date_cols = list(df.select_dtypes(include="datetime64"))
    int_cols = list(df.select_dtypes(include="int64"))
    float_cols = list(df.select_dtypes(include="float64"))

    cols_left = string_cols + date_cols
    cols_right = int_cols + float_cols

    # create dictionaries to format integer and float columns
    int_dict = dict.fromkeys(int_cols, "{:,.0f}")
    float_dict = dict.fromkeys(float_cols, "{:,.1f}")

    # Merge the dictionaries
    number_format = {**int_dict, **float_dict}

    # dict used to align column names of string columns to the left
    cols_left_dict = dict.fromkeys(
        cols_left, [{"selector": "th", "props": [("text-align", "left")]}]
    )

    # dict used to align column names of number columns to the right
    cols_right_dict = dict.fromkeys(
        cols_right, [{"selector": "th", "props": [("text-align", "right")]}]
    )

    header_align_dict = {**cols_left_dict, **cols_right_dict}
    styles_display = [
        {"selector": "th", "props": [("text-align", "left")]},
        {
            "selector": ".row_heading",
            "props": [
                ("text-align", "left"),
                ("font-size", "100%"),
                ("color", "black"),
            ],
        },
        {
            "selector": "caption",
            "props": [
                ("text-align", "left"),
                ("font-size", "120%"),
                ("color", "black"),
            ],
        },
    ]

    # if the columns have been assigned data types
    if col_types == "formatted":

        if hide_index == False:
            styler = (
                df.style.format(number_format)
                # align the headers according to dict values
                .set_table_styles(header_align_dict)
                # align index and configure caption using styles_display
                .set_table_styles(styles_display, overwrite=False)
                # align values to text if they are string or date
                .applymap(lambda x: "text-align:left", subset=cols_left).set_caption(
                    table_name
                )
            )

        else:
            styler = (
                df.style.format(number_format)
                .set_table_styles(header_align_dict)
                .set_table_styles(styles_display, overwrite=False)
                .applymap(lambda x: "text-align:left", subset=cols_left)
                .hide(axis="index")
                .set_caption(table_name)
            )

    # if the columns have not been assigned data types
    # and contain mostly strings
    elif col_types == "text_unformatted":
        if hide_index == False:
            styler = (
                df.style.format(number_format)
                .applymap(lambda x: "text-align:left")
                .applymap(lambda x: "text-align:right", subset=cols_right)
                .set_table_styles(header_align_dict, overwrite=True)
                .set_table_styles(styles_display, overwrite=False)
                .set_caption(table_name)
            )
        else:
            styler = (
                df.style.format(number_format)
                .applymap(lambda x: "text-align:left")
                .applymap(lambda x: "text-align:right", subset=cols_right)
                .set_table_styles(header_align_dict, overwrite=True)
                .set_table_styles(styles_display, overwrite=False)
                .hide(axis="index")
                .set_caption(table_name)
            )

    # if the columns have not been assigned data types
    # and contain mostly numbers
    elif col_types == "number_unformatted":
        if hide_index == False:
            styler = (
                df.style.format(number_format)
                .set_table_styles(styles_display, overwrite=False)
                .hide(axis="index")
                .set_caption(table_name)
            )
        else:
            styler = (
                df.style.format(number_format)
                .set_table_styles(styles_display, overwrite=False)
                .set_caption(table_name)
            )

    else:
        return "Ambiguous argument for col_types"
    display(styler)


complaints_orig = pd.read_csv("C:\\dataset\\311_service_requests_2021.csv", dtype="str")


# Preview the first 5 rows of the original dataset
tbl_format(
    complaints_orig.head(),
    "Table 1. Preview of the original dataset: NYC Service Requests 2021",
    col_types="text_unformatted",
    col_names="actual",
)


# Identify the number of observations (complaints) and columns in the data
row_count = complaints_orig.shape[0]
col_count = complaints_orig.shape[1]

print("Number of rows: ", "{:,.0f}".format(row_count))
print("Number of selected columns: ", col_count)
print("Date of first record: ", complaints_orig["created_date"].min())
print("Date of Last record: ",complaints_orig["created_date"].max())

Number of rows:  3,214,274
Number of selected columns:  7
Date of first record:  2021-01-01 00:00:00 UTC
Date of Last record:  2021-12-30 23:59:00 UTC


# Get column names from dataset
column_names = complaints_orig.columns.to_list()

# Create a data frame for the attributes of the dataset
selected_columns = pd.DataFrame({"column_name": column_names})

# load dataset with field descriptions of all columns
column_descriptions = pd.read_csv("C:\\dataset\\field_descriptions.csv", dtype="str")

# Merge the data frames to create table with field descriptions
selected_columns = pd.merge(selected_columns, column_descriptions, on="column_name")

selected_columns["column_name"] = (
    selected_columns["column_name"].str.title().str.replace("_", " ")
)

# table
tbl_format(
    selected_columns,
    "Table 2. Selected fields from the original dataset",
    col_types="text_unformatted",
)


# Check columns" data types for the relevant variables
print("\033[1m" + "Data types of the selected columns:" + "\033[0m")

complaints_orig.dtypes

Data types of the selected columns:

created_date      object
closed_date       object
complaint_type    object
descriptor        object
borough           object
agency            object
agency_name       object
dtype: object


# Explore how many different types of complaints there are
types_of_compl = complaints_orig["complaint_type"].unique().size

print ("Types of complaints: ",types_of_compl)

Types of complaints:  224


# Count the occurrences of unique values in the Complaint Type column
top_complaints = (
    complaints_orig["complaint_type"]
    .value_counts()
    .rename_axis("complaint_type")
    .reset_index(name="count")
)

tbl_format(
    top_complaints.head(15),
    "Table 3. 2021 Most frequent complaints",
    col_types="text_unformatted",
)


borough_values = pd.DataFrame(complaints_orig["borough"].unique())
borough_values.rename(
    columns={borough_values.columns[0]: "Unique values in the Borough column"}
).sort_values("Unique values in the Borough column").head(15).style.set_properties(
    **{"text-align": "left"}
).hide(
    axis="index"
)


agencies = complaints_orig.copy()
agencies["name_and_acronym"] = (
    complaints_orig["agency_name"] + " (" + complaints_orig["agency"] + ")"
)
agencies = pd.DataFrame(agencies["name_and_acronym"].unique())

agencies.rename(
    columns={
        agencies.columns[
            0
        ]: "City Government Agencies that responded to Service Requests"
    }
).sort_values("City Government Agencies that responded to Service Requests").head(
    50
).style.hide(
    axis="index"
).set_properties(
    **{"text-align": "left"}
)


# Replace "Unspecified" values with NA values
complaints_orig["borough"] = complaints_orig["borough"].replace("Unspecified", np.nan)

# Count the number of NA values for each column
na_values = (
    complaints_orig[
        [
            "created_date",
            "closed_date",
            "complaint_type",
            "descriptor",
            "borough",
            "agency",
            "agency_name",
        ]
    ]
    .isna()
    .sum()
    .rename_axis(["Column"])
    .reset_index(name="NA Values")
)

tbl_format(
    na_values, "Table 4. NA values in the selected fields", col_types="text_unformatted"
)


complaints_clean = complaints_orig.copy()


NA_percentage = "{:.1%}".format(
    complaints_clean["borough"].isnull().sum() / complaints_clean.shape[0]
)
print("Percentage of missing values in the Borough Column: ", NA_percentage)

Percentage of missing values in the Borough Column:  0.5%


# Remove the remaining NA values in the Borough Column
complaints_clean = complaints_clean[complaints_clean["borough"].notna()]


# Convert "Created Date" column to datetime
complaints_clean["created_date"] = pd.to_datetime(
    complaints_clean["created_date"], format="%Y-%m-%d %H:%M:%S UTC"
)

# Convert "Closed Date" column to datetime
complaints_clean["closed_date"] = pd.to_datetime(
    complaints_clean["closed_date"], format="%Y-%m-%d %H:%M:%S UTC"
)

# Convert "City" and "Complaint Type" columns to string data type

complaints_clean["complaint_type"] = complaints_clean["complaint_type"].astype("string")
complaints_clean["descriptor"] = complaints_clean["descriptor"].astype("string")
complaints_clean["borough"] = complaints_clean["borough"].astype("string")
complaints_clean["agency"] = complaints_clean["agency"].astype("string")
complaints_clean["agency_name"] = complaints_clean["agency_name"].astype("string")

complaints_clean.dtypes

created_date      datetime64[ns]
closed_date       datetime64[ns]
complaint_type            string
descriptor                string
borough                   string
agency                    string
agency_name               string
dtype: object


def titlecase_vals(df, column_name):
    df[column_name] = df[column_name].str.title()
    df[column_name] = df[column_name].replace(
        {" Or ": " or ", " Of ": " of ", " And ": " and "}, regex=True
    )
    return df[column_name]


complaints_clean["complaint_type"] = titlecase_vals(complaints_clean, "complaint_type")
complaints_clean["descriptor"] = titlecase_vals(complaints_clean, "descriptor")
complaints_clean["borough"] = complaints_clean["borough"].str.title()


complaints_clean["agency_name"] = complaints_clean["agency_name"].replace(
    "Mayorâ€™s Office of Special Enforcement", "Mayor's Office of Special Enforcement"
)


complaints_clean.sort_values(
    ["created_date", "complaint_type"], ascending=[True, False], inplace=True
)
complaints_clean.reset_index(drop=True, inplace=True)


# Display preview ofthe cleaned data frame

tbl_format(
    complaints_clean.head(),
    "Table 5. Preview of the cleaned data frame used for data manipulation",
    col_names="actual",
    hide_index=True,
)


# Create a list containing only the most frequent types of complaints
major_complaints_list = (
    complaints_clean["complaint_type"].value_counts()[:10].index.tolist()
)

# Create a new data frame with only these complaint types
major_complaints = complaints_clean[
    complaints_clean["complaint_type"].isin(major_complaints_list)
]


print("\033[1m" + "Most Frequent Complaints Data Frame" + "\033[0m")
print("Number of rows: ", "{:,.0f}".format(major_complaints.shape[0]))

Most Frequent Complaints Data Frame
Number of rows:  1,815,615


def rate(df):

    # Set rate factor to 1,000 people
    rate_factor = 1000

    # Set populations for each borough according to US Census Bureau 2021 estimates
    bronx_population = 1424948
    brooklyn_population = 2641052
    manhattan_population = 1576876
    queens_population = 2331143
    staten_island_population = 493494

    # Calculate rate factor for each complaint type and borough

    if df["borough"] == "Bronx":
        return df["number_of_complaints"] * rate_factor / bronx_population
    elif df["borough"] == "Brooklyn":
        return df["number_of_complaints"] * rate_factor / brooklyn_population
    elif df["borough"] == "Manhattan":
        return df["number_of_complaints"] * rate_factor / manhattan_population
    elif df["borough"] == "Queens":
        return df["number_of_complaints"] * rate_factor / queens_population
    elif df["borough"] == "Staten Island":
        return df["number_of_complaints"] * rate_factor / staten_island_population
    else:
        return np.nan


def rate_all(df):

    # Set rate factor to 1,000 people
    rate_factor = 1000

    # Set populations for each borough according to US Census Bureau
    bronx_population = 1424948
    brooklyn_population = 2641052
    manhattan_population = 1576876
    queens_population = 2331143
    staten_island_population = 493494
    
    nyc_population = (bronx_population
                      + brooklyn_population
                      + manhattan_population
                      + queens_population
                      + staten_island_population)

    # Calculate rate factor for the entire City of New York
    return df["number_of_complaints"] * rate_factor / nyc_population


# Calculate total number of complaints by counting the occurences
nyc_year = (
    major_complaints["complaint_type"]
    .value_counts()
    .rename_axis(["complaint_type"])
    .reset_index(name="number_of_complaints")
)

# Add column with the calculated annual rate using the previously defined function
nyc_year["annual_rate_(per_1,000_people)"] = nyc_year.apply(rate_all, axis=1)

tbl_format(nyc_year, "Table 6. Annual rate of NYC's 10 most frequent complaints")


# Calculate total number of complaints based on borough
borough_year = (
    major_complaints.groupby("complaint_type")["borough"]
    .value_counts()
    .reset_index(name="number_of_complaints")
)

# Add column with the calculated annual rate using the previously defined function
borough_year["annual_rate_(per_1,000_people)"] = borough_year.apply(rate, axis=1)

# Create wide version for number of complaints according to borough
borough_complaints_wide = (
    pd.pivot(
        borough_year,
        index="borough",
        columns="complaint_type",
        values="number_of_complaints",
    )
    .rename_axis(None, axis=0)
    .rename_axis(None, axis=1)
)

tbl_format(
    borough_complaints_wide,
    "Table 7. NYC's 10 most frequent complaints according to borough",
    hide_index=False,
)


# Create wide version for complaint rate according to borough
borough_rate_wide = (
    pd.pivot(
        borough_year,
        index="borough",
        columns="complaint_type",
        values="annual_rate_(per_1,000_people)",
    )
    .astype(float)
    .rename_axis(None, axis=0)
    .rename_axis(None, axis=1)
)

tbl_format(
    borough_rate_wide,
    "Table 8. Complaint annual rate according to borough",
    hide_index=False,
)


# Calculate total number of complaints based on agency
agency_year = (
    major_complaints.groupby("agency_name")["complaint_type"]
    .value_counts()
    .reset_index(name="number_of_complaints")
    .sort_values("agency_name")
)


# Create a data frame for time series data
complaints_hourly = major_complaints.copy()

# Group by hour to count number of complaints
complaints_hourly = (
    complaints_hourly.groupby(
        [
            pd.Grouper(key="created_date", freq="H"),
            "complaint_type",
            "borough",
            "agency_name",
        ]
    )
    .size()
    .reset_index(name="number_of_complaints")
)

complaints_hourly.sort_values(
    ["created_date", "complaint_type"],
    ascending=[True, True],
    ignore_index=True,
    inplace=True,
)


hour_total = (
    complaints_hourly.groupby(["complaint_type"])
    .resample("H", on="created_date")
    .sum()
    .reset_index()
    .sort_values(by="created_date", ignore_index=True)
)


weekday_hour = hour_total.copy()
weekday_hour["weekday"] = weekday_hour["created_date"].dt.day_name()
weekday_hour["hour"] = weekday_hour["created_date"].dt.hour


weekday_hour = (
    weekday_hour.groupby(["weekday", "hour"])["number_of_complaints"]
    .aggregate("mean")
    .reset_index()
)

# Create list and category to set the order of weekdays
days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
weekday_hour["weekday"] = pd.Categorical(
    weekday_hour["weekday"], categories=days, ordered=True
)


weekday_hour_wide = (
    pd.pivot(
        weekday_hour,
        index="weekday",
        columns="hour",
        values="number_of_complaints",
    )
    .astype(float)
    .sort_values("weekday")
    .rename_axis(None, axis=0)
    .rename_axis(None, axis=1)
)
weekday_hour_wide = weekday_hour_wide.add_suffix(":00")
tbl_format(
    weekday_hour_wide,
    "Table 9. Averge number of complaints according to hour and day of the week",
    hide_index=False,
)


daily = (
    complaints_hourly.groupby(["complaint_type"])
    .resample("D", on="created_date")
    .sum()
    .reset_index()
    .sort_values(by="created_date", ignore_index=True)
)

days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
weekday = daily.copy()
weekday["weekday"] = weekday["created_date"].dt.day_name()

weekday["weekday"] = pd.Categorical(weekday["weekday"], categories=days, ordered=True)

weekday_avg = (
    weekday.groupby(["weekday", "complaint_type"])
    .mean("number_of_complaints")
    .reset_index()
)

weekday_avg_wide = (
    pd.pivot(
        weekday_avg,
        index="complaint_type",
        columns="weekday",
        values="number_of_complaints",
    )
    .astype(float)
    .rename_axis(None, axis=1)
    .rename_axis("complaint_type", axis=0)
)
tbl_format(
    weekday_avg_wide,
    "Table 10. Average number of complaints for each day of the week according to type",
    hide_index=False,
)


nyc_monthly = (
    complaints_hourly.groupby(["complaint_type"])
    .resample("M", on="created_date")
    .sum()
    .reset_index()
    .sort_values(by="created_date", ignore_index=True)
)

# Create data frame with months sorted for table
nyc_monthly_tbl = nyc_monthly.copy()

# Create category to keep month names ordered
months = [
    "Jan",
    "Feb",
    "Mar",
    "Apr",
    "May",
    "Jun",
    "Jul",
    "Aug",
    "Sep",
    "Oct",
    "Nov",
    "Dec",
]

# Extract month name from created date and rename column
nyc_monthly_tbl["created_date"] = (
    nyc_monthly_tbl["created_date"].dt.month_name().str[:3]
)
nyc_monthly_tbl.rename(columns={"created_date": "month"}, inplace=True)

# Assign category to sort months
nyc_monthly_tbl["month"] = pd.Categorical(
    nyc_monthly_tbl["month"], categories=months, ordered=True
)

# Rearrange columns back to original positions
nyc_monthly_tbl = nyc_monthly_tbl[
    list(("month", "complaint_type", "number_of_complaints"))
]

nyc_monthly_tbl.sort_values(by="month", inplace=True)
# Reshape the data frame
nyc_monthly_wide = (
    pd.pivot(
        nyc_monthly_tbl,
        index="month",
        columns="complaint_type",
        values="number_of_complaints",
    )
    .astype(float)
    # .reset_index()
    .rename_axis(None, axis=1)
    .rename_axis(None, axis=0)
)

tbl_format(
    nyc_monthly_wide,
    "Table 11. Monthly aggregation of complaints types",
    hide_index=False,
)


def create_proportion(df, n_slices, slices_col=None, subplot_col=None):

    # Calculate proportion of all slices, no subplots
    if subplot_col == None:
        df_proportion = (
            df[slices_col]
            .value_counts(normalize=True)
            # extract the string to rename column in results df
            .rename_axis([format(slices_col)])
            .reset_index(name="proportion")
            .sort_values("proportion", ascending=False)
        )

        # If there are more than 1 group left after n_slices,
        # group the remaining proportions under the category "Other"
        # by adding together all the rows after the n_slice row
        # and add them to last row
        if len(df_proportion["proportion"][n_slices:]) > 1:
            others_total = df_proportion["proportion"][n_slices:].sum()
            df_top = df_proportion[:n_slices].copy()
            df_top.loc[n_slices, :] = ["Other", others_total]

        # If there are no groups after n_slices
        else:
            df_top = df_proportion.copy()

    # If df has subplot groups (e.g. one group per subplot),
    # calculate slice proportions for each subplot group individually
    else:
        df_proportion = (
            df.groupby(subplot_col)[slices_col]
            .value_counts(normalize=True)
            .reset_index(name="proportion")
        )

        groups_list = df_proportion[subplot_col].unique().tolist()
        df_group = groups_list
        df_top = pd.DataFrame()

        # Create a temporary separate data frame for each subplot group
        # by filtering each subplot group
        for i in range(len(groups_list)):
            group_x = str(groups_list[i])
            df_group[i] = df_proportion.copy(deep=True)
            df_group[i] = df_group[i].loc[df_group[i][subplot_col] == group_x]

            # If there are more than 1 group left after n_slices,
            # group the remaining proportions under the "Other" group
            if len(df_group[i]["proportion"][n_slices:]) > 1:
                others_total = df_group[i]["proportion"][n_slices:].sum()
                df_group[i] = df_group[i][:n_slices].copy()
                df_group[i].loc[n_slices, :] = [group_x, "Other", others_total]

            df_top = pd.concat([df_top, df_group[i]])

            df_top = (
                df_top.sort_values([subplot_col, "proportion"], ascending=[True, False])
                .groupby(subplot_col)
                .head(50)
            )

    return df_top


# Create data frame for cycle time (days open)
processing_time = major_complaints.copy()

# Create variable to calculate the hours a complaint remained open
processing_time["processing_hours"] = (
    processing_time["closed_date"] - processing_time["created_date"]
).dt.total_seconds() / 3600  # must use total_seconds because it is timedelta


# Remove rows with NA values
processing_time.dropna(axis=0, how="any", thresh=None, subset=None, inplace=True)

# Remove rows with Negative values
processing_time = processing_time[processing_time["processing_hours"] >= 0]

# Use only the complaints that do not have outliers
processing_time = processing_time[processing_time["processing_hours"] <= 4320]

# Calculate number of rows in the base data frame
initial_rows = major_complaints.shape[0]

# Calculate number of rows in the cycle time data frame
cycle_rows= processing_time.shape[0]

print("\nNumber of rows in the base data frame:\n", "{:,.0f}".format(initial_rows))
print("\nNumber of rows in the cycle time data frame:\n", "{:,.0f}".format(cycle_rows))

Number of rows in the base data frame:
 1,815,615

Number of rows in the cycle time data frame:
 1,807,068


processing_time["processing_days"] = processing_time["processing_hours"] / 24


boroughs_days_avg = processing_time.copy()
boroughs_days_avg = (
    boroughs_days_avg.groupby(["complaint_type", "borough"])
    .mean("processing_days")
    .reset_index()
)


descriptor_days_avg = processing_time.copy()
descriptor_days_avg = (
    descriptor_days_avg.groupby(["complaint_type", "descriptor"])
    .mean("processing_days")
    .reset_index()
)


# Visualize most frequent complaint types
fig, ax = plt.subplots()
ax.set_ylabel("Number of Complaints", fontsize=12)
ax.set_title("Figure 1. NYC's 10 Most Frequent Complaints", fontsize=12)
plt.xticks(rotation=40, ha="right")
ax.set_facecolor("white")
ax.yaxis.set_major_formatter(mpl.ticker.StrMethodFormatter("{x:,.0f}"))

ax.bar(
    x=nyc_year["complaint_type"],
    height=nyc_year["number_of_complaints"],
    color="darkblue",
    width=0.7,
)
ax.grid(axis="y")


# cnfigure a new data frame to plot data  
nyc_monthly_plot =nyc_monthly.copy()

complaint_colors = [
    "black",
    "red",
    "grey",
    "orange",
    "yellow",
    "pink",
    "green",
    "darkgrey",
    "purple",
    "blue",
]

# Convert datetime to float so that lmplot is able to use it in the x axis
nyc_monthly_plot["created_date"] = nyc_monthly_plot["created_date"].apply(
    datetime.timestamp
)

monthly_plt = (sns.lmplot(data=nyc_monthly_plot, x='created_date', y='number_of_complaints',
            hue='complaint_type',
            order=3,
            scatter_kws={"s": 20},
            palette= complaint_colors,
            height=7, aspect=1.5,
            ci=None,
            scatter=False,
            hue_order=nyc_monthly_plot.groupby(
                'complaint_type').count().sort_values(by="complaint_type").index))
ax = plt.gca()
# fix xticks
#plt.title("Figure 2. NYC Number of Complaints per Month 2021")
ticks_loc = ax.get_xticks().tolist()
xticks = ax.get_xticks()
ax.set(ylim=(0, 50000))
ax.set_xlabel("Month", fontsize=14)
ax.set_ylabel("Number of Complaints", fontsize=14)
monthly_plt._legend.set_title('Complaint Type')
ax.yaxis.set_major_formatter(mpl.ticker.StrMethodFormatter("{x:,.0f}"))
ax.xaxis.set_major_locator(mticker.FixedLocator(ticks_loc))
ax.set_xticklabels([pd.to_datetime(tm, unit='s').strftime('%b') for tm in xticks])
ax.set_title("Figure 2. NYC Number of Complaints per Month 2021", fontsize=15)

plt.show(monthly_plt)


complaint_descriptors = create_proportion(
    major_complaints, 3, slices_col="descriptor", subplot_col="complaint_type"
)

complaint_agencies = create_proportion(
    major_complaints, 3, slices_col="complaint_type", subplot_col="agency_name"
)


n_groups = len(complaint_descriptors["complaint_type"].unique())
fig, axs = plt.subplots(
    nrows=4,
    ncols=3,
    figsize=(2 * n_groups, 24),
    subplot_kw={"aspect": "equal"},
)
fig.subplots_adjust(top=0.7)
axs = axs.flatten()

for ax, (group_name, df_group) in zip(axs, complaint_descriptors.groupby("complaint_type")):
    texts, autotexts, autopcts = ax.pie(
        df_group.proportion,
        labels=df_group.descriptor,
        counterclock=False,
        startangle=87,
        autopct="%1.0f%%",
    )
    ax.set_title(group_name)
    # ax.yaxis.label.set_color("darkblue")
    ax.set_title(group_name, weight='bold', color="darkred",fontsize=14)
    #[autotext.set_color("white") for autotext in autotexts]
    plt.setp(autopcts, **{'color':'white', 'weight':'bold', 'fontsize':9})
#fig.tight_layout(pad=2)
fig.subplots_adjust(hspace=.2)
axs[10].set_visible(False)
axs[11].set_visible(False)
plt.suptitle("Figure 3. Composition of NYC's Most Frequent Complaints", size=20, y=.75)

plt.show()


borough_total = (
    major_complaints["borough"]
    .value_counts()
    .rename_axis("borough")
    .reset_index(name="number_of_complaints")
)

tbl_format(
    borough_total,
    "Table 12. Annual rate according to borough and complaint type (per 1,000 people)",
)


ax = borough_complaints_wide.plot(
    kind="bar",
    width=0.7,
    edgecolor="white",
    linewidth=2,
    color=complaint_colors,
)
# ax = plt.axes()
plt.xticks(
    rotation=0, horizontalalignment="center", fontweight="light", fontsize="medium"
)
# ax.set_facecolor("pink")
plt.legend(fontsize=11)
plt.title("Figure 4. NYC Total Number of Complaints According to Borough 2021")
plt.xlabel("Borough", fontweight="bold")
plt.ylabel("Number  of Complaints")
ax.yaxis.set_major_formatter(mpl.ticker.StrMethodFormatter("{x:,.0f}"))
ax.get_legend().set_bbox_to_anchor((1, 1))

plt.show()


#

ax = borough_rate_wide.plot(
    kind="bar",
    width=0.7,
    edgecolor="white",
    linewidth=2,
    color=complaint_colors,
)
# ax = plt.axes()
plt.xticks(
    rotation=0, horizontalalignment="center", fontweight="light", fontsize="medium"
)

plt.legend(fontsize=11)
plt.title("Figure 5. NYC Complaint Rate (Per 1,000 People) 2021")
plt.xlabel("Borough", fontweight="bold")
plt.ylabel("Complaint Rate (per 1,000 people)")
ax.yaxis.set_major_formatter(mpl.ticker.StrMethodFormatter("{x:,.0f}"))
ax.get_legend().set_bbox_to_anchor((1, 1))

plt.show()


def highlight_greater20(x):

    col_name = x.name
    comparison = nyc_year.query("complaint_type == @col_name")[
        "annual_rate_(per_1,000_people)"
    ].iloc[0]

    highlight = "color: red;"
    greater20 = x > comparison + (0.3 * comparison)

    return [highlight if i else "" for i in greater20]


table = (
    borough_rate_wide.style.format("{:,.1f}")  # .rename_axis("Borough")
    .set_table_styles(styles)
    .set_properties(**{"text-align": "left"})
    .apply(highlight_greater20)
    .set_caption(
        "Table 13. Annual rate (per 1,000 people) according to borough and complaint type "
    )
)

table


agency_year = (
    major_complaints["agency"]
    .value_counts()
    .rename_axis("agency")
    .reset_index(name="number_of_complaints")
)

# Visualize most frequent complaint types
fig, ax = plt.subplots()
ax.set_ylabel("Number of Complaints", fontsize=12)
ax.set_title("Figure 6. NYC's Most Frequent Complaints Accordin to Agency", fontsize=12)
plt.xticks(rotation=30, ha="right")
ax.set_facecolor("white")
ax.yaxis.set_major_formatter(mpl.ticker.StrMethodFormatter("{x:,.0f}"))

ax.bar(
    x=agency_year["agency"],
    height=agency_year["number_of_complaints"],
    color="darkblue",
    width=0.7,
)
ax.grid(axis="y")


complaints_nypd = major_complaints.loc[(major_complaints["agency"] == "NYPD")]
# nypd = major_complaints["agency_name"] == "New York City Police Department"
nypd_proportion = create_proportion(complaints_nypd, 5, slices_col="complaint_type")


axes = nypd_proportion.plot(
    kind="pie",
    y="proportion",
    labels=nypd_proportion["complaint_type"],
    startangle=90,
    autopct="%.0f%%",
    legend=None,
    counterclock=False,
    # ax=axes[0],
    ylabel="",
    figsize=(12, 4),
)
axes.set_title("Figure 7. Complaints Directed to NYPD", color="black", fontsize=12)
# axes[1].set_title("Illegal Parking", color="r", fontsize=14)
plt.setp(autopcts, **{"color": "white", "weight": "bold", "fontsize": 9})
[text.set_color("red") for text in texts]
texts[0].set_color("blue")
[autotext.set_color("white") for autotext in autotexts]

plt.show()


fig, ax = plt.subplots(figsize=(12, 3))

from matplotlib.colors import TwoSlopeNorm

norm = TwoSlopeNorm(vcenter=25)

title = "Figure 8. Average Number of Complaints per Hour of the Day"

plt.title(title, fontsize=12)
t = ax.title
t.set_position([0.5, 1.05])


sns.heatmap(
    weekday_hour_wide,
    cmap="YlGnBu",
    linewidths=0.50,
    ax=ax,
    norm=norm,
)
plt.xticks(rotation=40)
plt.yticks(rotation=0)
plt.show()


fig, ax = plt.subplots(figsize=(10, 4))

from matplotlib.colors import TwoSlopeNorm

# Adjust intensity in heatmap
norm = TwoSlopeNorm(vcenter=70)

title = "Figure 9. Average Number of Complaints According to Type and Day of the Week"

plt.title(title, fontsize=11)
t = ax.title
t.set_position([0.5, 5])

sns.heatmap(
    weekday_avg_wide,
    cmap="YlGnBu",
    linewidths=0.30,
    ax=ax,
    norm=norm,
)
ax.set(ylabel=None)
plt.show()


# descriptive statistics for the number of hours that complaints remained open
cycle_stats = processing_time.groupby("complaint_type")["processing_hours"].describe()
cycle_stats.sort_values("mean", ascending=False, inplace=True)

tbl_format(
    cycle_stats,
    "Table 14. Descriptive statistics for complaint cycle time (hours)",
    hide_index=False,
)


# Create data frame for processing days
# of unsanitary conditions according to borough
unsanitary_days_borough = boroughs_days_avg[
    boroughs_days_avg["complaint_type"] == "Unsanitary Condition"
].sort_values("processing_days", ascending=False)

# Create data frame for processing days
# of unsanitary conditions according to descriptor
unsanitary_days_descriptors = descriptor_days_avg[
    descriptor_days_avg["complaint_type"] == "Unsanitary Condition"
].sort_values("processing_days", ascending=False)

fig, ax = plt.subplots(ncols=2, figsize=(9, 3))

plt1 = unsanitary_days_borough.plot.bar(
    x="borough", y="processing_days", color="blue", ax=ax[0], legend=None
)
plt2 = unsanitary_days_descriptors.plot.bar(
    x="descriptor", y="processing_days", color="red", ax=ax[1], legend=None
)

plt.suptitle("Figure 10. Unsanitary Condition Processing Days", size=12, y=1.05)

ax[0].set(ylabel="Processing Days")
ax[0].set_title("According to Borough", color="black", fontsize=9)
ax[0].set(xlabel=None)
ax[0].set_xticklabels(plt1.get_xticklabels(), rotation=40, ha="right")
ax[0].yaxis.set_ticks(np.arange(0, 30, 2))
ax[0].grid(axis="y")

ax[1].set_title("According to Descriptor", color="black", fontsize=9)
ax[1].set(xlabel=None)
ax[1].yaxis.set_ticks(np.arange(0, 30, 2))
ax[1].set_xticklabels(plt2.get_xticklabels(), rotation=40, ha="right")
ax[1].grid(axis="y")

plt.show()

created_date	closed_date	complaint_type	descriptor	borough	agency	agency_name
2021-09-05 21:47:24 UTC	nan	Mold	Public Complaint - Comm Location	STATEN ISLAND	DOHMH	Department of Health and Mental Hygiene
2021-02-04 11:46:00 UTC	2021-02-21 12:00:00 UTC	Snow	E9 Snow / Icy Sidewalk	BROOKLYN	DSNY	Department of Sanitation
2021-02-03 15:27:00 UTC	2021-02-05 12:00:00 UTC	Snow	E9 Snow / Icy Sidewalk	QUEENS	DSNY	Department of Sanitation
2021-02-08 13:23:00 UTC	2021-02-22 12:00:00 UTC	Snow	E9 Snow / Icy Sidewalk	BROOKLYN	DSNY	Department of Sanitation
2021-09-06 15:14:00 UTC	2021-09-07 12:00:00 UTC	Storm	2S Bulk-Storm Related	QUEENS	DSNY	Department of Sanitation

Column Name	Description
Created Date	Date SR was created
Closed Date	Date SR was closed by responding agency
Complaint Type	This is the first level of a hierarchy identifying the topic of the incident or condition
Descriptor	This is associated to the Complaint Type, and provides further detail on the incident or condition
Borough	Borough Block and Lot, provided by geovalidation
Agency	Acronym of responding City Government Agency
Agency Name	Full Agency name of responding City Government Agency

Complaint Type	Count
Noise - Residential	361,416
Illegal Parking	317,202
Request Large Bulky Item Collection	302,558
HEAT/HOT WATER	199,049
Noise - Street/Sidewalk	184,190
Blocked Driveway	152,264
UNSANITARY CONDITION	90,271
Noise - Vehicle	84,900
Street Condition	73,027
Street Light Condition	56,184
Water System	55,342
Noise - Commercial	53,982
PAINT/PLASTER	52,310
PLUMBING	51,839
Abandoned Vehicle	48,627

Complaint Type	Number of Complaints	Annual Rate (per 1,000 People)
Noise - Residential	361,411	42.7
Illegal Parking	317,193	37.5
Request Large Bulky Item Collection	302,555	35.7
Heat/Hot Water	199,044	23.5
Noise - Street/Sidewalk	184,185	21.8
Blocked Driveway	152,264	18.0
Unsanitary Condition	90,271	10.7
Noise - Vehicle	84,896	10.0
Street Condition	68,499	8.1
Water System	55,297	6.5

	Blocked driveway	Heat/hot water	Illegal parking	Noise - residential	Noise - street/sidewalk	Noise - vehicle	Request large bulky item collection	Street condition	Unsanitary condition	Water system
Bronx	27,010	69,573	53,469	116,882	57,536	24,221	22,356	8,088	30,423	12,801
Brooklyn	57,179	56,152	109,920	91,497	38,985	16,300	113,285	19,186	28,098	14,097
Manhattan	4,644	44,301	46,056	71,611	64,664	19,262	35,414	10,154	17,724	7,899
Queens	59,776	26,749	98,071	71,992	20,916	23,750	103,085	23,156	11,693	14,343
Staten Island	3,655	2,269	9,677	9,429	2,084	1,363	28,415	7,915	2,333	6,157

Data Analysis on NYC Service Requests After COVID-19's First Year¶

Table of Content¶

Project Objectives and Data Source ¶

Preview of the Data ¶

Environment ¶

Preview of the Original Dataset ¶

Attributes of the Dataset ¶

Data Cleaning ¶

Addressing Missing Values ¶

Formatting and Data Types ¶

Data Manipulation ¶

Most Frequent Complaints ¶

Complaint Rate per 1,000 People ¶

Aggregation of Complaint Types ¶

Creation of Time Series ¶

Group Proportions ¶

Data Frame: Complaint Cycle Time ¶

Data Analysis and Visualizations ¶

10 Most Frequent Complaints in New York City ¶

Seasonal Patterns in the Number of Complaints ¶

Descriptors' Proportion of NYC's Most Frequent Complaints ¶

Number of Complaints According to Borough ¶

Annual Complaint Rate According to Borough ¶

Number of Complaints According to Responsible Government Agency ¶

Heatmap: Level of Activity per Hour of the Day ¶

Heatmap: Level of Activity According to Type and Weekday ¶

Descriptive Statistics: Complaint Processing Time ¶

Conclusions and Recommendations ¶

References¶

Column	Na values
created_date	0
closed_date	110,211
complaint_type	0
descriptor	90,682
borough	16,317
agency	0
agency_name	0

created_date	closed_date	complaint_type	descriptor	borough	agency	agency_name
2021-01-01 00:00:00	2021-01-08 00:00:01	Food Poisoning	1 or 2	Brooklyn	DOHMH	Department of Health and Mental Hygiene
2021-01-01 00:00:04	2021-01-01 00:23:27	Noise - Commercial	Car/Truck Music	Brooklyn	NYPD	New York City Police Department
2021-01-01 00:00:17	2021-01-01 00:43:43	Illegal Fireworks		Brooklyn	NYPD	New York City Police Department
2021-01-01 00:00:24	2021-01-01 00:42:21	Illegal Fireworks		Brooklyn	NYPD	New York City Police Department
2021-01-01 00:00:49	2021-01-01 01:08:12	Noise - Residential	Loud Talking	Bronx	NYPD	New York City Police Department

	Blocked driveway	Heat/hot water	Illegal parking	Noise - residential	Noise - street/sidewalk	Noise - vehicle	Request large bulky item collection	Street condition	Unsanitary condition	Water system
Bronx	19.0	48.8	37.5	82.0	40.4	17.0	15.7	5.7	21.4	9.0
Brooklyn	21.7	21.3	41.6	34.6	14.8	6.2	42.9	7.3	10.6	5.3
Manhattan	2.9	28.1	29.2	45.4	41.0	12.2	22.5	6.4	11.2	5.0
Queens	25.6	11.5	42.1	30.9	9.0	10.2	44.2	9.9	5.0	6.2
Staten Island	7.4	4.6	19.6	19.1	4.2	2.8	57.6	16.0	4.7	12.5

	0:00	1:00	2:00	3:00	4:00	5:00	6:00	7:00	8:00	9:00	10:00	11:00	12:00	13:00	14:00	15:00	16:00	17:00	18:00	19:00	20:00	21:00	22:00	23:00
Monday	25.0	15.4	9.6	6.6	5.7	6.8	9.6	16.2	21.3	24.8	25.8	25.8	25.8	24.9	25.6	24.6	25.3	25.4	25.5	26.0	26.6	27.6	28.3	23.8
Tuesday	17.0	10.7	7.1	4.9	4.7	6.5	9.6	17.2	21.5	24.7	24.5	24.6	24.5	24.3	24.8	23.7	25.4	24.7	24.4	25.0	26.9	28.4	28.9	25.1
Wednesday	17.6	10.7	6.8	4.9	4.6	6.4	9.6	17.1	20.9	23.7	23.1	23.0	23.0	23.0	23.3	22.9	23.7	22.9	24.3	24.6	26.7	27.5	27.8	23.9
Thursday	17.9	10.7	7.1	4.9	4.8	6.4	9.2	16.4	20.9	24.6	23.1	24.0	23.3	22.5	22.8	21.9	22.8	22.2	22.7	23.4	25.1	27.4	29.5	25.3
Friday	19.6	12.5	8.0	5.7	5.2	6.4	9.3	16.1	20.7	23.6	23.0	23.7	23.1	22.3	22.1	21.8	22.1	22.7	22.9	24.0	26.1	30.7	35.5	36.4
Saturday	34.8	23.8	15.7	11.1	8.0	6.8	7.8	11.2	15.5	17.7	18.8	19.4	18.9	20.0	20.3	20.3	21.1	21.9	23.8	26.4	30.2	37.6	48.6	54.7
Sunday	51.3	38.7	25.3	16.9	11.1	8.3	8.1	11.0	14.5	17.9	20.2	20.9	21.2	21.9	22.7	22.7	23.9	24.4	26.0	28.0	31.1	35.1	37.5	33.6

	Monday	Tuesday	Wednesday	Thursday	Friday	Saturday	Sunday
Complaint Type
Blocked Driveway	420.3	404.3	414.0	419.2	424.6	423.2	422.5
Heat/Hot Water	632.3	593.1	509.8	505.2	553.8	520.3	513.2
Illegal Parking	865.8	893.3	881.1	909.2	896.8	809.2	844.5
Noise - Residential	780.4	690.4	660.0	672.1	899.6	1,616.3	1,631.4
Noise - Street/Sidewalk	400.2	336.1	342.3	359.7	482.2	777.6	843.9
Noise - Vehicle	195.5	187.9	192.6	197.0	232.0	299.4	328.2
Request Large Bulky Item Collection	1,035.8	965.1	920.3	872.5	750.2	518.0	756.5
Street Condition	216.2	231.4	231.1	211.3	180.2	116.5	130.6
Unsanitary Condition	313.4	315.0	291.8	274.3	251.4	148.6	141.5
Water System	157.7	173.3	176.7	168.4	162.9	114.7	109.6

	Blocked driveway	Heat/hot water	Illegal parking	Noise - residential	Noise - street/sidewalk	Noise - vehicle	Request large bulky item collection	Street condition	Unsanitary condition	Water system
Jan	10,478.0	35,308.0	19,314.0	27,428.0	3,915.0	3,461.0	23,917.0	4,927.0	6,033.0	4,081.0
Feb	12,797.0	28,885.0	18,315.0	21,899.0	3,300.0	2,474.0	16,812.0	4,965.0	5,773.0	2,616.0
Mar	10,881.0	19,933.0	23,384.0	24,471.0	9,818.0	5,978.0	28,905.0	7,094.0	6,021.0	3,896.0
Apr	11,047.0	12,126.0	24,621.0	28,301.0	14,381.0	8,149.0	27,104.0	6,644.0	5,580.0	3,162.0
May	12,811.0	7,155.0	28,813.0	36,245.0	25,046.0	11,638.0	27,739.0	6,383.0	5,832.0	3,625.0
Jun	12,899.0	2,733.0	28,628.0	35,574.0	29,349.0	10,932.0	29,205.0	6,914.0	7,025.0	7,948.0
Jul	12,245.0	2,314.0	25,548.0	41,602.0	25,126.0	9,003.0	28,759.0	5,952.0	8,235.0	8,462.0
Aug	12,359.0	2,256.0	26,325.0	32,561.0	21,728.0	8,867.0	28,630.0	5,263.0	8,864.0	7,403.0
Sep	13,778.0	5,338.0	30,389.0	38,055.0	22,156.0	9,051.0	27,911.0	6,218.0	10,885.0	4,306.0
Oct	14,731.0	12,572.0	32,592.0	31,291.0	17,787.0	7,250.0	23,661.0	4,956.0	9,891.0	3,275.0
Nov	14,541.0	40,630.0	30,645.0	22,379.0	6,612.0	4,078.0	19,864.0	4,166.0	9,036.0	3,117.0
Dec	13,697.0	29,794.0	28,619.0	21,605.0	4,967.0	4,015.0	20,048.0	5,017.0	7,096.0	3,406.0

Borough	Number of Complaints
Brooklyn	544,699
Queens	453,531
Bronx	422,359
Manhattan	321,729
Staten Island	73,297

	Count	Mean	Std	Min	25%	50%	75%	Max
Complaint Type
Unsanitary Condition	88,645.0	488.8	611.8	0.0	137.2	289.3	552.6	4,318.8
Street Condition	66,734.0	169.9	301.0	0.0	21.0	65.8	210.1	4,318.3
Request Large Bulky Item Collection	302,545.0	88.5	75.4	0.0	37.0	62.6	108.8	1,467.6
Water System	54,624.0	82.3	275.3	0.0	0.1	5.0	34.5	4,291.9
Heat/Hot Water	198,830.0	58.3	49.2	0.0	27.5	49.3	77.8	2,789.1
Noise - Residential	360,188.0	7.1	36.2	0.0	0.3	0.7	1.7	1,730.2
Blocked Driveway	151,648.0	1.6	3.0	0.0	0.5	0.9	1.8	165.4
Illegal Parking	316,095.0	1.4	5.0	0.0	0.4	0.8	1.5	1,601.2
Noise - Street/Sidewalk	183,349.0	1.2	2.7	0.0	0.3	0.6	1.3	161.9
Noise - Vehicle	84,410.0	1.2	2.5	0.0	0.3	0.6	1.2	162.4

City Government Agencies that responded to Service Requests
Department for the Aging (DFTA)
Department of Buildings (DOB)
Department of Consumer Affairs (DCA)
Department of Education (DOE)
Department of Environmental Protection (DEP)
Department of Health and Mental Hygiene (DOHMH)
Department of Homeless Services (DHS)
Department of Housing Preservation and Development (HPD)
Department of Information Technology and Telecommunications (DOITT)
Department of Parks and Recreation (DPR)
Department of Sanitation (DSNY)
Department of Transportation (DOT)
Economic Development Corporation (EDC)
Mayorâ€™s Office of Special Enforcement (MAYORâ€™S OFFICE OF SPECIAL ENFORCEMENT)
New York City Police Department (NYPD)
Taxi and Limousine Commission (TLC)