"""Prepares and explores the COVID school closures dataset downloaded from
https://docs.google.com/spreadsheets/d/1ndHgP53atJ5J-EtxgWcpSfYG8LdzHpUsnb6mWybErYg
in line with the COMP0034 'How to' guide for data preparation and exploration

Usage:
    ./data_prep.py

Author:
    Sarah Sanders - 23.05.2022
"""
import pandas as pd
import matplotlib as plt


def set_pandas_display_options(dataframe):
    """Set the pandas display options to the size of the dataframe

    Parameters
    ----------
    dataframe : pandas DataFrame
        DataFrame containing the raw data.
    """
    pd.set_option("display.max_rows", len(dataframe.index))
    pd.set_option("display.max_columns", len(dataframe.columns))
    pd.set_option('display.width', 150)


def print_summary_info(dataframe):
    """Print summary information for a dataframe

    Parameters
    ----------
    dataframe : pandas DataFrame
        DataFrame containing the raw data.
    """

    print("Number of rows and columns")
    print(dataframe.shape)

    print("First 5 rows")
    print(dataframe.head(5))

    print("Column labels")
    print(dataframe.columns)

    print("Column labels and data types")
    print(dataframe.info(verbose=True))


def remove_data(datafame):
    """Keeps only the needed columns and rows as per the 'how to' guide """
    col_list = [
        "Country",
        "Code",
        "Region",
        "Income Group",
        "School Closures",
        "Closure extended?",
        "Date",
        "Number of confirmed cases at time of closure",
        "Number of cases at re-opening",
        "Total weeks FULLY CLOSED (excluding academic breaks) [red=need to verify breaks]"
    ]
    df = datafame[col_list]

    # Keep only rows where country is not null
    df = df[df["Country"].notna()]
    df.reset_index(drop=True)
    return df


def find_nulls(dataframe):
    """ Finds and prints info on nulls and NaNs. """
    # Count null values
    print("Counts of isnull and isna\n=========================\n")
    print(dataframe.isnull().sum())
    # print(dataframe.isna().sum())

    # Identify rows and columns with null values
    df_cols_with_na = dataframe.loc[:, dataframe.isna().any()]
    df_rows_with_na = df_cols_with_na[df_cols_with_na.isnull().any(axis=1)]
    print("All columns and rows with null values\n=====================\n")
    print(df_rows_with_na)


def inconsistent_data_example():
    """ Example illustrating inconsistent data """
    # Inconsistent data
    data = [
        ["Rio", "M", 31, "IT"],
        ["Maz", "M", 32, "IT"],
        ["Tom", "M", 35, "I T"],
        ["Nick", "M", 34, "IT"],
        ["Julius", "m", 340, "IT "],
        ["Julia", "F", 32, " IT"],
        ["Tom", "male", 35, "IT"],
        ["Nicola", "F", 31, "IT "],
        ["Julia", "F", 31, "IT"],
        ["Julia", "F", 31, "IT"],
    ]

    df_names = pd.DataFrame(data, columns=["Name", "Gender", "Age", "Department"])

    # Find unique values
    print("Unique values for the gender field:\n")
    print(df_names["Gender"].unique())

    # Count the times a value appears
    print("Count values for the gender field:")
    print(df_names["Gender"].value_counts())

    # Replace values
    print("Replace values for the gender field (m > M, male > M):")
    df_names["Gender"] = df_names["Gender"].replace({"m": "M", "male": "M"})
    print(df_names["Gender"].unique())

    # Find values outside an accepted range
    print("Find ages greater than 100")
    print(df_names[df_names["Age"] > 100])

    # Find duplicates
    print("Find duplicate rows")
    print(df_names.duplicated())

    # Drop duplicates
    print("Dataframe contents before dropping duplicates:\n")
    print(df_names)
    df_names = df_names.drop_duplicates()
    print("Dataframe contents after dropping duplicates:\n")
    print(df_names)

    # Remove blank spaces
    print("Dataframe contents after removing blank spaces in Department name:\n")
    print(df_names["Department"])
    df_names["Department"].str.strip()
    # df_names['Department'].str.replace(' ', '')
    print("Dataframe contents after removing blank spaces in Department name:\n")
    print(df_names["Department"])


if __name__ == "__main__":
    # TODO: Scope to create further functions and reduce the procedural code in main!

    URL = "https://docs.google.com/spreadsheets/d/1ndHgP53atJ5J-EtxgWcpSfYG8LdzHpUsnb6mWybErYg/export?gid=0&format=xlsx"

    # Load data into a dataframe
    # df_from_csv = pd.read_csv('cgd.csv', skiprows=1)
    df_cgd = pd.read_excel('cgd.xlsx', sheet_name='School closure tracker- Global', skiprows=1)
    # df_cgd = pd.read_excel(URL, sheet_name="School closure tracker- Global", skiprows=1)
    ''' cols = [
        "Country",
        "Code",
        "Region",
        "Income Group",
        "School Closures",
        "Closure extended?",
        "Date",
        "Number of confirmed cases at time of closure",
        "Number of cases at re-opening",
    ]
    df_cgd = pd.read_excel("cgd.xlsx",sheet_name="School closure tracker- Global", skiprows=1, usecols=cols)'''

    # Merge with the total weeks fully closed column from the ClosureOpening Dates (Global) worksheet
    col_list = ['Country', 'Total weeks FULLY CLOSED (excluding academic breaks) [red=need to verify breaks]']
    df_closure = pd.read_excel('cgd.xlsx', sheet_name="ClosureOpening Dates (Global)", skiprows=1, usecols=col_list)
    df_cgd = pd.merge(df_cgd, df_closure, on='Country', how='outer')

    set_pandas_display_options(df_cgd)

    print_summary_info(df_cgd)

    # Remove unwanted columns and rows where country is null
    df_cgd = remove_data(df_cgd)

    find_nulls(df_cgd)

    # Fill rows where 'Closure extended?' column is null with 'Unknown'
    df_cgd["Closure extended?"] = df_cgd["Closure extended?"].fillna("Unknown")

    # Handle classified data
    df_cgd["Closure extended?"] = df_cgd["Closure extended?"].replace({"Yes": 1, "No": 2, "Unknown": 3})

    # Rename some of the columns with long names for ease
    df_cgd = df_cgd.rename(columns={
        "Date": "Closure date", "Number of confirmed cases at time of closure": "Cases at closure",
        "Number of cases at re-opening": "Cases at re-opening",
        "Total weeks FULLY CLOSED (excluding academic breaks) [red=need to verify breaks]": "Weeks closed"
    })

    # Save prepared data set
    df_cgd.to_excel('cgd_cleaned.xlsx', index=False)
COMP0034/COMP0035 tutorials, code & guides

For use in COMP0034 and COMP0035