"""Prepares and explores the COVID school closures dataset downloaded from
https://docs.google.com/spreadsheets/d/1ndHgP53atJ5J-EtxgWcpSfYG8LdzHpUsnb6mWybErYg
in line with the COMP0034 'How to' guide for data preparation and exploration
Usage:
./data_prep.py
Author:
Sarah Sanders - 23.05.2022
"""
import pandas as pd
import matplotlib as plt
def set_pandas_display_options(dataframe):
"""Set the pandas display options to the size of the dataframe
Parameters
----------
dataframe : pandas DataFrame
DataFrame containing the raw data.
"""
pd.set_option("display.max_rows", len(dataframe.index))
pd.set_option("display.max_columns", len(dataframe.columns))
pd.set_option('display.width', 150)
def print_summary_info(dataframe):
"""Print summary information for a dataframe
Parameters
----------
dataframe : pandas DataFrame
DataFrame containing the raw data.
"""
print("Number of rows and columns")
print(dataframe.shape)
print("First 5 rows")
print(dataframe.head(5))
print("Column labels")
print(dataframe.columns)
print("Column labels and data types")
print(dataframe.info(verbose=True))
def remove_data(datafame):
"""Keeps only the needed columns and rows as per the 'how to' guide """
col_list = [
"Country",
"Code",
"Region",
"Income Group",
"School Closures",
"Closure extended?",
"Date",
"Number of confirmed cases at time of closure",
"Number of cases at re-opening",
"Total weeks FULLY CLOSED (excluding academic breaks) [red=need to verify breaks]"
]
df = datafame[col_list]
# Keep only rows where country is not null
df = df[df["Country"].notna()]
df.reset_index(drop=True)
return df
def find_nulls(dataframe):
""" Finds and prints info on nulls and NaNs. """
# Count null values
print("Counts of isnull and isna\n=========================\n")
print(dataframe.isnull().sum())
# print(dataframe.isna().sum())
# Identify rows and columns with null values
df_cols_with_na = dataframe.loc[:, dataframe.isna().any()]
df_rows_with_na = df_cols_with_na[df_cols_with_na.isnull().any(axis=1)]
print("All columns and rows with null values\n=====================\n")
print(df_rows_with_na)
def inconsistent_data_example():
""" Example illustrating inconsistent data """
# Inconsistent data
data = [
["Rio", "M", 31, "IT"],
["Maz", "M", 32, "IT"],
["Tom", "M", 35, "I T"],
["Nick", "M", 34, "IT"],
["Julius", "m", 340, "IT "],
["Julia", "F", 32, " IT"],
["Tom", "male", 35, "IT"],
["Nicola", "F", 31, "IT "],
["Julia", "F", 31, "IT"],
["Julia", "F", 31, "IT"],
]
df_names = pd.DataFrame(data, columns=["Name", "Gender", "Age", "Department"])
# Find unique values
print("Unique values for the gender field:\n")
print(df_names["Gender"].unique())
# Count the times a value appears
print("Count values for the gender field:")
print(df_names["Gender"].value_counts())
# Replace values
print("Replace values for the gender field (m > M, male > M):")
df_names["Gender"] = df_names["Gender"].replace({"m": "M", "male": "M"})
print(df_names["Gender"].unique())
# Find values outside an accepted range
print("Find ages greater than 100")
print(df_names[df_names["Age"] > 100])
# Find duplicates
print("Find duplicate rows")
print(df_names.duplicated())
# Drop duplicates
print("Dataframe contents before dropping duplicates:\n")
print(df_names)
df_names = df_names.drop_duplicates()
print("Dataframe contents after dropping duplicates:\n")
print(df_names)
# Remove blank spaces
print("Dataframe contents after removing blank spaces in Department name:\n")
print(df_names["Department"])
df_names["Department"].str.strip()
# df_names['Department'].str.replace(' ', '')
print("Dataframe contents after removing blank spaces in Department name:\n")
print(df_names["Department"])
if __name__ == "__main__":
# TODO: Scope to create further functions and reduce the procedural code in main!
URL = "https://docs.google.com/spreadsheets/d/1ndHgP53atJ5J-EtxgWcpSfYG8LdzHpUsnb6mWybErYg/export?gid=0&format=xlsx"
# Load data into a dataframe
# df_from_csv = pd.read_csv('cgd.csv', skiprows=1)
df_cgd = pd.read_excel('cgd.xlsx', sheet_name='School closure tracker- Global', skiprows=1)
# df_cgd = pd.read_excel(URL, sheet_name="School closure tracker- Global", skiprows=1)
''' cols = [
"Country",
"Code",
"Region",
"Income Group",
"School Closures",
"Closure extended?",
"Date",
"Number of confirmed cases at time of closure",
"Number of cases at re-opening",
]
df_cgd = pd.read_excel("cgd.xlsx",sheet_name="School closure tracker- Global", skiprows=1, usecols=cols)'''
# Merge with the total weeks fully closed column from the ClosureOpening Dates (Global) worksheet
col_list = ['Country', 'Total weeks FULLY CLOSED (excluding academic breaks) [red=need to verify breaks]']
df_closure = pd.read_excel('cgd.xlsx', sheet_name="ClosureOpening Dates (Global)", skiprows=1, usecols=col_list)
df_cgd = pd.merge(df_cgd, df_closure, on='Country', how='outer')
set_pandas_display_options(df_cgd)
print_summary_info(df_cgd)
# Remove unwanted columns and rows where country is null
df_cgd = remove_data(df_cgd)
find_nulls(df_cgd)
# Fill rows where 'Closure extended?' column is null with 'Unknown'
df_cgd["Closure extended?"] = df_cgd["Closure extended?"].fillna("Unknown")
# Handle classified data
df_cgd["Closure extended?"] = df_cgd["Closure extended?"].replace({"Yes": 1, "No": 2, "Unknown": 3})
# Rename some of the columns with long names for ease
df_cgd = df_cgd.rename(columns={
"Date": "Closure date", "Number of confirmed cases at time of closure": "Cases at closure",
"Number of cases at re-opening": "Cases at re-opening",
"Total weeks FULLY CLOSED (excluding academic breaks) [red=need to verify breaks]": "Weeks closed"
})
# Save prepared data set
df_cgd.to_excel('cgd_cleaned.xlsx', index=False)