02-data.md

title: Working with Data
teaching: 20
exercises: 10
questions:
- "How should I work with numeric data in Python?"
- "What's the recommended way to handle and analyse tabular data?"
- "How can I import tabular data for analysis in Python and export the results?"
objectives:
- "handle and summarise numeric data with Numpy."
- "filter values in their data based on a range of conditions."
- "load tabular data into a Pandas dataframe object."
- "describe what is meant by the data type of an array/series, and the impact this has on how the data is handled."
- "add and remove columns from a dataframe."
- "select, aggregate, and visualise data in a dataframe."
keypoints:
- "Specialised third-party libraries such as Numpy and Pandas provide powerful objects and functions that can help us analyse our data."
- "Pandas dataframe objects allow us to efficiently load and handle large tabular data."
- "Use the `pandas.read_csv` function and the `DataFrame.to_csv` method to read and write tabular data."
# Load an image and a second image that is used as a mask, inspect the first
# one, then use the mask to select and to blank out pixels.
import numpy as np  # np.max / np.min / np.median are used below
from skimage.io import imread
raw = imread('cilliated_cell.png')
nuclei = imread('cilliated_cell_nuclei.png')
# if you want to see what these images look like - we can use matplotlib (more to come later!)
import matplotlib.pyplot as plt
plt.imshow(raw, cmap='gray')
plt.imshow(nuclei)
# Basic facts about the raw array: dimensions, element type and value range.
print(raw.shape)
print(raw.dtype)
print(np.max(raw))
print(np.min(raw))
# 1: median of the raw pixel values at the positions where the mask equals 1
# (assumes `nuclei` marks the pixels of interest with the value 1 - TODO confirm
# against the image file).
pixels_in_nuclei = raw[nuclei == 1]
print(np.median(pixels_in_nuclei))

# 2: a copy of the raw image in which every pixel where the mask equals 0 is
# set to 0; working on a copy leaves `raw` itself unchanged.
new_image = raw.copy()
new_image[nuclei == 0] = 0
plt.imshow(new_image, cmap='gray')
# Load the case data into a pandas DataFrame and take a first look at it.
import pandas as pd
covid_cases = pd.read_csv("data/CovidCaseData_20200624.csv")
# Bare expressions like the next two only display something in an interactive
# session (notebook/REPL): the first and the last rows of the table.
covid_cases.head()
covid_cases.tail()
# .shape is (number of rows, number of columns); .columns holds the column labels.
print(f'covid_cases is a {type(covid_cases)} object with {covid_cases.shape[0]} rows and {covid_cases.shape[1]} columns')
print('covid_cases has the following columns:\n' + '\n'.join(covid_cases.columns))
# Position-based selection with .iloc[row_position, column_position]:
# a single cell ...
print(covid_cases.iloc[24242,4])
# ... a whole row (the next two spellings are equivalent) ...
print(covid_cases.iloc[24242,])
print(covid_cases.iloc[24242,:])
# ... rows 100-119 of columns 4-5 (the end of a positional slice is exclusive) ...
print(covid_cases.iloc[100:120,4:6])
# ... and the same rows with every column.
print(covid_cases.iloc[100:120,:])

# Label-based selection with .loc[row_label, column_label]:
# select a whole row
print(covid_cases.loc[0,:])
# or
print(covid_cases.loc[0,])

# select a whole column
print(covid_cases.loc[:,'continentExp'])
# or
print(covid_cases['continentExp'])
# or as an attribute (only possible when the column name is a valid Python
# identifier that does not clash with an existing DataFrame attribute)
print(covid_cases.continentExp)
# The type of a single selected column.
type(covid_cases['continentExp'])
# The distinct values that occur in the column.
pd.unique(covid_cases['continentExp'])
# An element-wise comparison produces one True/False value per row ...
covid_cases['continentExp'] == 'Europe'
# ... which can be used as a mask to keep only the matching rows; .max() then
# reports the maximum of each column within that selection.
covid_cases[covid_cases['continentExp'] == 'Europe'].max()
# Exercise solutions: daily case counts for the rows whose country code is 'DEU'.

# 1: reporting date of the row with the highest case count
mask_germany = covid_cases['countryterritoryCode'] == 'DEU'
id_max = covid_cases.loc[mask_germany, 'cases'].idxmax()
# idxmax() returns an index *label*, so the row is looked up with the
# label-based .loc; the position-based .iloc only gives the same row while the
# frame still has its default 0..n-1 index.
print(covid_cases.loc[id_max, 'dateRep'])

# 2: mean of the case counts in April 2020
mask_april = (covid_cases['year'] == 2020) & (covid_cases['month'] == 4)
mean_april = covid_cases.loc[mask_germany & mask_april, 'cases'].mean()
print(mean_april)

# 3: mean of the case counts in March 2020, and how April compares to it
mask_march = (covid_cases['year'] == 2020) & (covid_cases['month'] == 3)
mean_march = covid_cases.loc[mask_germany & mask_march, 'cases'].mean()
print(mean_march)
# A conditional expression is used instead of indexing a two-element list with
# the comparison result: the comparison may yield a NumPy boolean, and NumPy
# has deprecated using those as sequence indices.
comparison = "higher" if mean_april > mean_march else "lower"
print(f"Mean cases per day was {comparison} in April than in March 2020.")

# 4: number of March 2020 rows whose case count exceeds the April mean
mask_higher_mean_april = covid_cases['cases'] > mean_april
selection = covid_cases[mask_germany & mask_march & mask_higher_mean_april]
# Counting rows equals counting days only if there is exactly one row per day
# (i.e. the data are assumed to be clean).
nbr_days = len(selection)
print(nbr_days)
# Lockdown tables, one CSV file per continent; index_col=0 turns the first
# column of each file into the row index.
asia_lockdowns = pd.read_csv('data/AsiaLockdowns.csv', index_col=0)
africa_lockdowns = pd.read_csv('data/AfricaLockdowns.csv', index_col=0)
# read_csv was not asked to parse dates, so at this point max() is taken over
# the raw column values; the dtype printed below shows how they are stored.
latest_date = covid_cases['dateRep'].max()
print(latest_date)
print(covid_cases['dateRep'].dtype)
# Display-only in an interactive session: the parsed column, not yet stored.
pd.to_datetime(covid_cases['dateRep'], dayfirst=True)
# dayfirst=True is necessary because by default pandas reads mm/dd/yyyy dates :(
covid_cases['dateRep'] = pd.to_datetime(covid_cases['dateRep'], dayfirst=True)
print(covid_cases['dateRep'].max())
# Fill missing lockdown end dates with the most recent reporting date.
# The call is wrapped in parentheses so it can span two lines; a line that
# ends in a bare "." outside brackets is a syntax error.
# NOTE(review): `covid_lockdowns` is not created in the code visible here -
# presumably it is assembled from the per-continent tables; confirm.
covid_lockdowns['End date'] = covid_lockdowns['End date'].fillna(
    covid_cases['dateRep'].max()
)
# Cases relative to population: cases divided by the 2019 population in millions.
covid_cases['casesPerMillion'] = covid_cases['cases'] / (covid_cases['popData2019'] / 1e6)
covid_cases.head()
# Give the lockdown table's index the same name as the country column of
# covid_cases, so that merge(on=...) can match that index level to the column.
covid_lockdowns.index.name = 'countriesAndTerritories'
# Display-only in an interactive session; the result is stored on the next line.
covid_cases.merge(covid_lockdowns, on="countriesAndTerritories")
combined = covid_cases.merge(covid_lockdowns, on="countriesAndTerritories")
# Sort by country and date so that each country's rows are in date order
# before the rolling window is applied.
combined = combined.sort_values(by=['countriesAndTerritories', 'dateRep'])
# Mean over a window of 7 consecutive rows (presumably 7 days - one row per
# day is assumed), computed separately for every country.
rolling_mean_cases = combined.groupby('countriesAndTerritories')['cases'].rolling(7).mean()
# The grouped rolling result is indexed by (country, original row label).
# Drop the country level first: storing the Series as a column before this
# step fails because its index does not match the index of `combined`.
rolling_mean_cases = rolling_mean_cases.reset_index(0, drop=True)
combined['rolling mean'] = rolling_mean_cases

# Rows for Germany, indexed by reporting date so the date ends up on the x-axis.
germany = combined[combined['countriesAndTerritories'] == 'Germany'].set_index('dateRep')
# plot rolling average for Germany
germany['rolling mean'].plot(kind='line')
# plot cumulative sum of cases for Germany
germany['cases'].cumsum().plot(kind='line')
# Label both date collections with the country of the rows selected by
# `peak_rows`, so that the subtraction below pairs values country by country.
peak_countries = combined.loc[peak_rows]['countriesAndTerritories']
peak_dates.index = peak_countries
start_dates.index = peak_countries
# Median of the per-country differences between peak date and start date.
time_to_peak = peak_dates - start_dates
print(time_to_peak.median())
def get_days(t):
    """Return the ``days`` attribute of *t*.

    *t* can be any object exposing ``.days``, such as a time difference.
    """
    whole_days = t.days
    return whole_days

# Box plot of the peak-minus-start differences, each reduced to its number of
# days with get_days.
durations = peak_dates - start_dates
durations.apply(get_days).plot(kind='box')