In real-world data analysis, your data will likely:
Fortunately, pandas can help you with all of this!
.filter()
cols_to_keep = ['INSTNM', 'STABBR', 'GRAD_DEBT_MDN_SUPP']
debt_df = full_df.filter(cols_to_keep)
debt_df.columns = ['name', 'state', 'debt']
debt_df.head()
.loc[] method (note the brackets)
.query() will “query” your
dataset based on an expression
& (and) and | (or)
tx_debt_df = debt_df.query('debt != "PrivacySuppressed" & state == "TX" ')
tx_debt_df.head()
# Alternatively, use an index-based method
# tx_debt_df = debt_df.loc[(debt_df['debt'] != 'PrivacySuppressed') & (debt_df['state'] == 'TX')]
states = ['OK', 'NM', 'TX', 'LA']
# Use `in @states` to get values in the list
# @ operator allows for use of variables in the query
sw_debt_df = debt_df.query("debt != 'PrivacySuppressed' & state in @states")
sw_debt_df.head()
.assign() method
# With .assign()
df2 = df1.assign(col4 = df1.col1 + df1.col2)
# With index-based labeling
df2['col5'] = df2['col3'] / df2['col4']
df2.head()
dtype conversion
.astype() method
.dropna() method: delete all rows (or columns) that
have any missing values (NaN in pandas)
.fillna() method: fill in missing data with a specified
value
cols_to_keep = ['INSTNM', 'STABBR', 'GRAD_DEBT_MDN_SUPP']
states = ['OK', 'NM', 'TX', 'LA']
sw_debt_clean = (full_df
.filter(cols_to_keep)
.set_axis(['name', 'state', 'debt'], axis = 'columns')
.query("debt != 'PrivacySuppressed' & state in @states")
.assign(debtnum = lambda x: x.debt.astype(float))
.dropna()
)
pandas: .groupby() method!
Process:
.groupby() in
pandas
seaborn
import seaborn as sns
sns.set(style = "darkgrid")
sns.boxplot(x = 'state', y = 'debtnum', data = sw_debt_clean)
seaborn
grid = sns.FacetGrid(data = sw_debt_clean, col = 'state', col_wrap = 2)
grid.map(sns.kdeplot, 'debtnum')
.merge() method in
pandas
pandas
how parameter):
'inner' (default), 'left',
'right', and 'outer'
from pandas_datareader import wb
countries = ['ZA', 'BR', 'US']
tfr = wb.download(indicator = 'SP.DYN.TFRT.IN',
country = countries, start = 1960,
end = 2019).reset_index()
tfr.head()
.pivot() method in pandas
tfr_wide = tfr.pivot(index = 'year', columns = 'country',
values = 'SP.DYN.TFRT.IN')
tfr_wide.head()
pd.melt() function in pandas
tfr_long = pd.melt(tfr_wide.reset_index(), id_vars = 'year',
var_name = 'country', value_name = 'tfr')
tfr_long.head()
tfr_long['year'] = tfr_long['year'].astype(int)
sns.lineplot(x = "year", y = "tfr",
hue = "country", data = tfr_long)