In real-world data analysis, your data will likely:
Fortunately, pandas can help you with all of this!
.filter()
# Select three columns from the full dataset and give them short names.
# NOTE(review): full_df is created outside this excerpt.
cols_to_keep = ['INSTNM', 'STABBR', 'GRAD_DEBT_MDN_SUPP']
debt_df = full_df.filter(cols_to_keep)
# Relabel the kept columns, in the same order as cols_to_keep.
debt_df.columns = ['name', 'state', 'debt']
debt_df.head()
.loc[] method (note the brackets)
.query() will “query” your dataset based on an expression
& (and) and | (or)
# Inside the query string, conditions are combined with & (and) and | (or).
# Keep rows whose debt is not the "PrivacySuppressed" marker and whose
# state is "TX".
tx_debt_df = debt_df.query('debt != "PrivacySuppressed" & state == "TX" ')
tx_debt_df.head()
# Alternatively, use an index-based method
# tx_debt_df = debt_df.loc[(debt_df['debt'] != 'PrivacySuppressed') & (debt_df['state'] == 'TX')]
# State abbreviations to keep in the subset.
states = ['OK', 'NM', 'TX', 'LA']
# Use `in @states` to get values in the list
# @ operator allows for use of variables in the query
# Keep rows whose debt is not the "PrivacySuppressed" marker and whose
# state appears in the `states` list above.
sw_debt_df = debt_df.query("debt != 'PrivacySuppressed' & state in @states")
sw_debt_df.head()
.assign() method
# Two ways to add a derived column.
# NOTE(review): df1 is defined outside this excerpt; it is assumed to have
# columns col1, col2 and col3 that support + and / -- confirm.

# With .assign()
df2 = df1.assign(col4 = df1.col1 + df1.col2)
# With index-based labeling
df2['col5'] = df2['col3'] / df2['col4']
df2.head()
dtype conversion: .astype() method
.dropna() method: delete all rows (or columns) that have any missing values (NaN in pandas)
.fillna() method: fill in missing data with a specified value
# The same cleaning steps as one method chain.
# NOTE(review): full_df is created outside this excerpt.
cols_to_keep = ['INSTNM', 'STABBR', 'GRAD_DEBT_MDN_SUPP']
states = ['OK', 'NM', 'TX', 'LA']
sw_debt_clean = (
    full_df
    # Keep only the columns of interest.
    .filter(cols_to_keep)
    # Rename them; set_axis returns a new frame, so it can be chained.
    .set_axis(['name', 'state', 'debt'], axis = 'columns')
    # Drop the "PrivacySuppressed" rows and keep the listed states.
    .query("debt != 'PrivacySuppressed' & state in @states")
    # Add a float copy of debt; the lambda receives the frame produced by
    # the previous step of the chain.
    .assign(debtnum = lambda x: x.debt.astype(float))
    # Remove rows that still contain missing values.
    .dropna()
)
pandas: .groupby() method!
Process:
.groupby() in pandas
seaborn
import seaborn as sns
# Apply seaborn's "darkgrid" style.
sns.set(style = "darkgrid")
# Box plot of debtnum with one box per state, drawn from sw_debt_clean.
sns.boxplot(x = 'state', y = 'debtnum', data = sw_debt_clean)
seaborn
# One panel per state, wrapping to a new row after every two panels.
grid = sns.FacetGrid(data = sw_debt_clean, col = 'state', col_wrap = 2)
# Draw a kernel density estimate of debtnum in each panel.
grid.map(sns.kdeplot, 'debtnum')
.merge() method in pandas
Merge types in pandas (how parameter): 'inner' (default), 'left', 'right', and 'outer'
from pandas_datareader import wb
# Country codes passed to the `country` argument below.
countries = ['ZA', 'BR', 'US']
# Download indicator SP.DYN.TFRT.IN for those countries with start = 1960
# and end = 2019, then move the returned index into ordinary columns.
# NOTE(review): presumably the total fertility rate, given the `tfr`
# name -- confirm against the indicator catalogue.
tfr = wb.download(indicator = 'SP.DYN.TFRT.IN',
country = countries, start = 1960,
end = 2019).reset_index()
tfr.head()
.pivot() method in pandas
# Reshape from long to wide: one row per year, one column per country,
# with the SP.DYN.TFRT.IN values in the cells.
tfr_wide = tfr.pivot(index = 'year', columns = 'country',
values = 'SP.DYN.TFRT.IN')
tfr_wide.head()
pd.melt() function in pandas
# Reshape back from wide to long. reset_index() turns the year index into
# a column so it can serve as the identifier; the country columns are
# stacked into a 'country' / 'tfr' pair of columns.
tfr_long = pd.melt(tfr_wide.reset_index(), id_vars = 'year',
var_name = 'country', value_name = 'tfr')
tfr_long.head()
# Convert year to integers before plotting.
# NOTE(review): presumably year arrives as strings from the download -- confirm.
tfr_long['year'] = tfr_long['year'].astype(int)
# Line plot of tfr against year, one colored line per country.
sns.lineplot(x = "year", y = "tfr",
hue = "country", data = tfr_long)