Visualize Author Gender By Year (Solution)

seaborn
plotly
interactive
line-plot
bar-plot
groupby
python
solution
exercise
Published

February 26, 2024

Import pandas

Code
import pandas as pd

Read in CSV

Code
aa_df = pd.read_csv("https://raw.githubusercontent.com/melaniewalsh/responsible-datasets-in-context/main/datasets/aa-periodical-poetry/African-American-Periodical-Poetry_1900-1928-Created-by-Amardeep-Singh-and-Kate-Hennessey,-Lehigh-University.csv")

View first 5 rows

Code
aa_df.head()
aa_df['gender (if known)'].value_counts()
gender (if known)
male      657
female    317
Name: count, dtype: int64

Convert year to datetime value

Code
aa_df['year'] = pd.to_datetime(aa_df['year'], format = "%Y")

#aa_df.set_index('year').reindex(pd.date_range('1900-01-01', '1930-12-31', freq='Y'))
      
# Sort years and return unique years to check for missing data
aa_df['year'].sort_values().unique()
<DatetimeArray>
['1900-01-01 00:00:00', '1901-01-01 00:00:00', '1902-01-01 00:00:00',
 '1903-01-01 00:00:00', '1904-01-01 00:00:00', '1905-01-01 00:00:00',
 '1906-01-01 00:00:00', '1907-01-01 00:00:00', '1908-01-01 00:00:00',
 '1909-01-01 00:00:00', '1910-01-01 00:00:00', '1911-01-01 00:00:00',
 '1912-01-01 00:00:00', '1913-01-01 00:00:00', '1914-01-01 00:00:00',
 '1915-01-01 00:00:00', '1916-01-01 00:00:00', '1917-01-01 00:00:00',
 '1918-01-01 00:00:00', '1919-01-01 00:00:00', '1920-01-01 00:00:00',
 '1921-01-01 00:00:00', '1922-01-01 00:00:00', '1923-01-01 00:00:00',
 '1924-01-01 00:00:00', '1925-01-01 00:00:00', '1926-01-01 00:00:00',
 '1927-01-01 00:00:00', '1928-01-01 00:00:00']
Length: 29, dtype: datetime64[ns]

Group by year, count instances of author by gender

Code
aa_df.groupby('year')['gender (if known)'].value_counts()
year        gender (if known)
1900-01-01  male                  5
            female                1
1901-01-01  male                 31
            female                3
1902-01-01  male                  5
            female                1
1903-01-01  male                 15
            female                6
1904-01-01  male                 12
1905-01-01  male                 44
            female               24
1906-01-01  male                 55
            female               22
1907-01-01  male                 37
            female                8
1908-01-01  male                 10
            female                1
1909-01-01  male                 14
            female                2
1910-01-01  male                  4
            female                1
1911-01-01  male                  5
1912-01-01  male                  3
            female                2
1913-01-01  male                  7
1914-01-01  male                  6
            female                1
1915-01-01  male                  8
            female                3
1916-01-01  female               11
            male                  8
1917-01-01  female               11
            male                 11
1918-01-01  male                  8
            female                4
1919-01-01  male                 14
            female                7
1920-01-01  male                  8
            female                7
1921-01-01  male                 30
            female                6
1922-01-01  female                8
            male                  5
1923-01-01  male                 22
            female                8
1924-01-01  female               32
            male                 26
1925-01-01  male                 57
            female               27
1926-01-01  male                 47
            female               30
1927-01-01  male                 94
            female               58
1928-01-01  male                 66
            female               33
Name: count, dtype: int64

Make this grouping into a dataframe

Code
aa_gender_by_year = aa_df.groupby('year')['gender (if known)'].value_counts().reset_index()

#aa_gender_by_year['year'] = aa_gender_by_year['year'].astype(int)

aa_gender_by_year
year gender (if known) count
0 1900-01-01 male 5
1 1900-01-01 female 1
2 1901-01-01 male 31
3 1901-01-01 female 3
4 1902-01-01 male 5
5 1902-01-01 female 1
6 1903-01-01 male 15
7 1903-01-01 female 6
8 1904-01-01 male 12
9 1905-01-01 male 44
10 1905-01-01 female 24
11 1906-01-01 male 55
12 1906-01-01 female 22
13 1907-01-01 male 37
14 1907-01-01 female 8
15 1908-01-01 male 10
16 1908-01-01 female 1
17 1909-01-01 male 14
18 1909-01-01 female 2
19 1910-01-01 male 4
20 1910-01-01 female 1
21 1911-01-01 male 5
22 1912-01-01 male 3
23 1912-01-01 female 2
24 1913-01-01 male 7
25 1914-01-01 male 6
26 1914-01-01 female 1
27 1915-01-01 male 8
28 1915-01-01 female 3
29 1916-01-01 female 11
30 1916-01-01 male 8
31 1917-01-01 female 11
32 1917-01-01 male 11
33 1918-01-01 male 8
34 1918-01-01 female 4
35 1919-01-01 male 14
36 1919-01-01 female 7
37 1920-01-01 male 8
38 1920-01-01 female 7
39 1921-01-01 male 30
40 1921-01-01 female 6
41 1922-01-01 female 8
42 1922-01-01 male 5
43 1923-01-01 male 22
44 1923-01-01 female 8
45 1924-01-01 female 32
46 1924-01-01 male 26
47 1925-01-01 male 57
48 1925-01-01 female 27
49 1926-01-01 male 47
50 1926-01-01 female 30
51 1927-01-01 male 94
52 1927-01-01 female 58
53 1928-01-01 male 66
54 1928-01-01 female 33
Code
# If there were missing years, this is how you might handle it
# Create a dataframe with a complete range of years from 1900 to 1930
# years = pd.DataFrame({'year': pd.date_range(start='1900', end='1930', freq='YS')})

# Merge with existing dataframe
# aa_gender_by_year = pd.merge(years, aa_gender_by_year, on='year', how='left')

# Fill NaN values
#aa_gender_by_year['gender (if known)'] = aa_gender_by_year['gender (if known)'].fillna("no author recorded")

#aa_gender_by_year['count'] = aa_gender_by_year['count'].fillna(0)

# Sort years and return unique years to check for missing data
#aa_gender_by_year['year'].sort_values().unique()

#aa_gender_by_year

Visualize with seaborn

Code
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# Create the line plot
ax = sns.lineplot(data=aa_gender_by_year, x="year", y="count", hue="gender (if known)")

# Set x-axis major ticks to every 5 years
ax.xaxis.set_major_locator(mdates.YearLocator(5))

# Format the x-axis labels to show the year only
#ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))

Code
from matplotlib.ticker import MultipleLocator

plt.figure(figsize=(13.7, 8.27))

# Create the bar plot
ax = sns.barplot(data=aa_gender_by_year, 
# pull out just the year
x=aa_gender_by_year["year"].dt.year, y="count", hue="gender (if known)")

# Set x-axis major ticks every 1 year
ax.xaxis.set_major_locator(MultipleLocator(1))

# Set x-axis labels to the desired format (only show every 5th year)
ax.set_xticklabels([date.year for date in pd.date_range(start='1900', end='1930', freq='YS')], rotation=45)

plt.show()
/var/folders/06/y6vmvyfj0wg08vb3rszcfy080000gn/T/ipykernel_97692/3665439903.py:14: UserWarning:

set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.

Make interactive visualizations with plotly

Code
import plotly.express as px

# Create the interactive line plot with Plotly
fig = px.line(aa_gender_by_year, 
              x='year', 
              y='count', 
              color='gender (if known)',
              title='Author Gender by Year')
fig.show()
Code
fig = px.bar(aa_gender_by_year, 
             x='year', 
             y='count', 
             color='gender (if known)',
             title='Author Gender by Year')


fig.show()