Understanding Statistical Modeling

In this notebook, I lay out some of the basic principles behind a set of techniques that have taken off in digital humanities, data science, and a wide variety of related fields. These techniques are usually named by a set of umbrella terms---statistical modeling, machine learning, even "artificial intelligence." In a nutshell, these approaches take a large dataset and attempt to determine values or categories from that data. I'll show a few of the most basic versions of this approach and gesture to more complex methods.

Modeling, writ large, has become a massive field in data science. Ted Underwood uses models to understand literary history in Distant Horizons, the OCR tool Ocular uses statistical models of both typefaces and language, and, of course, topic modeling was one of the earliest types of modeling to be widely adopted by humanities scholars. But statistical models are also often what is underneath the vague talk of "algorithms" or "machine learning" that make up so much discussion of contemporary technology. Models help to deliver search results, determine what social media posts you see, and even try to predict who will be elected president. Understanding how statistical modeling works can help you to critique the technologies that increasingly determine so much of our lives, and it can help you to better understand the recent achievements of statistical models in the humanities (and even use these techniques in your own work).

This is a Jupyter Notebook, which allows me to show code and the results of that code together with written text. My purpose here is not to teach you how to code but to demonstrate how this work can be done with Python while providing explanations of how statistical modeling is done from beginning to end. In Python, you begin by importing libraries and functions that you'll need to write your script. Most of the functions and methods for this process come from the very detailed Python machine learning library scikit-learn.

Reading a Text Corpus

You can create a statistical model from any kind of data, but in this exercise we'll use data derived from a set of text files, a corpus of literary works. Here we're borrowing techniques from the field of information retrieval. Many machine learning tutorials begin with data already prepared, but it's more helpful to see how this works from the very beginning. What happens when you have a bunch of text files and want to build a model based on their wordcounts?

In this example, we're using the complete works of Shakespeare from the Folger Shakespeare Library. This corpus is commonly used because the texts are readily available, and there's a ready-made set of genre labels to use as examples. However, I'm very aware of the problems inherent in repeatedly using Shakespeare as the main example in DH projects. Not only have I written on this topic myself, but I also endorse Laura Estill's assessment of "Digital Humanities' Shakespeare Problem." Still, for the purposes of pedagogy rather than a public-facing study, the Shakespeare corpus gives us a stable set of texts that we can talk about quickly. As you look through the examples, it's worth thinking about what a non-Shakespeare example might look like and what using Shakespeare as our example does to our results.

We begin by loading in a set of plain text files. I keep these in a directory named shakespeare, and each file's name is the title of a play. (The Folger corpus includes Shakespeare's sonnets and other poetry, but I've not included those files in this example.) As you can see when we print the first part of the first file, these are just plaintext files of the plays without any adjustment. I haven't even removed the copyright information, because, as we'll see, small differences in wording are less important to the model than big trends, and some differences are smoothed out by our choice of method.
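Here's a minimal sketch of that loading step, assuming a directory named shakespeare containing one .txt file per play:

```python
from pathlib import Path

# Read every .txt file in the shakespeare directory into a list of strings.
# (The directory name and file layout here follow the description above.)
corpus_dir = Path("shakespeare")
titles = []
documents = []
for filepath in sorted(corpus_dir.glob("*.txt")):
    titles.append(filepath.stem)                       # file name (minus .txt) as the play's title
    documents.append(filepath.read_text(encoding="utf-8"))

# Preview the first part of the first file.
print(documents[0][:500])
```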

Getting Word Counts

The next step is to count every word in every play. The sklearn method CountVectorizer() lets us do that easily. It even includes a predefined list of "stop words": very common English words like articles and pronouns that we often want to avoid. [n.b. For some purposes, as in authorship attribution, the most common words are the most informative and shouldn't be left out.]
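A rough sketch of that counting step, using the documents list from above:

```python
from sklearn.feature_extraction.text import CountVectorizer

# Count every word in every play, dropping scikit-learn's built-in English stop words.
vectorizer = CountVectorizer(stop_words="english")
counts = vectorizer.fit_transform(documents)   # sparse matrix: one row per play, one column per word
```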

Once CountVectorizer() does its work, the output (which I've cleaned up using pandas) is a matrix or table of results. In this matrix, each row is a sample, a distinct object that we want to study, in this case a play. Each column is a feature, one of the attributes of the samples that we care about, in this case a word. The value of every cell in the matrix tells us the number of times that word appears in that text. This format, of a feature space or matrix of information, is fundamental to how machine learning works. For more on matrices and feature spaces, refer to my Programming Historian tutorial on text similarity.
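The pandas cleanup might look something like this (get_feature_names_out() assumes a recent version of scikit-learn; older versions use get_feature_names()):

```python
import pandas as pd

# Put the raw counts into a labeled table: rows are plays (samples), columns are words (features).
count_df = pd.DataFrame(
    counts.toarray(),
    index=titles,
    columns=vectorizer.get_feature_names_out(),
)
count_df.head()
```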

Features are also described as dimensions. Each column of our matrix could be graphed as the x- or y-axis of a Cartesian graph, and we could place our plays as datapoints on that graph based on how often they contain particular words. Here's an example (from the tutorial I linked to above) that graphs the first sentences of Edith Wharton's Ethan Frome and Jane Austen's Pride and Prejudice based on how often they use the article "a" and the preposition "in":

In this way, every single feature of a sample can be expressed as a different dimension. In our case (and in most cases), we have thousands of dimensions: far more than the three that can be easily visualized. But matrices essentially follow the same principles as two-dimensional graphs like the one above, just in many more dimensions.

However, we don't want to work with raw word counts! What if one play is much longer than another? Then the fact that the word "yes" appears more often in the longer play wouldn't tell us much about the differences between the plays except their length. To deal with this, we want to work with relative values, and we can create these through a process called normalization. We could simply take relative frequencies by dividing the raw word counts by the total number of words in a text. But even better is to use L2 normalization, which makes it so that the squares of all the values for a sample add up to 1. This is the most common kind of normalization used in information retrieval for texts, and it helps to deal with the problem of magnitude (when results are skewed by one text being much longer than another).

Now let's create a new matrix with normalized values:
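A sketch of that normalization step, using scikit-learn's normalize() function on the counts matrix from above:

```python
from sklearn.preprocessing import normalize

# L2-normalize each row (each play) so that the squares of its values sum to 1.
# This keeps longer plays from dominating simply because they contain more words.
normalized = normalize(counts, norm="l2")

# The same labeled table as before, but with normalized values.
normalized_df = pd.DataFrame(
    normalized.toarray(),
    index=titles,
    columns=vectorizer.get_feature_names_out(),
)
normalized_df.head()
```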

What does this matrix look like when graphed? We can't graph every feature, but we can show how samples are distributed across two features at a time. Below is a graph of how often each play uses the terms "right" and "wrong."
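A rough sketch of how such a graph might be produced with matplotlib, using the normalized table from above:

```python
import matplotlib.pyplot as plt

# Plot each play by its normalized frequencies for "right" and "wrong."
x = normalized_df["right"]
y = normalized_df["wrong"]

fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(x, y)
for title in normalized_df.index:
    ax.annotate(title, (x[title], y[title]), fontsize=8)   # label each point with the play's title
ax.set_xlabel('Normalized frequency of "right"')
ax.set_ylabel('Normalized frequency of "wrong"')
plt.show()
```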

One important note: from here forward, nothing that we're doing is dependent on words as features. The features could be anything at all, and the process would be the same. Which is to say, this isn't a niche tool for literary scholars to explore diction; it's a general tool for exploring the features of a data set.

Or without labels, to make it more readable:

Looking at the texts in terms of just two words, "right" and "wrong," we can see that there's clearly some difference in how plays deploy certain vocabulary. (King John has a lot of "right," while Julius Caesar has a lot of "wrong.") But we can't really tell from these two words alone if plays are falling into distinct groups.

Statistical modeling gives us a set of methods that can look at all dimensions or features at once and make a determination about categories or patterns in the data. In this case, we might use models to see if we can correctly place plays into their genres based on word counts.

Unsupervised Clustering

Now that we have our matrix of word counts, there are many, many methods and techniques we could apply to categorize or classify our texts. There are two general types of modeling I will introduce: unsupervised methods and supervised ones. Supervised methods are so called because the investigator provides some labels to the data about which samples belong in which categories, and based on those examples---the training data---the computer tries to determine to which categories the unlabeled examples belong.

The supervised methods are what's most often meant by "machine learning" (because the machine "learns" based on the training data). But there is also a set of unsupervised methods that try to find categories in data without knowing about those categories in advance. We'll work with one such clustering method, K-Means Clustering. The k-means method attempts to find categories in data based on how close the samples are to one another in Cartesian space, i.e. in the graph above but across thousands of dimensions.

The k in k-means stands for any number: we need to tell the computer how many clusters we think it should find. We know that Shakespeare plays can be sorted into roughly four categories---comedy, tragedy, history, and tragicomedy or romance---so we'll set n_clusters to 4.
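Here's a minimal sketch of the clustering step (the random_state value is just an assumption to make the sketch repeatable):

```python
from sklearn.cluster import KMeans

# Ask k-means to find four clusters in the normalized word-count matrix.
kmeans = KMeans(n_clusters=4, random_state=42)
cluster_labels = kmeans.fit_predict(normalized)

# Pair each play with the cluster it was assigned to, grouped by cluster.
for title, cluster in sorted(zip(titles, cluster_labels), key=lambda pair: pair[1]):
    print(cluster, title)
```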

How did our unsupervised clustering method do? I think we might categorize this effort as good but not great. Most of the histories are together in one cluster, and a lot of the big comedies are in another cluster. The other two clusters are a bit of a mishmash. Most of the tragedies are together, but they're mixed in with other comedies, and there's no distinct cluster for romances/tragicomedies.

The next question is: would the computer do a better job if we tried to teach it about the genres we already know? To answer that, we'll need a supervised classification method.

Targets

Before we can begin using a supervised method, we need to add one more piece of data to our matrix. In addition to samples and features, matrices can also have targets, that is, labels for the categories that particular samples fall into. These targets can be used to train a model by telling it what to expect.

In this case, we need targets that label the genre of a play. Rather than assigning genre labels myself, I pulled targets from the Visualizing English Print (VEP) metadata. This data splits the plays into four categories: comedies (CO), tragedies (TR), histories (HI), and tragicomedies (TC). In the code below, I read in the VEP metadata and create a list of labels that matches the order of our list of plays.
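A sketch of what that might look like; the file name and column names below ("vep_metadata.csv", "title", "genre") are placeholders rather than the actual layout of the VEP metadata:

```python
import pandas as pd

# Read the genre metadata and build a lookup from play title to genre code.
metadata = pd.read_csv("vep_metadata.csv")          # hypothetical file name
genre_lookup = dict(zip(metadata["title"], metadata["genre"]))

# Build a list of genre labels (CO, TR, HI, TC) in the same order as our list of plays.
genres = [genre_lookup[title] for title in titles]
```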

Supervised Classification

Now that we have target labels, we can use them to train a supervised model to determine genre categories. Unlike with K-means clustering, where we simply create a model and plug in the entire dataset, we need to split our data into a training set, which we use to help our model learn, and a test set, which we use to see how the model did. In our case, we'll split our data approximately in half, using just over half of the plays for training and reserving the rest for testing.

We need to split both the feature set (denoted by a capital X) and the target labels (denoted by a lowercase y). Luckily, scikit-learn does all of this for us with its train_test_split() function.
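A sketch of that split (the test_size and random_state values are assumptions; the goal, as described above, is an approximately even split):

```python
from sklearn.model_selection import train_test_split

# Split the features (X), the targets (y), and the play titles into training and test sets.
X_train, X_test, y_train, y_test, titles_train, titles_test = train_test_split(
    normalized, genres, titles, test_size=0.5, random_state=42
)
```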

Once the data is split, we can choose a model to train. In this case, the method I've chosen is logistic regression, the same method that Ted Underwood uses in Distant Horizons. Logistic regression is quite an old method for classification, and it is useful in part because it is easy to explain and provides results that (as we shall see) are easier to interpret than those of newer methods like neural networks. Logistic regression essentially uses a logistic function to draw an s-shaped sigmoid curve that takes any value and converts it to a value between 0 and 1. The closer the value is to 0 or 1, the more closely the sample belongs in one category or the other. Because of this either-or structure, logistic regression was originally only a binary classifier: it could only tell if something was in one of just two categories. However, we can use multiclass logistic regression, and the model will predict all four of our classes at once.

In the code below, I split the data automatically, create a logistic regression model, and "fit" that model using the training data. Then, I run the model to predict categories for the texts in the test set. In the end, I can get accuracy scores, as well as a list of the plays in the test set with their real and predicted genres.
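Here's a rough sketch of what that sequence might look like:

```python
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fit a multiclass logistic regression model on the training data.
# (max_iter is raised here simply to help the solver converge.)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict genres for the held-out test set and score the predictions.
predictions = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, predictions))

# List each test-set play with its real and predicted genre.
for title, real, predicted in zip(titles_test, y_test, predictions):
    print(f"{title}: real={real}, predicted={predicted}")
```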

The results above show us a few things about how the logistic regression model did. First, the accuracy score shows the percentage of texts in the test set that were labeled correctly in this run of the model. 72% accuracy is not too bad!

However, this is only the accuracy result for the data split in just one way and run just once. How does the model do if we split up the data differently? The cross-validation score answers this question by running the model several times with differently split data. The average of the accuracy of those runs gives us a sense of how well the model does no matter how the data is split. The cross-validation scores show that this particular run of the model is fairly close to the expected result.
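A sketch of cross-validation with scikit-learn (the number of folds here is an assumption):

```python
from sklearn.model_selection import cross_val_score

# Run logistic regression on five different splits of the full dataset
# and average the accuracy scores.
scores = cross_val_score(LogisticRegression(max_iter=1000), normalized, genres, cv=5)
print(scores)
print("Mean accuracy:", scores.mean())
```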

And the results themselves show that the model got things mostly right. What's more, some of the things it got wrong were interestingly wrong. Henry IV Parts 1 & 2, history plays with long comic Falstaff scenes, were "incorrectly" labeled as comedies. Romeo and Juliet, a play that has a comedic structure up until its final tragic twist, was also labeled a comedy. The Tempest, one of Shakespeare's later romances that has both comic and tragic scenes, was listed as a comedy in our original data but labeled a tragedy by the model.

We can also assess a model's accuracy using a confusion matrix, a chart that shows how often predicted values matched expected values. In the code below I generate the confusion matrix for our model:
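A sketch of generating and plotting the confusion matrix with scikit-learn:

```python
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Compare expected genres (rows) against predicted genres (columns).
genre_codes = ["CO", "HI", "TC", "TR"]
cm = confusion_matrix(y_test, predictions, labels=genre_codes)
ConfusionMatrixDisplay(cm, display_labels=genre_codes).plot()
plt.show()
```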

In this confusion matrix, predicted values (the output of the model) are shown horizontally and expected values (what was in the original data) are shown vertically. When the two match, we can see the number of true positives, places where the predicted value matched the expected value. Those are the darkest squares along the diagonal from top left to bottom right. Our model got comedies right 6 times, histories right 3 times, and tragedies right 4 times. Not bad considering there were only 18 plays total in the test set!

The confusion matrix also shows false results---places where the predicted value did not match the expected value. Like we saw in the list above, the model miscategorized two histories as comedies and one tragicomedy as a tragedy, for example.

Interpreting Results

So we know the logistic regression model performed reasonably well. It identifies the correct genre of a play in most cases, and when it doesn't there's usually an interesting (literary) reason why. Now that we have the model, we could plug in a new Shakespeare play, and the model could tell us its genre!

Wait a second---there are no new Shakespeare plays (he's been dead for some time), and a literary scholar can simply read a play to determine its genre. The truth is, we don't need a model to tell us a play's genre, especially for Shakespeare's plays, whose genres have been a matter of scholarly interest and debate for centuries. So if we don't need a model to predict the genre, then what is the model for?

For Facebook or FiveThirtyEight, models might be useful because of what they predict. In fact, large companies like Google often don't care how they arrived at a classifier, so long as they can use that classifier to serve ads or do whatever they're trying to do. But digital humanities scholars don't usually need computers to perform tasks for them; instead, they use the computer to help them understand something better.

Way back at the beginning of this tutorial, we started by counting the words in each play. We did that because of an instinct that the vocabulary of the play is related to its genre. And based on the relative success of our model, there does seem to be such a relationship. But what's the nature of that relationship? What do wordcounts have to do with genre?

This gets back to why we used logistic regression over other approaches. As a model, it's interpretable because it provides coefficients for each feature. That is, it doesn't just tell us the answer: logistic regression tells us how it arrived at the answer. For each possible genre class, the model gives us a coefficient for every feature, every word. The coefficient is either a positive or a negative number. If the coefficient is a large positive number, that means the word is a strong positive indicator for that class: if the word appears a lot in a play, the model will be more likely to assign the play to that genre. If the coefficient is a large negative number, that means the word is a strong negative indicator: if it appears a lot in a play, the model will be more likely not to assign the play to that genre.

In the code below, I show the top 5 highest and lowest coefficients for each class. This gives us a sense of what words drove the model's genre predictions.
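Here's a rough sketch of what that code might look like, using the coef_ and classes_ attributes of the fitted model:

```python
import numpy as np

# For each genre class, show the five words with the most positive
# and the five words with the most negative coefficients.
feature_names = np.array(vectorizer.get_feature_names_out())
for genre, coefficients in zip(model.classes_, model.coef_):
    top_positive = feature_names[np.argsort(coefficients)[-5:][::-1]]
    top_negative = feature_names[np.argsort(coefficients)[:5]]
    print(genre, "positive indicators:", ", ".join(top_positive))
    print(genre, "negative indicators:", ", ".join(top_negative))
```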

If you're familiar with the genres of early modern drama, this is a pretty exciting list. "Love" is the strongest indicator for comedy, a genre almost always organized around marriage. Words for high social ranks---"king," "queen," "duke," even "god"---are strongly indicative of the monarch-obsessed history plays. Not everything is perfectly clear. The positive indicators for tragedy aren't thematic or character-based, but the negative indicators for tragedy match the positive ones for comedy and tragicomedy. (Perhaps, for our model, a tragedy is a thing that's not a comedy.) But overall, this list conforms with some scholarly assumptions about certain Shakespearean genres, and it invites the curious scholar into more close reading, e.g. what is it about the word "let" that indicates tragedy so well?

For a humanist scholar, models are less about prediction and more about the features themselves. A little knowledge about how various models work can go a long way. Using statistical modeling, we can begin to determine the relationship between certain sets of features and different historical categories and classifications. Beyond word counts, modeling allows scholars to explore all sorts of features and can be a powerful tool for framing questions for future research.