Source code for yellowbrick.text.postag

# yellowbrick.text.postag
# Implementation of part-of-speech visualization for text.
#
# Author:   Rebecca Bilbro
# Created:  Sun Mar 5 18:07:06 2017 -0500
#
# Copyright (C) 2017 The scikit-yb developers
# For license information, see LICENSE.txt
#
# ID: postag.py [849f5a8] rebecca.bilbro@bytecubed.com $

"""
Implementation of part-of-speech visualization for text,
enabling the user to visualize a single document or
small subset of documents.
"""

##########################################################################
# Imports
##########################################################################

import numpy as np
import importlib

from yellowbrick.draw import bar_stack
from yellowbrick.text.base import TextVisualizer
from yellowbrick.style.colors import resolve_colors
from yellowbrick.exceptions import YellowbrickValueError

##########################################################################
# Part-of-speech tag punctuation and labels
##########################################################################

# NOTE: Penn Treebank converts all sentence closers (!,?,;) to periods
PUNCT_TAGS = [".", ":", ",", "``", "''", "(", ")", "#", "$"]
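# NOTE: tags in PUNCT_TAGS are collapsed into the single "punctuation" bucket
# when counting Penn Treebank tags (see _handle_treebank below).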

TAGSET_NAMES = {"penn_treebank": "Penn Treebank", "universal": "Universal Dependencies"}

PENN_TAGS = [
    "noun",
    "verb",
    "adjective",
    "adverb",
    "preposition",
    "determiner",
    "pronoun",
    "conjunction",
    "infinitive",
    "wh- word",
    "modal",
    "possessive",
    "existential",
    "punctuation",
    "digit",
    "non-English",
    "interjection",
    "list",
    "symbol",
    "other",
]

UNIVERSAL_TAGS = [
    "noun",
    "verb",
    "adjective",
    "adverb",
    "adposition",
    "determiner",
    "pronoun",
    "conjunction",
    "infinitive",
    "punctuation",
    "number",
    "interjection",
    "symbol",
    "other",
]
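
# NOTE: the visualizer expects a pre-tagged corpus shaped as a list of
# documents, each a list of sentences, each a list of (token, tag) tuples.
# A minimal (hypothetical) Penn Treebank example:
#
#     [                                     # corpus
#         [                                 # document
#             [("The", "DT"), ("cat", "NN"), ("sat", "VBD"), (".", ".")],
#         ],
#     ]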


##########################################################################
# PosTagVisualizer
##########################################################################


class PosTagVisualizer(TextVisualizer):
    """
    Parts of speech (e.g. verbs, nouns, prepositions, adjectives) indicate
    how a word is functioning within the context of a sentence. In English,
    as in many other languages, a single word can function in multiple ways.
    Part-of-speech tagging lets us encode information not only about a word's
    definition, but also its use in context (for example, the words "ship"
    and "shop" can be either a verb or a noun, depending on the context).

    The PosTagVisualizer creates a bar chart to visualize the relative
    proportions of different parts-of-speech in a corpus.

    Note that the PosTagVisualizer requires documents to already be
    part-of-speech tagged; the visualizer expects the corpus to come in the
    form of a list of (document) lists of (sentence) lists of (token, tag)
    tuples.

    Parameters
    ----------
    ax : matplotlib axes
        The axes to plot the figure on.

    tagset : string
        The tagset that was used to perform part-of-speech tagging. Either
        "penn_treebank" or "universal", defaults to "penn_treebank". Use
        "universal" if the corpus has been tagged using SpaCy.

    colors : list or tuple of strings
        Specify the colors for each individual part-of-speech. Overrides
        colormap if both are provided.

    colormap : string or matplotlib cmap
        Specify a colormap to color the parts-of-speech.

    frequency : bool, default: False
        If set to True, part-of-speech tags will be plotted according to
        frequency, from most to least frequent.

    stack : bool, default: False
        Plot the PosTag frequency chart as a per-class stacked bar chart.
        Note that fit() requires y for this visualization.

    parser : string or None, default: None
        If set to a string, it must be of the form 'parser_tagger' or simply
        'parser' to use the defaults (for spacy this is 'en_core_web_sm', for
        nltk this is 'word'). The 'parser' component names one of the
        accepted parsing libraries; currently 'nltk' and 'spacy' are the only
        accepted libraries, and NLTK or SpaCy must be installed in your
        environment. The 'tagger' component is the tagger to use. For
        example, 'nltk_wordpunct' uses the NLTK library with the 'wordpunct'
        tokenizer, and 'spacy_en_core_web_sm' uses SpaCy with the
        'en_core_web_sm' model.

    kwargs : dict
        Pass any additional keyword arguments to the PosTagVisualizer.

    Attributes
    ----------
    pos_tag_counts_ : dict
        Mapping of part-of-speech tags to counts.

    Examples
    --------
    >>> viz = PosTagVisualizer()
    >>> viz.fit(X)
    >>> viz.show()
    """

    def __init__(
        self,
        ax=None,
        tagset="penn_treebank",
        colormap=None,
        colors=None,
        frequency=False,
        stack=False,
        parser=None,
        **kwargs,
    ):
        super(PosTagVisualizer, self).__init__(ax=ax, **kwargs)

        self.tagset_names = TAGSET_NAMES

        if tagset not in self.tagset_names:
            raise YellowbrickValueError(
                "'{}' is an invalid tagset. Please choose one of {}.".format(
                    tagset, ", ".join(self.tagset_names.keys())
                )
            )
        self.tagset = tagset

        self.punct_tags = frozenset(PUNCT_TAGS)
        self.frequency = frequency
        self.colormap = colormap
        self.colors = colors
        self.stack = stack
        self.parser = parser

    @property
    def parser(self):
        return self._parser

    @parser.setter
    def parser(self, parser):
        accepted_parsers = ("nltk", "spacy")
        if not parser:
            self._parser = None
        elif parser.startswith(accepted_parsers):
            # split on the first occurrence of "_"
            parser_tagger = parser.split("_", 1)

            parser_name = parser_tagger[0]
            # a tagger is present only if the full 'parser_tagger' form is used
            tagger_name = parser_tagger[1] if len(parser_tagger) == 2 else None

            try:
                importlib.import_module(parser_name)
            except ModuleNotFoundError:
                raise ModuleNotFoundError(
                    "Can't find module '{}' in this environment.".format(parser)
                )

            if parser_name == "nltk":
                nltk = importlib.import_module("nltk")
                try:
                    nltk.data.find("corpora/treebank")
                except LookupError:
                    raise LookupError(
                        "Error occurred because nltk postag data is not available"
                    )

                nltk_taggers = ["word", "wordpunct"]
                if not tagger_name:
                    tagger_name = "word"
                parser = parser_name + "_" + tagger_name
                if tagger_name not in nltk_taggers:
                    raise ValueError(
                        "If using NLTK, tagger should either be 'word'"
                        " (default) or 'wordpunct'."
                    )

            elif parser_name == "spacy":
                if not tagger_name:
                    tagger_name = "en_core_web_sm"
                parser = parser_name + "_" + tagger_name
                try:
                    spacy = importlib.import_module("spacy")
                    spacy.load(tagger_name)
                except OSError:
                    raise OSError(
                        "SpaCy model '{}' has not been downloaded into this"
                        " environment.".format(tagger_name)
                    )

            self._parser = parser
        else:
            raise ValueError(
                "{} is an invalid parser. Currently the supported parsers"
                " are 'nltk' and 'spacy'.".format(parser)
            )
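
    # For reference, strings accepted by the parser setter above include:
    #   "nltk"                  -> defaults to the "word" tokenizer
    #   "nltk_wordpunct"        -> NLTK with wordpunct tokenization
    #   "spacy"                 -> defaults to the "en_core_web_sm" model
    #   "spacy_en_core_web_sm"  -> SpaCy with an explicitly named model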

    def fit(self, X, y=None, **kwargs):
        """
        Fits the corpus to the appropriate tag map. Text documents must be
        tokenized and tagged before being passed to fit if the 'parser'
        argument was not specified at instantiation. Otherwise X can be raw
        text, ready to be parsed.

        Parameters
        ----------
        X : list or generator or str (raw text)
            Should be provided as a list of documents or a generator that
            yields a list of documents that contain a list of sentences that
            contain (token, tag) tuples. If X is a string, the 'parser'
            argument should be specified as 'nltk' or 'spacy' in order to
            parse the raw documents.

        y : ndarray or Series of length n
            An optional array of target values that are ignored by the
            visualizer.

        kwargs : dict
            Pass generic arguments to the drawing method

        Returns
        -------
        self : instance
            Returns the instance of the transformer/visualizer
        """
        self.labels_ = ["documents"]

        if self.parser:
            parser_name = self.parser.split("_", 1)[0]
            parse_func = getattr(self, "parse_{}".format(parser_name))
            X = parse_func(X)

        if self.stack:
            if y is None:
                raise YellowbrickValueError("Specify y for stack=True")
            self.labels_ = np.unique(y)

        if self.tagset == "penn_treebank":
            self.pos_tag_counts_ = self._penn_tag_map()
            self._handle_treebank(X, y)

        elif self.tagset == "universal":
            self.pos_tag_counts_ = self._uni_tag_map()
            self._handle_universal(X, y)

        self.draw()
        return self

    def parse_nltk(self, X):
        """
        Tag a corpus using NLTK tagging (Penn Treebank) to produce a
        generator of tagged documents in the form of a list of (document)
        lists of (sentence) lists of (token, tag) tuples.

        Parameters
        ----------
        X : str (raw text) or list of paragraphs (containing str)
        """
        nltk = importlib.import_module("nltk")
        nltk.data.find("corpora/treebank")

        tagger = self.parser.split("_", 1)[1]
        if tagger == "word":
            for doc in X:
                yield [
                    nltk.pos_tag(nltk.word_tokenize(sent))
                    for sent in nltk.sent_tokenize(doc)
                ]
        elif tagger == "wordpunct":
            for doc in X:
                yield [
                    nltk.pos_tag(nltk.wordpunct_tokenize(sent))
                    for sent in nltk.sent_tokenize(doc)
                ]

    def parse_spacy(self, X):
        """
        Tag a corpus using SpaCy tagging (Universal Dependencies) to produce
        a generator of tagged documents in the form of a list of (document)
        lists of (sentence) lists of (token, tag) tuples.

        Parameters
        ----------
        X : str (raw text) or list of paragraphs (containing str)
        """
        spacy = importlib.import_module("spacy")

        tagger = self.parser.split("_", 1)[1]
        nlp = spacy.load(tagger)
        if isinstance(X, list):
            for doc in X:
                tagged = nlp(doc)
                yield [
                    [(token.text, token.pos_) for token in sent]
                    for sent in tagged.sents
                ]
        elif isinstance(X, str):
            tagged = nlp(X)
            yield [
                [(token.text, token.pos_) for token in sent]
                for sent in tagged.sents
            ]

    def _penn_tag_map(self):
        """
        Returns a Penn Treebank part-of-speech tag map.
        """
        self._pos_tags = PENN_TAGS
        return self._make_tag_map(PENN_TAGS)

    def _uni_tag_map(self):
        """
        Returns a Universal Dependencies part-of-speech tag map.
        """
        self._pos_tags = UNIVERSAL_TAGS
        return self._make_tag_map(UNIVERSAL_TAGS)

    def _make_tag_map(self, tagset):
        """
        Returns a map of labels to zeroed tag counters; labels_ contains the
        single key "documents" unless stack=True, in which case there is one
        counter per class label.
        """
        # ensures each label's dict contains a zero counter per tag
        zeros = [0] * len(tagset)
        return {label: dict(zip(tagset, zeros)) for label in self.labels_}

    def _handle_universal(self, X, y=None):
        """
        Scan through the corpus to compute counts of each Universal
        Dependencies part-of-speech.

        Parameters
        ----------
        X : list or generator
            Should be provided as a list of documents or a generator that
            yields a list of documents that contain a list of sentences that
            contain (token, tag) tuples.
        """
        jump = {
            # combine proper and regular nouns
            "NOUN": "noun",
            "PROPN": "noun",
            "ADJ": "adjective",
            "VERB": "verb",
            # include particles with adverbs
            "ADV": "adverb",
            "PART": "adverb",
            "ADP": "adposition",
            "PRON": "pronoun",
            "CCONJ": "conjunction",
            "PUNCT": "punctuation",
            "DET": "determiner",
            "NUM": "number",
            "INTJ": "interjection",
            "SYM": "symbol",
        }

        for idx, tagged_doc in enumerate(X):
            for tagged_sent in tagged_doc:
                for _, tag in tagged_sent:
                    if tag == "SPACE":
                        continue
                    if self.stack:
                        counter = self.pos_tag_counts_[y[idx]]
                    else:
                        counter = self.pos_tag_counts_["documents"]
                    counter[jump.get(tag, "other")] += 1

    def _handle_treebank(self, X, y=None):
        """
        Create a part-of-speech tag mapping using the Penn Treebank tags.

        Parameters
        ----------
        X : list or generator
            Should be provided as a list of documents or a generator that
            yields a list of documents that contain a list of sentences that
            contain (token, tag) tuples.
        """
        for idx, tagged_doc in enumerate(X):
            for tagged_sent in tagged_doc:
                for _, tag in tagged_sent:
                    if self.stack:
                        counter = self.pos_tag_counts_[y[idx]]
                    else:
                        counter = self.pos_tag_counts_["documents"]

                    if tag.startswith("N"):
                        counter["noun"] += 1
                    elif tag.startswith("J"):
                        counter["adjective"] += 1
                    elif tag.startswith("V"):
                        counter["verb"] += 1
                    # include particles with adverbs
                    elif tag.startswith("RB") or tag == "RP":
                        counter["adverb"] += 1
                    elif tag.startswith("PR"):
                        counter["pronoun"] += 1
                    elif tag.startswith("W"):
                        counter["wh- word"] += 1
                    elif tag == "CC":
                        counter["conjunction"] += 1
                    elif tag == "CD":
                        counter["digit"] += 1
                    # combine predeterminer and determiner
                    elif tag in ("DT", "PDT"):
                        counter["determiner"] += 1
                    elif tag == "EX":
                        counter["existential"] += 1
                    elif tag == "FW":
                        counter["non-English"] += 1
                    elif tag == "IN":
                        counter["preposition"] += 1
                    elif tag == "POS":
                        counter["possessive"] += 1
                    elif tag == "LS":
                        counter["list"] += 1
                    elif tag == "MD":
                        counter["modal"] += 1
                    elif tag in self.punct_tags:
                        counter["punctuation"] += 1
                    elif tag == "TO":
                        counter["infinitive"] += 1
                    elif tag == "UH":
                        counter["interjection"] += 1
                    elif tag == "SYM":
                        counter["symbol"] += 1
                    else:
                        counter["other"] += 1
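
    # For reference, after fit() the pos_tag_counts_ attribute maps each
    # label to a tag counter, e.g. (hypothetical counts, stack=False):
    #
    #     {"documents": {"noun": 12, "verb": 7, ..., "other": 0}}
    #
    # With stack=True there is one such counter per class label in y.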

    def draw(self, **kwargs):
        """
        Called from the fit method, this method creates the canvas and draws
        the part-of-speech tag mapping as a bar chart.

        Parameters
        ----------
        kwargs : dict
            generic keyword arguments.

        Returns
        -------
        ax : matplotlib axes
            Axes on which the PosTagVisualizer was drawn.
        """
        # convert the nested dict of counters into a nested list of counts
        pos_tag_counts = np.array(
            [list(i.values()) for i in self.pos_tag_counts_.values()]
        )
        # column-wise sum of the counts across labels
        pos_tag_sum = np.sum(pos_tag_counts, axis=0)

        if self.frequency:
            # sort the tags and counts from most to least frequent
            idx = pos_tag_sum.argsort()[::-1]
            self._pos_tags = np.array(self._pos_tags)[idx]
            pos_tag_counts = pos_tag_counts[:, idx]

        if self.stack:
            bar_stack(
                pos_tag_counts,
                ax=self.ax,
                labels=list(self.labels_),
                ticks=self._pos_tags,
                colors=self.colors,
                colormap=self.colormap,
            )
        else:
            xidx = np.arange(len(self._pos_tags))
            colors = resolve_colors(
                n_colors=len(self._pos_tags),
                colormap=self.colormap,
                colors=self.colors,
            )
            self.ax.bar(xidx, pos_tag_counts[0], color=colors)

        return self.ax

    def finalize(self, **kwargs):
        """
        Finalize the plot with ticks, labels, and title.

        Parameters
        ----------
        kwargs : dict
            generic keyword arguments.
        """
        # NOTE: tokens are not deduplicated, so this is a total count, not unique
        self.ax.set_ylabel("Count")

        if self.frequency:
            self.ax.set_xlabel(
                "{} part-of-speech tags, sorted by frequency".format(
                    self.tagset_names[self.tagset]
                )
            )
        else:
            self.ax.set_xlabel(
                "{} part-of-speech tags".format(self.tagset_names[self.tagset])
            )

        # the bar_stack helper sets the ticks when stack=True
        if not self.stack:
            self.ax.set_xticks(range(len(self._pos_tags)))
            self.ax.set_xticklabels(self._pos_tags, rotation=90)

        self.set_title(
            "PosTag plot for {}-token corpus".format(
                sum(sum(i.values()) for i in self.pos_tag_counts_.values())
            )
        )

        # call tight_layout to maximize readability
        self.fig.tight_layout()

    def show(self, outpath=None, **kwargs):
        # when saving to disk, default bbox_inches to "tight" so the rotated
        # tick labels are not clipped
        if outpath is not None:
            kwargs["bbox_inches"] = kwargs.get("bbox_inches", "tight")
        return super(PosTagVisualizer, self).show(outpath, **kwargs)


##########################################################################
# Quick Method
##########################################################################


def postag(
    X,
    y=None,
    ax=None,
    tagset="penn_treebank",
    colormap=None,
    colors=None,
    frequency=False,
    stack=False,
    parser=None,
    show=True,
    **kwargs,
):
    """
    Display a bar chart with the counts of different parts of speech in X,
    a part-of-speech-tagged corpus, which the visualizer expects to be a
    list of lists of lists of (token, tag) tuples.

    Parameters
    ----------
    X : list or generator
        Should be provided as a list of documents or a generator that yields
        a list of documents that contain a list of sentences that contain
        (token, tag) tuples.

    y : ndarray or Series of length n
        An optional array of target values that are ignored by the
        visualizer.

    ax : matplotlib axes
        The axes to plot the figure on.

    tagset : string
        The tagset that was used to perform part-of-speech tagging. Either
        "penn_treebank" or "universal", defaults to "penn_treebank". Use
        "universal" if the corpus has been tagged using SpaCy.

    colors : list or tuple of colors
        Specify the colors for each individual part-of-speech.

    colormap : string or matplotlib cmap
        Specify a colormap to color the parts-of-speech.

    frequency : bool, default: False
        If set to True, part-of-speech tags will be plotted according to
        frequency, from most to least frequent.

    stack : bool, default: False
        Plot the PosTag frequency chart as a per-class stacked bar chart.
        Note that fit() requires y for this visualization.

    parser : string or None, default: None
        If set to a string, it must be of the form 'parser_tagger' or simply
        'parser' to use the defaults (for spacy this is 'en_core_web_sm', for
        nltk this is 'word'). The 'parser' component names one of the
        accepted parsing libraries; currently 'nltk' and 'spacy' are the only
        accepted libraries, and NLTK or SpaCy must be installed in your
        environment. The 'tagger' component is the tagger to use. For
        example, 'nltk_wordpunct' uses the NLTK library with the 'wordpunct'
        tokenizer, and 'spacy_en_core_web_sm' uses SpaCy with the
        'en_core_web_sm' model.

    show : bool, default: True
        If True, calls ``show()``, which in turn calls ``plt.show()``;
        however, you cannot call ``plt.savefig`` from this signature, nor
        ``clear_figure``. If False, simply calls ``finalize()``.

    kwargs : dict
        Pass any additional keyword arguments to the PosTagVisualizer.

    Returns
    -------
    visualizer : PosTagVisualizer
        Returns the fitted, finalized visualizer.
    """
    # Instantiate the visualizer
    visualizer = PosTagVisualizer(
        ax=ax,
        tagset=tagset,
        colors=colors,
        colormap=colormap,
        frequency=frequency,
        stack=stack,
        parser=parser,
        **kwargs,
    )

    # Fit the visualizer (fit calls draw)
    visualizer.fit(X, y=y, **kwargs)

    if show:
        visualizer.show()
    else:
        visualizer.finalize()

    # Return the visualizer object
    return visualizer
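

if __name__ == "__main__":
    # A minimal usage sketch, not part of the library: it assumes NLTK is
    # installed along with its "punkt" and "averaged_perceptron_tagger" data,
    # and the tiny corpus below is hypothetical.
    import nltk

    corpus = ["The quick brown fox jumps over the lazy dog."]
    # pre-tag the corpus into the expected list-of-lists-of-tuples shape
    tagged = [
        [nltk.pos_tag(nltk.word_tokenize(sent)) for sent in nltk.sent_tokenize(doc)]
        for doc in corpus
    ]
    viz = PosTagVisualizer()
    viz.fit(tagged)
    viz.show()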