In [1]:
from typing import List, Tuple
from collections import Counter
import re
from typing import Callable

import numpy as np

def tokenize(text: str) -> List[str]:
    return re.findall(r"\w+", text)

class BagOfWords:
    def __init__(self, max_words: int = None, tokenizer: Callable[[str], List[str]]=tokenize):
        self.max_words = max_words
        self.tokenizer = tokenizer
    
    def encode(self, texts: List[str]) -> Tuple[np.ndarray, np.ndarray]:
        # Get vocab
        big_doc = " ".join(texts)
        all_freqs = self._tokenize_and_count(big_doc)
        self.vocab_to_idx_ = {
            token: idx
            for idx, (token, _) in enumerate(all_freqs.most_common(self.max_words))
        }
        vocab = np.array(list(self.vocab_to_idx_.keys()))

        # Encode texts
        dtm = np.zeros((len(texts), len(vocab)))
        for text_index in range(len(texts)):
            text_freqs = self._tokenize_and_count(texts[text_index])
            for token, freq in text_freqs.items():
                dtm_idx = self.vocab_to_idx_.get(token, None)
                if dtm_idx is not None:
                    dtm[text_index, dtm_idx] = freq
        return dtm, vocab

    def _tokenize_and_count(self, text) -> Counter:
        tokens = self.tokenizer(text)
        freqs = Counter(tokens)
        return freqs

In [2]:
from pathlib import Path
text_files = list(Path("Romane").glob('*.txt'))
print(len(text_files))
texts = [file.read_text() for file in text_files]

76


In [11]:
bow = BagOfWords(max_words=None)
dtm, types = bow.encode(texts)
print(dtm.shape, types.shape)
dtm


(76, 257802) (257802,)


array([[1.946e+03, 1.852e+03, 1.503e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [1.266e+03, 6.050e+02, 4.770e+02, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [5.444e+03, 3.323e+03, 2.429e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       ...,
       [2.718e+03, 2.209e+03, 1.799e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [2.876e+03, 2.149e+03, 1.940e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [3.752e+03, 2.877e+03, 2.964e+03, ..., 1.000e+00, 1.000e+00,
        1.000e+00]])

In [4]:
# Find the most common word in the collection
types[dtm.sum(axis=0).argmax()]

'und'

In [5]:
# Find the longest and the shortest text.
print("Longest text", text_files[dtm.sum(axis=1).argmax()])
print("Shortest text", text_files[dtm.sum(axis=1).argmin()])

Longest text Romane/Gutzkow,-Karl_Der Zauberer von Rom.txt
Shortest text Romane/Fischer,-Caroline-Auguste_Gustavs Verirrungen.txt


In [6]:
# Find the text with the smallest vocabulary.
dtm_occ = np.zeros_like(dtm)
dtm_occ[dtm > 0] = 1
print(text_files[dtm_occ.sum(axis=1).argmin()])

Romane/Fischer,-Caroline-Auguste_Gustavs Verirrungen.txt


In [7]:
# Compute relative word frequencies, normalized by the text lengths.
dtm_rel = dtm / dtm.sum(axis=1).reshape(-1, 1)
dtm_rel.sum(axis=1)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1.])

In [8]:
# Find the word whose relative frequency varies the most across the collection.
types[dtm_rel.std(axis=0).argmax()]

'ich'

In [13]:
# Find all words that occur in less than five texts.
word_mask = dtm_occ.sum(axis=0) < 5
less_than_five = types[word_mask]
less_than_five.shape

(207450,)

In [10]:
# Mask all words in each text that occur less often than the mean word frequency of the text.
dtm_masked = dtm_rel.copy()
mask = dtm_rel < dtm_rel.mean(axis=0)
dtm_masked[mask] = 0.0
dtm_masked

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [3.33711153e-02, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [3.45208051e-02, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [0.00000000e+00, 2.38883121e-02, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [3.11194789e-02, 2.32530459e-02, 2.09915817e-02, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 2.36578708e-02, ...,
        7.98173778e-06, 7.98173778e-06, 7.98173778e-06]])