# Exercise stub

In [1]:
from typing import List, Tuple, Union, Dict

# Load one name per line from names.txt, dropping blank / whitespace-only lines.
with open("names.txt", encoding="utf-8") as f:
    # Strip first, then filter: a raw line read from a file always contains at
    # least its newline, so testing the unstripped line never filters anything
    # and blank lines would end up in the list as empty names.
    names = [name for name in (line.strip() for line in f) if name]
print(f"Number of names: {len(names)}")
# Show every 1000th name as a quick sanity check of the loaded data.
print(names[::1000])

class NamesDataset:
    """
    Exercise skeleton for a small collection of names.

    Implement every method so that its return value matches the type
    annotation and its behavior matches the description in its docstring.
    """
    def __init__(self, names: List[str]):
        """
        The constructor must store the given names as an attribute
        (``mean_name_length`` reads them from ``self.names``).
        """

    def query(self, prefix: str) -> List[str]:
        """
        Return a list of all names in the dataset that start with
        the given prefix.
        """

    def longest_name(self, prefix: str = None, return_len: bool = False) -> Union[str, Tuple[str, int]]:
        """
        Return the longest name in the dataset.

        If a prefix is given, then only names starting with that prefix
        are considered.

        If return_len is True, return a tuple of the longest name
        AND its length instead of just the name.
        """

    def mean_name_length(self):
        """
        Return the mean length (number of characters) of all names
        in the dataset.
        """
        total_chars = 0
        name_count = 0
        for name in self.names:
            total_chars += len(name)
            name_count += 1
        return total_chars / name_count

    def ngrams(self, size: int, prefix: str = None) -> List[str]:
        """
        Return a list of all character n-grams of all names (or only of
        those names starting with the given prefix).
        The size argument is the number of characters per n-gram.
        """

    def count_ngrams(self, size: int, prefix: str = None) -> List[Tuple[str, int]]:
        """
        Return a list of (n_gram, frequency) tuples for all n-grams
        produced with the given arguments, where frequency is the number
        of times the n-gram occurs among them.
        The list should be sorted by frequency in descending order
        (most frequent first).
        """

Number of names: 32033
['emma', 'paityn', 'blythe', 'emmanuelle', 'ivyanna', 'dempsey', 'arika', 'shanzay', 'paighton', 'saydie', 'grettell', 'lissette', 'mang', 'kaegan', 'breylin', 'myesha', 'brandii', 'life', 'ziasia', 'justus', 'mahdi', 'wynston', 'avyukt', 'kaesen', 'teagen', 'raeden', 'red', 'davud', 'grigor', 'chukwuebuka', 'vail', 'jerold', 'zeferino']


# Solution

In [2]:
from typing import List, Tuple, Union, Dict

from typing import List, Tuple, Union, Dict

# Load one name per line from names.txt, dropping blank / whitespace-only lines.
with open("names.txt", encoding="utf-8") as f:
    # Strip first, then filter: a raw line read from a file always contains at
    # least its newline, so testing the unstripped line never filters anything
    # and blank lines would end up in the list as empty names.
    names = [name for name in (line.strip() for line in f) if name]
print(f"Number of names: {len(names)}")
# Show every 1000th name as a quick sanity check of the loaded data.
print(names[::1000])

class NamesDataset:
    """
    In-memory collection of names supporting prefix queries, length
    statistics and character n-gram extraction / counting.
    """

    def __init__(self, names: List[str]):
        """
        Store ``names`` as ``self.names``.

        The list is kept by reference (not copied).
        """
        self.names = names

    def _select(self, prefix: Union[str, None]) -> List[str]:
        """Return all names if ``prefix`` is None, else those starting with it."""
        if prefix is None:
            return self.names
        return self.query(prefix)

    def query(self, prefix: str) -> List[str]:
        """
        Return all names that start with ``prefix``, in dataset order.
        """
        return [name for name in self.names if name.startswith(prefix)]

    def longest_name(
        self, prefix: Union[str, None] = None, return_len: bool = False
    ) -> Union[str, None, Tuple[Union[str, None], int]]:
        """
        Return the longest name, optionally restricted to a prefix.

        Args:
            prefix: If given, only names starting with it are considered.
            return_len: If True, return ``(name, length)`` instead of the name.

        Returns:
            The longest name; on ties the one appearing first in the dataset.
            If no name matches, the name is ``None`` and the reported
            length is ``-1``.
        """
        # max() returns the first maximal element, matching a strict ">" scan.
        longest = max(self._select(prefix), key=len, default=None)
        if not return_len:
            return longest
        longest_len = -1 if longest is None else len(longest)
        return longest, longest_len

    def mean_name_length(self) -> float:
        """
        Return the mean number of characters per name.

        Raises:
            ZeroDivisionError: If the dataset contains no names.
        """
        lengths = [len(name) for name in self.names]
        return sum(lengths) / len(lengths)

    @staticmethod
    def word_to_ngrams(name: str, size: int) -> List[str]:
        """
        Return all contiguous character n-grams of length ``size`` in ``name``.

        The n-grams are returned in left-to-right order. The result is empty
        when ``size`` is smaller than 1 or larger than ``len(name)``.
        """
        if size < 1:
            return []
        # Slide a window of width ``size`` over the name; the range is empty
        # when the name is shorter than the window.
        return [name[start:start + size] for start in range(len(name) - size + 1)]

    def ngrams(self, size: int, prefix: Union[str, None] = None) -> List[str]:
        """
        Return the n-grams of length ``size`` of every selected name.

        Args:
            size: Number of characters per n-gram.
            prefix: If given, only names starting with it are used.

        Returns:
            One flat list, name by name in dataset order; duplicates are kept.
        """
        return [
            n_gram
            for name in self._select(prefix)
            for n_gram in self.word_to_ngrams(name, size)
        ]

    def count_ngrams(self, size: int, prefix: Union[str, None] = None) -> List[Tuple[str, int]]:
        """
        Count how often each n-gram occurs among ``self.ngrams(size, prefix)``.

        Returns:
            A list of ``(n_gram, count)`` tuples sorted by count in descending
            order. The sort is stable, so n-grams with equal counts keep the
            order in which they were first encountered.
        """
        ngram_counts: Dict[str, int] = {}
        for n_gram in self.ngrams(size, prefix):
            ngram_counts[n_gram] = ngram_counts.get(n_gram, 0) + 1
        return sorted(
            ngram_counts.items(),
            key=lambda entry: entry[1],
            reverse=True,
        )

# Demo: exercise each NamesDataset method once and print the results.
dataset = NamesDataset(names)

layla_matches = dataset.query("layla")
print(layla_matches)

average_length = dataset.mean_name_length()
print(average_length)

# Longest name overall, longest with the prefix, and the latter with its length.
longest_overall = dataset.longest_name()
longest_layla = dataset.longest_name("layla")
longest_layla_with_len = dataset.longest_name("layla", return_len=True)
print(longest_overall, longest_layla, longest_layla_with_len)

layla_bigrams = dataset.ngrams(2, "layla")
print(layla_bigrams)

# First ten entries of the n-gram counts for 2-grams and then 5-grams.
for ngram_size in (2, 5):
    print(dataset.count_ngrams(ngram_size)[:10])

Number of names: 32033
['emma', 'paityn', 'blythe', 'emmanuelle', 'ivyanna', 'dempsey', 'arika', 'shanzay', 'paighton', 'saydie', 'grettell', 'lissette', 'mang', 'kaegan', 'breylin', 'myesha', 'brandii', 'life', 'ziasia', 'justus', 'mahdi', 'wynston', 'avyukt', 'kaesen', 'teagen', 'raeden', 'red', 'davud', 'grigor', 'chukwuebuka', 'vail', 'jerold', 'zeferino']
['layla', 'laylah', 'laylani', 'laylanie', 'laylamarie', 'laylarose', 'laylaa', 'laylanni', 'laylannie']
6.122217712983486
muhammadibrahim laylamarie ('laylamarie', 10)
['la', 'ay', 'yl', 'la', 'la', 'ay', 'yl', 'la', 'ah', 'la', 'ay', 'yl', 'la', 'an', 'ni', 'la', 'ay', 'yl', 'la', 'an', 'ni', 'ie', 'la', 'ay', 'yl', 'la', 'am', 'ma', 'ar', 'ri', 'ie', 'la', 'ay', 'yl', 'la', 'ar', 'ro', 'os', 'se', 'la', 'ay', 'yl', 'la', 'aa', 'la', 'ay', 'yl', 'la', 'an', 'nn', 'ni', 'la', 'ay', 'yl', 'la', 'an', 'nn', 'ni', 'ie']
[('an', 5438), ('ar', 3264), ('el', 3248), ('ri', 3033), ('na', 2977), ('le', 2921), ('en', 2675), ('la', 2623), 