In [1]:
#%matplotlib notebook
#%matplotlib widget
#%matplotlib ipympl
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(42)

Data Science for Humanities 1¶

Session: (Re-)introduction to Python¶

Part 3: Numpy & Scientific Programming: Moving from lists to arrays¶

Winter term 22/23¶

Prof. Goran Glavaš, Lennart Keller¶

Goals¶

After this part you'll know about:

  • Basics and benefits of Numpy
  • Properties of arrays in Numpy
  • How to create arrays
  • How to index and slice arrays
  • How to reshape arrays depending on your need
  • How to efficiently perform vectorized computations on arrays
  • Basics of broadcasting

Introduction:¶

Even though lists, dicts, custom classes, and all the other datatypes are helpful for processing your data in its raw form (e.g. texts, images, etc.), once you start analyzing it, you'll encode all of this data into numerical representations, like scalars, vectors, or matrices.

The process of encoding all sorts of data numerically is called feature engineering and highly depends on the type of data you want to encode. For now, we skip this process and take a look at the technical side of things.

numpy (short for Numerical Python) is the canonical library to work with large arrays of numerical data. Its conceptual core is the powerful and flexible container numpy.ndarray that can be used to model all sorts of different numerical structures (e.g. vectors, matrices, tensors).

Benefits of learning how numpy works¶

Beyond the ndarray itself, numpy provides a large set of functions that efficiently process ndarrays and implement the basic operations of linear algebra.

ndarrays also form the basis for a lot of other libraries, like pandas or xarray, rendering numpy the root of Python's data science stack.

Third-party libraries like scipy hook into the numpy world and provide further routines for scientific programming, and there is a plethora of libraries that do the same for more specific domains (e.g. rasterio for geospatial data).

There is a strong conceptual (and syntactical) relationship between numpy and deep-learning libraries like tensorflow and pytorch, so learning numpy helps you to quickly become proficient with them if you decide to do deep learning.

But, why don't we use built-in types like lists to store numbers?¶

Answer: Python isn't particularly fast at processing lists!

And often, numpy also allows you to write more concise code.

Example: Dot-Product¶

$$ \vec{a} \cdot \vec{b} = \sum_{i=1}^{N} a_i b_i $$
In [2]:
# Doing it the `pythonic` way
from typing import Union, List

NumericalType = Union[float, int]

def dot_product(x: List[NumericalType], y: List[NumericalType]) -> float:
    result = 0.0
    for xi, yi in zip(x, y):
        result += xi * yi
    return result
In [3]:
x = list(range(1000))
y = list(range(1000))

print(x[:10], y[:10])
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9] [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
In [4]:
dp = dot_product(x, y)
print(dp)
332833500.0
In [5]:
%%timeit
dp = dot_product(x, y)
44.1 µs ± 240 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
In [6]:
# Doing it the `numpy` way
import numpy as np

x_np = np.array(x)
y_np = np.array(y)

print(x_np[:10], y_np[:10])
[0 1 2 3 4 5 6 7 8 9] [0 1 2 3 4 5 6 7 8 9]
In [7]:
dp_np = (x_np * y_np).sum()
dp_np
Out[7]:
332833500
In [8]:
%%timeit
dp_np = (x_np * y_np).sum()
1.7 µs ± 13.3 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
In [9]:
%%timeit
# Sometimes you can go even faster if there is a built-in function available
np.dot(x_np, y_np)
964 ns ± 3.96 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
In [10]:
print(np.dot(x_np, y_np))
332833500

Why is numpy faster?¶

Reason 1: Fixed datatypes¶

Remember that Python's lists can store arbitrary types. If you iterate over a list, Python has to infer the datatype of each element in order to check which operations can be applied to it.

ndarrays instead have a fixed datatype: they can only store elements of the same type, which eliminates the need to perform type checking for each element.
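A quick sketch of this constraint (the silent truncation on assignment is standard numpy behavior):

int_array = np.array([1, 2, 3])
print(int_array.dtype)  # e.g. int64 (platform dependent)

# Assigning a float into an int array silently truncates it:
int_array[0] = 3.9
print(int_array)  # [3 2 3]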

In [11]:
# The np.array function takes in a (potentially nested) list of numbers and converts it into an ndarray

array = np.array([[1, 2, 3.5], [1, 3, 4]])
print("Type of the array:", type(array))
print("Datatype of the content of the array:", array.dtype)
print(array)
Type of the array: <class 'numpy.ndarray'>
Datatype of the content of the array: float64
[[1.  2.  3.5]
 [1.  3.  4. ]]

Reason 2: Vectorized operations¶

In contrast to lists, arrays have a fixed size: once created, they can't be expanded or reduced.

This allows numpy to efficiently apply element-wise operations, as long as both arrays have the same shape.

In [12]:
a, b = np.array([1, 2, 3]), np.array([4, 5, 6])
a + a, a * b, a - b
Out[12]:
(array([2, 4, 6]), array([ 4, 10, 18]), array([-3, -3, -3]))

It's also possible to efficiently compute with ndarrays and scalar values:

In [13]:
a = np.array([1, 2, 3])
b = a * 2
b
Out[13]:
array([2, 4, 6])
In [14]:
c = a**2
c
Out[14]:
array([1, 4, 9])

Numpy: Usage basics¶

Installation

Numpy is a third-party library and is not included in a standard Python installation.

To install it, open a terminal and type pip install numpy or conda install numpy, depending on your Python setup.

Import

By convention, the whole numpy package is imported under the abbreviation np.

import numpy as np

Creating ndarrays¶

There are many ways to create arrays.

Here is a quick overview of the most common ones:

  • Creating from existing (nested) lists
In [15]:
my_list = [[1, 2], [3, 4]]
my_array = np.array(my_list)
print(my_array)
[[1 2]
 [3 4]]
  • Creating arrays with fixed content
In [16]:
zero_array = np.zeros(shape=(2, 2))
print(zero_array)

ones_array = np.ones((2, 2))
print(ones_array)

constant_array = np.full(shape=(2, 2), fill_value=-100)
print(constant_array)
[[0. 0.]
 [0. 0.]]
[[1. 1.]
 [1. 1.]]
[[-100 -100]
 [-100 -100]]
  • Creating arrays with consecutive content
In [17]:
running_array = np.arange(0, 9)
print(running_array)

evenly_spaced_array = np.linspace(0, 1, num=5)
print(evenly_spaced_array)
[0 1 2 3 4 5 6 7 8]
[0.   0.25 0.5  0.75 1.  ]
  • Creating arrays with random numbers
In [18]:
rand_normal_array = np.random.randn(10000)
print(rand_normal_array.shape)
print("Mean:", rand_normal_array.mean(), "Std:", rand_normal_array.std())

rand_integer_array = np.random.randint(0, 2, (10))
print(rand_integer_array)
(10000,)
Mean: -0.002135983368426207 Std: 1.0034122061299875
[1 1 0 0 0 1 1 0 1 0]

ndarray: Dimensionality¶

Creating an ndarray from a nested list gives you a multidimensional array. In theory, ndarrays can have as many dimensions as you like.

Since arrays have a fixed size, each dimension also has a fixed length.

The number of dimensions and the size of each dimension determine the shape of an array.

In [19]:
table = np.array([
    [1, 2, 3],
    [4, 5, 6]
])
print("Number of dimensions:", table.ndim)
print("Shape of the array:", table.shape)
Number of dimensions: 2
Shape of the array: (2, 3)

There are special naming conventions depending on the shape of your arrays:

  • 1-dim arrays are sometimes referred to as vectors
  • 2-dim arrays might be called matrices

Note: Obviously, in a mathematical sense these terms carry more implications, but we ignore those for now.

ndarray: Indexing¶

Like lists, ndarrays support indexing and slicing to address single values or certain patches of your data.

The syntax mostly follows Python's conventions but is extended to directly address specific dimensions.

In [48]:
array = np.arange(27).reshape(3, 3, 3)
print(array.ndim)
print(array.shape)
print(array)
3
(3, 3, 3)
[[[ 0  1  2]
  [ 3  4  5]
  [ 6  7  8]]

 [[ 9 10 11]
  [12 13 14]
  [15 16 17]]

 [[18 19 20]
  [21 22 23]
  [24 25 26]]]

We created a 3-dimensional array, each dimension having a length of 3.

In [49]:
# Get the first element across all dimensions
print(array[0, 0, 0])

# Get the first "matrix"
print(array[0, :, :])

# Get the last row vector of the second matrix
print(array[1, -1, :])
0
[[0 1 2]
 [3 4 5]
 [6 7 8]]
[15 16 17]

To index or slice an ndarray, you can specify values for each dimension independently.

It's also possible to combine slicing and indexing at specific dimensions.

The values for all dimensions are written into one single indexing expression, separated by commas:

array[<dim0>, <dim1>, <dim2>, ...]
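For instance, on the 3×3×3 array from above, we can mix indexing and slicing freely (a small illustrative sketch):

# Fix the first matrix and the second column, slice over the rows:
print(array[0, :, 1])  # [1 4 7]

# Fix row 0 and column 0, slice over the matrices:
print(array[:, 0, 0])  # [ 0  9 18]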

Quiztime¶

In [50]:
print(array)
[[[ 0  1  2]
  [ 3  4  5]
  [ 6  7  8]]

 [[ 9 10 11]
  [12 13 14]
  [15 16 17]]

 [[18 19 20]
  [21 22 23]
  [24 25 26]]]

What is the output?

array[0, 1, 0]
In [51]:
array[0, 1, 0]
Out[51]:
3
array[-1, :2, 0]
In [52]:
array[-1, :2, 0]
Out[52]:
array([18, 21])
array[:, :, :]
In [54]:
array[:, :, :]
Out[54]:
array([[[ 0,  1,  2],
        [ 3,  4,  5],
        [ 6,  7,  8]],

       [[ 9, 10, 11],
        [12, 13, 14],
        [15, 16, 17]],

       [[18, 19, 20],
        [21, 22, 23],
        [24, 25, 26]]])

ndarray: Masking¶

Masking allows you to operate on only those values in an array which fulfill a certain condition.

In [56]:
array = np.random.randint(0, 100, (100, 100))
smaller_fifty_mask = array < 50
print(smaller_fifty_mask)
print(smaller_fifty_mask.shape)
[[False False False ...  True False False]
 [ True False  True ...  True  True  True]
 [ True False  True ...  True  True  True]
 ...
 [ True False False ...  True  True False]
 [False  True False ...  True False False]
 [False False  True ... False  True False]]
(100, 100)

A mask is just an ndarray of boolean values.

If you apply a mask to another array, only those values for which the mask entry is True are selected.

Masks can be used to perform two kinds of operations:

  1. Retrieving
  2. In-place operations

Retrieving¶

In [57]:
values_smaller_than_fifty = array[smaller_fifty_mask]
print(values_smaller_than_fifty)
print(values_smaller_than_fifty.shape)
[32  4 10 ... 25 36  1]
(4991,)

Note: The result is flattened, so you'll lose all positional information.
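If you do need the positions, you can apply np.nonzero to the mask itself; a short sketch (this goes beyond the example above):

# np.nonzero returns one index array per dimension
rows, cols = np.nonzero(smaller_fifty_mask)
print(rows[:3], cols[:3])         # positions of the first three selected values
print(array[rows[:3], cols[:3]])  # the corresponding values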

In-place operations¶

In [58]:
array[smaller_fifty_mask] = 0
print(array)
[[81 72 71 ...  0 82 75]
 [ 0 75  0 ...  0  0  0]
 [ 0 60  0 ...  0  0  0]
 ...
 [ 0 85 88 ...  0  0 53]
 [50  0 93 ...  0 78 91]
 [84 79  0 ... 54  0 75]]

ndarray: Reshaping¶

It's possible to change the shape of an existing array.

This operation is called reshaping. Reshaping works by specifying the new number of dimensions and their sizes.

To make reshaping work, the number of entries in the newly created array has to match the number of entries in the existing array.

In [27]:
vector = np.array([1, 2, 3, 4, 5, 6, 7, 8])
print(vector)
print("Orig shape:", vector.shape)
print("No. of entries:", vector.size)

matrix = vector.reshape(2, 4)
print(matrix)
print("New shape:", matrix.shape)
print("No. of entries:", matrix.size)

tuples = vector.reshape(-1, 2)
print(tuples)
print("New shape:", tuples.shape)
print("No. of entries:", tuples.size)
[1 2 3 4 5 6 7 8]
Orig shape: (8,)
No. of entries: 8
[[1 2 3 4]
 [5 6 7 8]]
New shape: (2, 4)
No. of entries: 8
[[1 2]
 [3 4]
 [5 6]
 [7 8]]
New shape: (4, 2)
No. of entries: 8

The .reshape method takes as many arguments as there are desired dimensions, and each argument specifies the number of entries in that dimension.

It's also possible to leave out the size of one dimension and just write -1; numpy will infer the size automatically.
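A minimal sketch of both points, the -1 shorthand and the size constraint (the exact error message may differ between numpy versions):

vector = np.arange(8)
print(vector.reshape(2, -1).shape)  # (2, 4) -- numpy infers the 4

try:
    vector.reshape(3, 3)  # would need 9 entries, but vector only has 8
except ValueError as e:
    print("Reshape failed:", e)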

Quiz¶

cube = np.arange(27).reshape(3, 3, 3)
arr = cube.reshape(-1)
arr.shape

What shape does arr have?

In [60]:
cube = np.arange(27).reshape(3, 3, 3)
arr = cube.reshape(-1)
arr.shape
arr
Out[60]:
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26])

ndarray: Computations¶

ufuncs¶

As we saw in the introductory example, computations with ndarrays can be amazingly fast (compared to standard Python).

This is achieved by using vectorized functions called ufuncs, which leverage the constraints of ndarrays to speed things up.

There are two types of ufuncs:

  • Unary functions, which operate on a single input and produce a single output
  • Binary functions, which take two inputs and produce one output

Examples: Unary ufuncs¶

In [28]:
x = np.random.randint(-10, 0, (10,))
In [29]:
print(np.abs(x))
print(np.square(x))
print(np.exp(x))
print("And many more!")
[ 6  9  9  7 10  4 10  8  1  4]
[ 36  81  81  49 100  16 100  64   1  16]
[2.47875218e-03 1.23409804e-04 1.23409804e-04 9.11881966e-04
 4.53999298e-05 1.83156389e-02 4.53999298e-05 3.35462628e-04
 3.67879441e-01 1.83156389e-02]
And many more!

Binary ufuncs¶

numpy overloads Python's standard arithmetic operators to perform the most essential operations as ufuncs.

In [30]:
print("Addition")
print(x + x)
print(np.add(x, x))

print("Multiplication")
print(x * x)
print(np.multiply(x, x))

print("Divide")
print(x / x)
print(np.divide(x, x))
Addition
[-12 -18 -18 -14 -20  -8 -20 -16  -2  -8]
[-12 -18 -18 -14 -20  -8 -20 -16  -2  -8]
Multiplication
[ 36  81  81  49 100  16 100  64   1  16]
[ 36  81  81  49 100  16 100  64   1  16]
Divide
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]

All other functions in numpy or scipy that take in ndarrays are vectorized too.

So as long as you stick to functions from these libraries, there is no need to worry about speed.

Aggregation functions¶

Another class of functions offered by numpy is the aggregation functions.

These functions take in an ndarray, apply an operation that aggregates its values, and return the result.

They are often used to compute summary statistics on data.

We already saw two examples of aggregation functions:

In [31]:
data = np.array([1, 2, 3])
data.mean(), data.std()
Out[31]:
(2.0, 0.816496580927726)

They compute the average or standard deviation of all values in an ndarray.

On arrays with more than one dimension, they simply gather all values into one flat array and then perform the aggregation on it:

In [32]:
data = np.arange(27).reshape(3, 3, 3)
data.mean(), data.reshape(-1).mean()
Out[32]:
(13.0, 13.0)

Their real potential lies in their ability to perform aggregations along specific dimensions/axes.
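On a small 2×3 matrix, the two axes behave like this (a minimal sketch):

m = np.arange(6).reshape(2, 3)  # [[0 1 2], [3 4 5]]
print(m.sum(axis=0))  # aggregates over the rows    -> [3 5 7]
print(m.sum(axis=1))  # aggregates over the columns -> [ 3 12]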

Example:

Suppose you are an erratic billionaire who just bought a social media platform and now has to quickly decrease operational costs to satisfy your co-financiers.

To decide who is due for layoff, your HR department provides you with a dataset of all your programmers and the number of lines of code each of them wrote on each of the past 30 days.

In [33]:
# Shape [n_programmers, n_days]
programmers_loc = np.random.randint(250, 500, size=(600, 30))
decay = np.arange(30) * np.random.randint(0, 20, size=(30,))
programmers_loc -= decay

print("Average number of lines of code written:", programmers_loc.mean())
Average number of lines of code written: 244.63327777777778

But just computing the overall mean doesn't help you decide whom to fire. You need the average number of lines written per employee.

You can compute these values by explicitly stating an axis along which the aggregation function has to operate. In numpy, axes (or dimensions, as we called them earlier) are indexed numerically.

Our dataset has the shape [n_programmers, n_days], so to compute each programmer's average number of lines of code written over the past 30 days, we have to aggregate all values along the second dimension (index=1):

In [34]:
average_loc_per_employee = programmers_loc.mean(axis=1)
print(average_loc_per_employee.shape)
(600,)

Doing so returns another ndarray with 600 entries, each representing one programmer's mean over the 30 days.

Using another aggregation function, argmin, which returns the index of the smallest element in an array (or along a given dimension), you can now easily find the most unproductive programmer.

In [35]:
np.min(average_loc_per_employee), np.argmin(average_loc_per_employee)
Out[35]:
(202.46666666666667, 495)

Obviously, it's possible to perform those kinds of operations along any dimension. Computing the mean along the first axis (index=0) gives you the average number of LoC across all programmers for each of the past 30 days:

In [36]:
plt.plot(programmers_loc.mean(axis=0))
Out[36]:
[<matplotlib.lines.Line2D at 0x10cae88e0>]

Side-Note: Methods vs. functions¶

Most aggregation operations are available in two "flavors": either as a function accessible via the np module, or as a method on the ndarray itself.

In [62]:
np.mean(programmers_loc), programmers_loc.mean()
Out[62]:
(244.63327777777778, 244.63327777777778)

Since there is no functional difference between those "flavors", it's up to you which one to choose :)

ndarray: Broadcasting¶

As we've seen, numpy is fast at processing ndarrays of the same shape.

This applies not only to vectors, but also to arrays of arbitrary dimensionality:

In [38]:
data1 = np.random.randint(0, 10, size=(2, 2, 2))
data2 = np.ones((2, 2, 2), dtype=np.int64)

print(data1)

print(data1 + data2)
[[[5 5]
  [3 1]]

 [[3 1]
  [6 8]]]
[[[6 6]
  [4 2]]

 [[4 2]
  [7 9]]]

But we've also seen that it is possible to perform arithmetic operations with ndarrays and scalar values.

In [39]:
print(data1 + 1)
[[[6 6]
  [4 2]]

 [[4 2]
  [7 9]]]

It is also possible to compute with arrays of different shapes under some circumstances:

In [63]:
matrix = np.arange(9).reshape(3, 3)
print(matrix)

row_vec = np.array([1, -1, 5])
print(row_vec)
print("matrix + row_vec=\n", matrix + row_vec)

col_vec = np.array([[1], [-1], [5]])
print(col_vec.shape)
print(col_vec)
print("matrix + col_vec=\n", matrix + col_vec)
[[0 1 2]
 [3 4 5]
 [6 7 8]]
[ 1 -1  5]
matrix + row_vec=
 [[ 1  0  7]
 [ 4  3 10]
 [ 7  6 13]]
(3, 1)
[[ 1]
 [-1]
 [ 5]]
matrix + col_vec=
 [[ 1  2  3]
 [ 2  3  4]
 [11 12 13]]

To perform those operations on ndarrays of mixed dimensionality, numpy uses broadcasting: it automatically adjusts the dimensions of the array with fewer dimensions by (virtually) padding it to the right size.

Source: https://jakevdp.github.io/PythonDataScienceHandbook/02.05-computation-on-arrays-broadcasting.html

To successfully perform broadcasting, some constraints on the shapes and sizes of the arrays have to be fulfilled:

for dim_a, dim_b in zip(a.shape[::-1], b.shape[::-1]):
    if not (1 in (dim_a, dim_b) or dim_a == dim_b):
        raise ValueError("operands could not be broadcast together")

If the arrays do not have the same number of dimensions, the array with fewer dimensions is expanded by adding dimensions of size 1 on the left.

(e.g. np.array([1]) with shape (1,) is treated as if it had shape (1, 1))
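A short sketch of these rules in action (the error message is numpy's, abbreviated here):

a = np.ones((3, 3))

print((a + np.ones(3)).shape)       # (3,) is padded to (1, 3) and stretched -> (3, 3)
print((a + np.ones((3, 1))).shape)  # (3, 1) is stretched along the last axis -> (3, 3)

try:
    a + np.ones(2)  # trailing dimensions 3 and 2 are incompatible
except ValueError as e:
    print("Broadcast failed:", e)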

Broadcasting Example: Convert RGB- to grayscale-images¶

Before we start let's load an image.

In [64]:
from skimage import data

picture = data.astronaut()
print(type(picture))
print("Shape of the picture", picture.shape)

plt.imshow(picture)
plt.show()
<class 'numpy.ndarray'>
Shape of the picture (512, 512, 3)

We can see that the picture has a spatial resolution of 512 × 512 pixels.

The picture follows the RGB color model, so each pixel is described by the amounts of red, green, and blue.

The channels make up the last dimension of the picture.

Using indexing, it is easy to look at each channel independently:

In [66]:
channel_map = {0: "red", 1: "green", 2: "blue"}
for channel_index in range(picture.shape[2]):
    channel_name = channel_map[channel_index]
    plt.imshow(picture[:, :, channel_index], cmap="Greys")
    plt.title(f"Channel: {channel_name}")
    plt.show()

The RGB color model is an additive one: by adding the maximum amount of all three basic colors, we obtain white.

In [43]:
white = np.full((512, 512, 3), 255)
white[128:384, 128:384, :] = [127, 127, 255]
plt.imshow(white)
plt.show()

Vice versa, by adding no amount of any color, a pixel will be black. Grayscale images do not encode any color and just represent the amount of light visible in the picture.

To create a grayscale version of an RGB image, we have to compute the amount of light using the following formula: $$ Y = 0.2126 R + 0.7152 G + 0.0722 B $$ This is effectively a weighted sum over the three channels of the image. (The code below uses slightly different, but equally common, weighting coefficients.)

Since we want to perform this operation for all 262144 pixels in the image, broadcasting comes in naturally:

In [44]:
# Create an array with the RGB-to-grayscale weights
rgb_grayscale_weights = np.array([0.2125, 0.7154, 0.0721])

print("First pixel in original image", picture[0, 0, :])

# First, we use broadcasting to scale all values in the RGB channels
scaled_rgb_picture = rgb_grayscale_weights * picture
print("First pixel with scaled rgb values", scaled_rgb_picture[0, 0, :])

# To receive the gray values, just sum over the channels for each pixel
grayscaled_picture = scaled_rgb_picture.sum(axis=2)

print("Grayscaled image shape", grayscaled_picture.shape)
plt.imshow(grayscaled_picture, cmap="gray")

plt.show()
First pixel in original image [154 147 151]
First pixel with scaled rgb values [ 32.725  105.1638  10.8871]
Grayscaled image shape (512, 512)

Further reading¶

  • Python Data Science Handbook: Introduction to NumPy