import numpy as np

import numpy as np
# Build a 2x2 ndarray from a nested list: each inner list becomes one row.
my_list = [
    [1, 2],
    [3, 4],
]
my_array = np.array(my_list)
print(my_array)

[[1 2]
 [3 4]]

# Constant-filled 2x2 arrays. zeros() and ones() produce floats by default,
# while full() takes its dtype from the fill value (an integer here).
zero_array = np.zeros((2, 2))
print(zero_array)

ones_array = np.ones(shape=(2, 2))
print(ones_array)

constant_array = np.full((2, 2), -100)
print(constant_array)

[[0. 0.]
 [0. 0.]]
[[1. 1.]
 [1. 1.]]
[[-100 -100]
 [-100 -100]]

np.zeros(shape=(3, 2))

array([[0., 0.],
       [0., 0.],
       [0., 0.]])

numbers = np.arange(9)
numbers

array([0, 1, 2, 3, 4, 5, 6, 7, 8])

# 10,000 draws from the standard normal distribution; the sample mean and
# standard deviation should land close to 0 and 1.
rand_normal_array = np.random.randn(10_000)
print(rand_normal_array.shape)
print("Mean:", rand_normal_array.mean(), "Std:", rand_normal_array.std())

# Ten random integers from [0, 2): the upper bound is exclusive, so every
# entry is either 0 or 1.
rand_integer_array = np.random.randint(low=0, high=2, size=10)
print(rand_integer_array)

(10000,)
Mean: 0.0011282100385092903 Std: 1.0016185642357456
[0 0 0 1 1 0 0 1 0 0]

# The integers 0..26 arranged as three stacked 3x3 matrices.
array = np.arange(3 * 3 * 3).reshape((3, 3, 3))
print(array.ndim)
print(array.shape)
print(array)

3
(3, 3, 3)
[[[ 0  1  2]
  [ 3  4  5]
  [ 6  7  8]]

 [[ 9 10 11]
  [12 13 14]
  [15 16 17]]

 [[18 19 20]
  [21 22 23]
  [24 25 26]]]

# Indexing takes one index per dimension, separated by commas.
# Get the first element across all dimensions (index 0 on every axis)
print(array[0, 0, 0])

# Get the first "matrix": fix the first axis at 0; ":" keeps the other two axes whole
print(array[0, :, :])

# Get the last row vector of the second matrix: index 1 on the first axis,
# -1 (the last entry) on the second axis, and every entry of the third axis
print(array[1, -1, :])

0
[[0 1 2]
 [3 4 5]
 [6 7 8]]
[15 16 17]

# How to get number 17?

array[1, 2, 2], array[1, -1, -1]

(17, 17)

array[<dim0>, <dim1>, <dim2>, ...]

# Simulated visit counts on a 365 x 24 grid (one row per day, one column per
# hour); each count is a random integer from 100 up to, but excluding, 1000.
site_traffic = np.random.randint(low=100, high=1000, size=(365, 24))
site_traffic.shape, site_traffic.mean()

((365, 24), 549.8228310502283)

# Count the hours with more than 750 visits the slow way: visit every cell
# with explicit Python loops and tally the ones above the threshold.
high_traffic_hours = 0
for daily_counts in site_traffic:
    for traffic in daily_counts:
        if traffic > 750:
            high_traffic_hours += 1
high_traffic_hours

2382

high_traffic_periods = site_traffic > 750
high_traffic_periods

array([[ True, False, False, ..., False,  True, False],
       [ True, False, False, ..., False, False, False],
       [ True, False, False, ..., False, False, False],
       ...,
       [ True, False, False, ..., False, False,  True],
       [False,  True, False, ...,  True,  True,  True],
       [False, False, False, ..., False,  True, False]])

high_traffic_periods.sum()

2382

high_traffic_values = site_traffic[high_traffic_periods]
print(high_traffic_values)
print(high_traffic_values.shape)

[776 762 903 ... 903 929 876]
(2382,)

# Work on a copy so that site_traffic itself stays unchanged.
low_median_traffic = site_traffic.copy()
# Masked assignment: every entry where the boolean mask is True is
# overwritten in place; all other entries keep their value.
low_median_traffic[high_traffic_periods] = 0.0
low_median_traffic

array([[  0, 100, 470, ..., 334,   0, 734],
       [  0, 616, 241, ..., 257, 205, 300],
       [  0, 502, 320, ..., 275, 634, 481],
       ...,
       [  0, 194, 441, ..., 194, 393,   0],
       [176,   0, 364, ...,   0,   0,   0],
       [161, 661, 121, ..., 472,   0, 440]])

# reshape() arranges the same entries in a new shape; the number of entries
# (size) stays the same throughout.
vector = np.array([1, 2, 3, 4, 5, 6, 7, 8])
print(vector)
print("Orig shape:", vector.shape)
print("No. of entries:", vector.size)

# 8 entries -> 2 rows of 4.
matrix = vector.reshape((2, 4))
print(matrix)
print("New shape:", matrix.shape)
print("No. of entries:", matrix.size)

# -1 lets NumPy infer that dimension: 8 entries / 2 columns = 4 rows.
tuples = vector.reshape((-1, 2))
print(tuples)
print("New shape:", tuples.shape)
print("No. of entries:", tuples.size)

[1 2 3 4 5 6 7 8]
Orig shape: (8,)
No. of entries: 8
[[1 2 3 4]
 [5 6 7 8]]
New shape: (2, 4)
No. of entries: 8
[[1 2]
 [3 4]
 [5 6]
 [7 8]]
New shape: (4, 2)
No. of entries: 8

# Average hourly visits
site_traffic.mean(), np.mean(site_traffic)

(549.8228310502283, 549.8228310502283)

# Average hourly visits for each day (one value per day)
site_traffic.mean(axis=1).shape

(365,)

a, b  = np.arange(3), np.arange(3) * 2
a, b

(array([0, 1, 2]), array([0, 2, 4]))

# Element wise addition
c = a + b
c

array([0, 3, 6])

import matplotlib.pyplot as plt



# Rotation angle in radians (pi / 9 corresponds to 20 degrees)
theta = np.pi / 9
# 2D rotation matrix [[cos, -sin], [sin, cos]] for the angle theta
rotation_matrix = np.array(
    [
        [np.cos(theta), -np.sin(theta)],
        [np.sin(theta), np.cos(theta)],
    ]
)

# Two 2D points, one per row: (1, 1) and the origin (0, 0)
points = np.array([[1, 1], [0, 0]])

plt.scatter(x=points[:, 0], y=points[:, 1])
plt.grid(True)
plt.show()

# Matrix multiplication to rotate points in space.
# `points` stores one point per row, but the rotation matrix acts on column
# vectors: transpose to (2, N), rotate, then transpose back to (N, 2) so the
# result keeps the same row-per-point layout as `points`. Without the final
# transpose the columns would be points, and the [:, 0] / [:, 1] indexing
# below would plot mixed-up coordinates.
points_rotated = (rotation_matrix @ points.T).T
# Equivalent to:
# points_rotated = np.transpose(np.matmul(rotation_matrix, np.transpose(points)))
# or, without any explicit transpose of the points:
# points_rotated = points @ rotation_matrix.T


# Column 0 holds the x coordinates, column 1 the y coordinates.
plt.scatter(x=points_rotated[:, 0], y=points_rotated[:, 1])
plt.grid(True)
plt.show()

point_shifted = points + 2
plt.scatter(x=point_shifted[:, 0], y=point_shifted[:, 1])
plt.grid(True)
plt.show()

# Broadcasting demo: add a 1-D vector and a (3, 1) column to a 3x3 matrix.
matrix = np.arange(9).reshape((3, 3))
print(matrix)

# Shape (3,): the vector is added to every row of the matrix.
row_vec = np.array([1, -1, 5])
print(row_vec)
print("matrix + row_vec=\n", matrix + row_vec)

# Shape (3, 1): each row of the matrix gets its own single offset.
col_vec = np.array(
    [
        [1],
        [-1],
        [5],
    ]
)
print(col_vec.shape)
print(col_vec)
print("matrix + col_vec=\n", matrix + col_vec)

[[0 1 2]
 [3 4 5]
 [6 7 8]]
[ 1 -1  5]
matrix + row_vec=
 [[ 1  0  7]
 [ 4  3 10]
 [ 7  6 13]]
(3, 1)
[[ 1]
 [-1]
 [ 5]]
matrix + col_vec=
 [[ 1  2  3]
 [ 2  3  4]
 [11 12 13]]

# Broadcasting rule: compare the two shapes from the trailing dimension
# backwards; each pair of dimensions must be equal, or one of them must be 1.
for dim_a, dim_b in zip(reversed(a.shape), reversed(b.shape)):
    if dim_a != dim_b and 1 not in (dim_a, dim_b):
        raise ValueError("operands could not be broadcast together")

import pandas as pd

# One inner list per purchased item; the values follow the order of `columns`.
data = [
    [0, "Beer", 0.89, 6],
    [0, "Chips", 1.99, 1],
    [1, "Milk", 1.20, 3],
    [2, "Bread", 2.55, 1],
]
columns = ["Transaction", "Product", "Price", "Quantity"]

# Each inner list becomes one row; `columns` supplies the column labels.
df = pd.DataFrame(data, columns=columns)
df

df.shape, df.columns, df.index

((4, 4),
 Index(['Transaction', 'Product', 'Price', 'Quantity'], dtype='object'),
 RangeIndex(start=0, stop=4, step=1))

# Access columns
df["Price"]

0    0.89
1    1.99
2    1.20
3    2.55
Name: Price, dtype: float64

# Access rows
df.loc[0]

Transaction       0
Product        Beer
Price          0.89
Quantity          6
Name: 0, dtype: object

# Sorting based on specific columns
df.sort_values(by="Price", ascending=False)

# Aggregating all rows with a specific value
df.groupby("Transaction")["Price"].sum()

Transaction
0    2.88
1    1.20
2    2.55
Name: Price, dtype: float64

df.groupby("Product")["Quantity"].mean()

Product
Beer     6.0
Bread    1.0
Chips    1.0
Milk     3.0
Name: Quantity, dtype: float64

# Counting how often each unique value occurs in a column
df.Transaction.value_counts()

Transaction
0    2
1    1
2    1
Name: count, dtype: int64

# Querying for data:
df.query("Price < 2.0")
# Equivalent to
# df[df["Price"] < 2.0]

df.query("Product.str.startswith('B')")

# Equivalent to 
# df[df["Product"].str.startswith("B")]

def is_even(x):
    """Return whether ``x`` is divisible by 2.

    Only ``%`` and ``==`` are applied to ``x``, so the result is a plain
    bool for a plain number and whatever those operators yield for other
    operand types.
    """
    remainder = x % 2
    return remainder == 0

df.query("@is_even(Transaction)")

# Equivalent to 
# df[[is_even(t) for t in df["Transaction"]]]

# Saving data in a variety of commonly used formats
# index=False: do not write the row index into the files
df.to_csv("sales.csv", index=False)
df.to_excel("sales.xlsx", index=False)

# Loading data in various formats
# Read the Excel file written above back into a DataFrame
df = pd.read_excel("sales.xlsx")
df

Data Science for Humanities 2¶

Session: Python's Data Science Stack¶

Part 1: Numpy & Pandas¶

Summer term 25¶

Prof. Goran Glavaš, Lennart Keller¶

`Numpy` & Scientific Programming: Moving from lists to arrays¶

First things first - Numpy: Usage basics¶

Creating `ndarrays`¶

`ndarray`: Dimensionality¶

`ndarray`: Indexing¶

`ndarray`: Masking¶

Retrieving¶

Inplace Manipulations¶

`ndarray`: Reshaping¶

`ndarray`: Computations¶

`ndarray`: Broadcasting¶

`Pandas`: Wrangling with data in Python¶

`DataFrame`¶

How to really learn `Pandas`?¶

Data Science for Humanities 2¶

Session: Python's Data Science Stack¶

Part 1: Numpy & Pandas¶

Summer term 25¶

Prof. Goran Glavaš, Lennart Keller¶

Numpy & Scientific Programming: Moving from lists to arrays¶

First things first - Numpy: Usage basics¶

Creating ndarrays¶

ndarray: Dimensionality¶

ndarray: Indexing¶

ndarray: Masking¶

Retrieving¶

Inplace Manipulations¶

ndarray: Reshaping¶

ndarray: Computations¶

ndarray: Broadcasting¶

Pandas: Wrangling with data in Python¶

DataFrame¶

How to really learn Pandas?¶

`Numpy` & Scientific Programming: Moving from lists to arrays¶

Creating `ndarrays`¶

`ndarray`: Dimensionality¶

`ndarray`: Indexing¶

`ndarray`: Masking¶

`ndarray`: Reshaping¶

`ndarray`: Computations¶

`ndarray`: Broadcasting¶

`Pandas`: Wrangling with data in Python¶

`DataFrame`¶

How to really learn `Pandas`?¶