Démos sur la complexité algorithmique et l’intensité arithmétique

Sources

Sources disponibles au format notebook Jupyter

Préparation

Pensez à désactiver le TurboBoost (augmentation temporaire de la fréquence du processeur) afin d’obtenir des temps de calcul plus consistants.

Sous Linux, vous pouvez le faire avec la commande :

sudo wrmsr -p0 0x1a0 0x4000850089

Pour le réactiver à nouveau, redémarrez ou utilisez :

sudo wrmsr -p0 0x1a0 0x850089

Pour exécuter ce Notebook, vous avez besoin des modules numpy et matplotlib.

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 

plt.rcParams['figure.figsize'] = (16, 8)
import timeit

def tictoc(stmt='pass', setup='pass'):
    timer = timeit.Timer(stmt, setup)
    r = timer.autorange() # Returns (execution count, total time)
    return r[1] / r[0]
def benchmark(N_list, stmt=lambda N: 'pass', setup=lambda N: 'pass'):

    # Benchmark
    t_list = []
    for i in range(len(N_list)):
        N = N_list[i]

        if i > 0:
            print(f"N = {N:12g} (x{N / N_list[i-1]:.2f}) ; t = ", end='', flush=True)
        else:
            print(f"N = {N:12g} ; t = ", end='', flush=True)

        t_list.append(tictoc(stmt(N), setup(N)))

        if i > 0:
            print(f"{t_list[i]:12g}s (x{t_list[i]/t_list[i-1]:.2f})")
        else:
            print(f"{t_list[i]:12g}s")

    return np.array(N_list), np.array(t_list)


Complexité algorithmique en temps

def complexity(N_list, t_list):
    # Graph in linear scale
    plt.figure()
    plt.title('linear scale')
    plt.plot(N_list, t_list, 'x-')
    plt.xlabel('N')
    plt.ylabel('t')
    plt.grid()
    plt.show()

    # Graph in loglog scale
    plt.figure()
    plt.title('loglog scale')
    plt.loglog(N_list, t_list, 'x-')
    plt.xlabel('N')
    plt.ylabel('t')
    plt.grid()
    plt.show()

    # Complexity estimation
    plt.figure()
    plt.title('p estimation in O(N^p)')
    plt.semilogx(N_list[1:], np.diff(np.log(t_list)) / np.diff(np.log(N_list)), 'x-')
    plt.xlabel('N')
    plt.ylabel('p')
    plt.grid()
    plt.show()

Produit scalaire

N_list, t_list = benchmark(
    np.logspace(1, 8, 15),
    lambda N: 'np.dot(u, v)',
    lambda N: f"import numpy as np; N = {int(N)}; u = np.ones(N); v = np.ones(N)")
complexity(N_list, t_list)
N =           10 ; t =  1.05107e-06s
N =      31.6228 (x3.16) ; t =  1.04053e-06s (x0.99)
N =          100 (x3.16) ; t =  1.03611e-06s (x1.00)
N =      316.228 (x3.16) ; t =   1.0777e-06s (x1.04)
N =         1000 (x3.16) ; t =  1.19262e-06s (x1.11)
N =      3162.28 (x3.16) ; t =  1.99129e-06s (x1.67)
N =        10000 (x3.16) ; t =  3.30299e-06s (x1.66)
N =      31622.8 (x3.16) ; t =  4.47103e-06s (x1.35)
N =       100000 (x3.16) ; t =  9.13646e-06s (x2.04)
N =       316228 (x3.16) ; t =  2.85211e-05s (x3.12)
N =        1e+06 (x3.16) ; t =  0.000247144s (x8.67)
N =  3.16228e+06 (x3.16) ; t =   0.00131324s (x5.31)
N =        1e+07 (x3.16) ; t =   0.00472349s (x3.60)
N =  3.16228e+07 (x3.16) ; t =    0.0154287s (x3.27)
N =        1e+08 (x3.16) ; t =    0.0505415s (x3.28)

png

png

png

Produit matrice x vecteur

N_list, t_list = benchmark(
    np.logspace(1, 4.5, 15),
    lambda N: f"A @ u",
    lambda N: f"import numpy as np; N = {int(N)}; A = np.ones((N, N)); u = np.ones(N)")
complexity(N_list, t_list)
N =           10 ; t =  1.22919e-06s
N =      17.7828 (x1.78) ; t =  1.31257e-06s (x1.07)
N =      31.6228 (x1.78) ; t =  1.35403e-06s (x1.03)
N =      56.2341 (x1.78) ; t =  1.74956e-06s (x1.29)
N =          100 (x1.78) ; t =  3.17784e-06s (x1.82)
N =      177.828 (x1.78) ; t =  4.35106e-06s (x1.37)
N =      316.228 (x1.78) ; t =  6.21931e-06s (x1.43)
N =      562.341 (x1.78) ; t =  1.71963e-05s (x2.76)
N =         1000 (x1.78) ; t =  5.57912e-05s (x3.24)
N =      1778.28 (x1.78) ; t =  0.000486685s (x8.72)
N =      3162.28 (x1.78) ; t =   0.00229801s (x4.72)
N =      5623.41 (x1.78) ; t =   0.00776526s (x3.38)
N =        10000 (x1.78) ; t =    0.0250533s (x3.23)
N =      17782.8 (x1.78) ; t =    0.0800647s (x3.20)
N =      31622.8 (x1.78) ; t =     0.255795s (x3.19)

png

png

png

Produit matrice x matrice

N_list, t_list = benchmark(
    np.logspace(1, 4, 13),
    lambda N: f"A @ B",
    lambda N: f"import numpy as np; N = {int(N)}; A = np.ones((N, N)); B = np.ones((N, N))")
complexity(N_list, t_list)
N =           10 ; t =  1.90318e-06s
N =      17.7828 (x1.78) ; t =  2.40894e-06s (x1.27)
N =      31.6228 (x1.78) ; t =  4.30803e-06s (x1.79)
N =      56.2341 (x1.78) ; t =  6.51653e-06s (x1.51)
N =          100 (x1.78) ; t =   1.5339e-05s (x2.35)
N =      177.828 (x1.78) ; t =  6.65733e-05s (x4.34)
N =      316.228 (x1.78) ; t =  0.000313453s (x4.71)
N =      562.341 (x1.78) ; t =   0.00174288s (x5.56)
N =         1000 (x1.78) ; t =   0.00992882s (x5.70)
N =      1778.28 (x1.78) ; t =    0.0510079s (x5.14)
N =      3162.28 (x1.78) ; t =     0.272624s (x5.34)
N =      5623.41 (x1.78) ; t =       1.5006s (x5.50)
N =        10000 (x1.78) ; t =       8.3724s (x5.58)

png

png

png

Intensité arithmétique

## Spécifique à mon ordinateur !!!
flop_per_cycle = 16
core_count = 6
core_frequency = 2.7e9
memory_channel = 2
memory_frequency = 2667e6
memory_bits = 64

peak_flops = core_count * flop_per_cycle * core_frequency
memory_bandwith = memory_channel * memory_frequency * memory_bits / 8

print(f"Peak performance: {peak_flops / 1e9:.2f} Gflop/s")
print(f"Max memory bandwith: {memory_bandwith / 1e9:.2f} GB/s")
print(f"Max memory bandwith: {memory_bandwith / 1e9 / 8:.2f} GT/s (in double transfers per second)")
print(f"Optimal arithmetic intensity: {peak_flops / (memory_bandwith / 8):.2f}")
Peak performance: 259.20 Gflop/s
Max memory bandwith: 42.67 GB/s
Max memory bandwith: 5.33 GT/s (in double transfers per second)
Optimal arithmetic intensity: 48.59

Modèle Roofline

def roofline(peak_flops, peak_transfer, bench_ia=np.array([]), bench_flops=np.array([])):
    f = lambda ia: np.minimum(peak_flops, peak_transfer * ia)
    ia = np.logspace(-1, 4, 101)
    plt.figure()
    plt.loglog(ia, f(ia) / 1e9)
    #plt.plot(bench_ia, bench_flops / 1e9, 'o')
    plt.scatter(bench_ia, bench_flops / 1e9, c=range(len(bench_ia)))
    plt.grid()
    plt.xlabel("Arithmetic intensity")
    plt.ylabel("Gflops/s")
    plt.show()

roofline(peak_flops, memory_bandwith / 8)

png

Produit scalaire

N_list, t_list = benchmark(
    np.logspace(1, 8, 15),
    lambda N: 'np.dot(u, v)',
    lambda N: f"import numpy as np; N = {int(N)}; u = np.ones(N); v = np.ones(N)")

operations = 2 * N_list - 1
input_output = 2 * N_list + 1

flops = operations / t_list
ia  = operations / input_output
roofline(peak_flops, memory_bandwith / 8, ia, flops)
N =           10 ; t =   1.0438e-06s
N =      31.6228 (x3.16) ; t =  1.05943e-06s (x1.01)
N =          100 (x3.16) ; t =  1.07154e-06s (x1.01)
N =      316.228 (x3.16) ; t =  1.09916e-06s (x1.03)
N =         1000 (x3.16) ; t =   1.2144e-06s (x1.10)
N =      3162.28 (x3.16) ; t =  1.83971e-06s (x1.51)
N =        10000 (x3.16) ; t =  3.29834e-06s (x1.79)
N =      31622.8 (x3.16) ; t =  4.35697e-06s (x1.32)
N =       100000 (x3.16) ; t =  8.76171e-06s (x2.01)
N =       316228 (x3.16) ; t =  2.85693e-05s (x3.26)
N =        1e+06 (x3.16) ; t =  0.000199505s (x6.98)
N =  3.16228e+06 (x3.16) ; t =   0.00132559s (x6.64)
N =        1e+07 (x3.16) ; t =   0.00474764s (x3.58)
N =  3.16228e+07 (x3.16) ; t =    0.0153812s (x3.24)
N =        1e+08 (x3.16) ; t =    0.0497987s (x3.24)

png

plt.figure()
plt.loglog(N_list, flops / 1e9)
plt.scatter(N_list, flops / 1e9, c=range(len(N_list)))
plt.xlabel('N')
plt.ylabel('Gflops/s')
plt.grid()
plt.show()

png

Produit matrice x vecteur

N_list, t_list = benchmark(
    np.logspace(1, 4.5, 15),
    lambda N: f"A @ u",
    lambda N: f"import numpy as np; N = {int(N)}; A = np.ones((N, N)); u = np.ones(N)")

operations = N_list * (2 * N_list - 1)
input_output = N_list**2 + 2 * N_list

flops = operations / t_list
ia  = operations / input_output
roofline(peak_flops, memory_bandwith / 8, ia, flops)
N =           10 ; t =  1.26035e-06s
N =      17.7828 (x1.78) ; t =  1.30422e-06s (x1.03)
N =      31.6228 (x1.78) ; t =  1.38104e-06s (x1.06)
N =      56.2341 (x1.78) ; t =  1.71519e-06s (x1.24)
N =          100 (x1.78) ; t =  2.70578e-06s (x1.58)
N =      177.828 (x1.78) ; t =  4.34213e-06s (x1.60)
N =      316.228 (x1.78) ; t =  6.18147e-06s (x1.42)
N =      562.341 (x1.78) ; t =  1.73249e-05s (x2.80)
N =         1000 (x1.78) ; t =  4.91323e-05s (x2.84)
N =      1778.28 (x1.78) ; t =   0.00047967s (x9.76)
N =      3162.28 (x1.78) ; t =   0.00231569s (x4.83)
N =      5623.41 (x1.78) ; t =   0.00775096s (x3.35)
N =        10000 (x1.78) ; t =     0.025109s (x3.24)
N =      17782.8 (x1.78) ; t =    0.0799698s (x3.18)
N =      31622.8 (x1.78) ; t =     0.258249s (x3.23)

png

plt.figure()
plt.loglog(N_list, flops / 1e9)
plt.scatter(N_list, flops / 1e9, c=range(len(N_list)))
plt.xlabel('N')
plt.ylabel('Gflops/s')
plt.grid()
plt.show()

png

Produit matrice x matrice

N_list, t_list = benchmark(
    np.logspace(1, 4, 13),
    lambda N: f"A @ B",
    lambda N: f"import numpy as np; N = {int(N)}; A = np.ones((N, N)); B = np.ones((N, N))")

operations = N_list**2 * (2*N_list - 1)
input_output = 3 * N_list**2

flops = operations / t_list
ia  = operations / input_output
roofline(peak_flops, memory_bandwith / 8, ia, flops)
N =           10 ; t =  2.01286e-06s
N =      17.7828 (x1.78) ; t =  2.48949e-06s (x1.24)
N =      31.6228 (x1.78) ; t =  4.28999e-06s (x1.72)
N =      56.2341 (x1.78) ; t =  6.37573e-06s (x1.49)
N =          100 (x1.78) ; t =  1.61976e-05s (x2.54)
N =      177.828 (x1.78) ; t =  6.56802e-05s (x4.05)
N =      316.228 (x1.78) ; t =  0.000319644s (x4.87)
N =      562.341 (x1.78) ; t =   0.00174173s (x5.45)
N =         1000 (x1.78) ; t =   0.00907966s (x5.21)
N =      1778.28 (x1.78) ; t =    0.0510608s (x5.62)
N =      3162.28 (x1.78) ; t =     0.271739s (x5.32)
N =      5623.41 (x1.78) ; t =      1.49738s (x5.51)
N =        10000 (x1.78) ; t =      8.35061s (x5.58)

png

plt.figure()
plt.loglog(N_list, flops / 1e9)
plt.scatter(N_list, flops / 1e9, c=range(len(N_list)))
plt.xlabel('N')
plt.ylabel('Gflops/s')
plt.grid()
plt.show()

png