Démos sur le calcul flottant

Sources

Performances des nombres sous-normaux

Pour tester la représentation en base 2 : https://www.h-schmidt.net/FloatConverter/IEEE754.html

import numpy as np
import time
import math
import matplotlib.pyplot as plt
%matplotlib inline
# Python utilise des nombres flottants à double précision (64 bits) :
# - signe sur 1 bit
# - mantisse de 53 bits (52 stockés)
# - exposant sur 11 bits (de -1024 à +1023)
e_max = -(1024-100)
e_min = -(1024+60) # Pour légèrement dépasser la capacité des nombres sous-normaux
N = 100000

duration = np.empty(e_max - e_min + 1)

for e in range(e_min, e_max+1):
    values = (2**e) * np.ones(N)
    tic = time.process_time()
    r = values.sum()
    duration[e - e_min] = time.process_time() - tic;

plt.figure(figsize=(12, 6))
plt.semilogy(range(e_min, e_max+1), duration)
plt.xlabel('exposant')
plt.ylabel('temps');

png

Approximation des réels

a = 10000.0
b = 9999.5
c = 0.1
print("{:.20f}".format(c))
0.10000000000000000555
a1 = a + c
print("{:.20f}".format(a1))
10000.10000000000036379788
print("{:.20f}".format(a1 - b))
0.60000000000036379788
print("{:.20f}".format(0.6))
0.59999999999999997780

Les modes d’arrondi

Le code en C++ :

#include <iostream>
#include <iomanip>
#include <cfenv>

int main()
{
  double x = 1;

  std::cout << std::setprecision(50);

  std::fesetround(FE_TONEAREST); // Par défaut
  std::cout << "Au plus proche:  1/10  =  " << x/10 << std::endl;
  std::cout << "Au plus proche: -1/10  = " << -x/10 << std::endl;
  std::cout << std::endl;

  std::fesetround(FE_DOWNWARD);
  std::cout << "Vers -oo:        1/10  =  " << x/10 << std::endl;
  std::cout << "Vers -oo:       -1/10  = " << -x/10 << std::endl;
  std::cout << std::endl;

  std::fesetround(FE_UPWARD);
  std::cout << "Vers +oo:        1/10  =  " << x/10 << std::endl;
  std::cout << "Vers +oo:       -1/10  = " << -x/10 << std::endl;
  std::cout << std::endl;

  std::fesetround(FE_TOWARDZERO);
  std::cout << "Vers 0:          1/10  =  " << x/10 << std::endl;
  std::cout << "Vers 0:         -1/10  = " << -x/10 << std::endl;
  std::cout << std::endl;

  return 0;
}

Le résultat :

Au plus proche:  1/10  =  0.1000000000000000055511151231257827021181583404541
Au plus proche: -1/10  = -0.1000000000000000055511151231257827021181583404541

Vers -oo:        1/10  =  0.099999999999999991673327315311325946822762489318847
Vers -oo:       -1/10  = -0.10000000000000000555111512312578270211815834045411

Vers +oo:        1/10  =  0.10000000000000000555111512312578270211815834045411
Vers +oo:       -1/10  = -0.099999999999999991673327315311325946822762489318847

Vers 0:          1/10  =  0.099999999999999991673327315311325946822762489318847
Vers 0:         -1/10  = -0.099999999999999991673327315311325946822762489318847

Unit in the Last Place

def ulp(v):
    print("{}\t{}".format(v, v - np.nextafter(v, v-1)))

ulp(1000)
ulp(100)
ulp(10)
ulp(1)
ulp(0.1)
ulp(0.01)
ulp(0)
1000    1.1368683772161603e-13
100 1.4210854715202004e-14
10  1.7763568394002505e-15
1   1.1102230246251565e-16
0.1 1.3877787807814457e-17
0.01    1.734723475976807e-18
0   5e-324
(1000 + 1.5e-12) - 1000
1.4779288903810084e-12
(1000 + 1e-14) - 1000
0.0
(1 + 1e-16) - 1
0.0
(1 + 2e-16) - 1
2.220446049250313e-16
(1 + 3e-16) - 1
2.220446049250313e-16
(1 + 4e-16) - 1
4.440892098500626e-16

Non associativité des opérations

0.1 + (0.2 - 0.3)
2.7755575615628914e-17
(0.1 + 0.2) - 0.3
5.551115123125783e-17
1 + 1e-17 - 1
0.0
1 - 1 + 1e-17
1e-17

Annulation catastrophique

Différences finies

def f(x):
    return x**3

def df(x):
    return 3 * x**2

def deriv_fd1(f, x, h):
    return (f(x+h) - f(x)) / h

def deriv_fd2(f, x, h):
    return (f(x+h) - f(x-h)) / (2*h)

x = 1
h = np.logspace(-16, -1, 101)
err1 = abs(deriv_fd1(f, x, h) - df(x))
err2 = abs(deriv_fd2(f, x, h) - df(x))

plt.figure(figsize=(12, 6))
plt.loglog(h, err1)
plt.loglog(h, err2)
plt.legend(['Ordre 1', 'Ordre 2'])
plt.xlabel('h')
plt.ylabel('erreurr');

png

f(x)
1
f(x + 1e-9) # (1 + 1e-9)^3 = 1 + 3e-9 + 3e-18 + 1e-27
1.0000000030000002
f(x + 1e-9) - f(x - 1e-9) # = 6e-9 + 2e-27
6.000000163375319e-09
f(x + 1e-9) - f(x - 1e-9) - 6e-9 # = 2e-27
1.6337531864689604e-16
(f(x + 1e-9) - f(x - 1e-9)) / 2e-9 # = 3 + 1e-18
3.0000000816876593

Calcul des racines d’un polynôme du second degré

a = 1.
b = 1e4
c = 1.
# Calcul des racines
delta = b*b - 4*a*c
x = (-b - math.sqrt(delta)) / (2*a)
y = (-b + math.sqrt(delta)) / (2*a)
print("x = {}".format(x))
print("y = {}".format(y))
x = -9999.999899999999
y = -0.00010000000111176632
# Vérification
print("ax² + bx + c = {}".format(a*x*x + b*x + c))
print("ay² + by + c = {}".format(a*y*y + b*y + c))
ax² + bx + c = 0.0
ay² + by + c = -1.1176630732023796e-09
b*b
100000000.0
a*c
1.0
math.sqrt(b*b - 4*a*c)
9999.999799999998

$y=(-b+\sqrt{\delta})/(2*a)$ est mal calculé (soustraction de $2$ grands nombres proches, mais $x$ est bien calculé.

On sait que $x \times y = c / a$, d’où :

y = c / (x*a)
print("y = {}".format(y))
y = -0.00010000000100000001
# Vérification
print("ax² + bx + c = {}".format(a*x*x + b*x + c))
print("ay² + by + c = {}".format(a*y*y + b*y + c))
ax² + bx + c = 0.0
ay² + by + c = 0.0

Exercice : envisager tous les cas possibles pour le choix de a, b et c : l’écriture d’un programme numériquement robuste pour le calcul des racines d’un trinôme du second degré est loin d’être simple.

Erreurs d’arrondi

Calcul des termes d’une suite :

Calcul de $u_{n+1} = 4 u_n - 1$ avec $u_0 = 1/3$

x = 1/3

for i in range(50):
    x = 4*x - 1
    print(x)
0.33333333333333326
0.33333333333333304
0.33333333333333215
0.3333333333333286
0.3333333333333144
0.33333333333325754
0.33333333333303017
0.3333333333321207
0.3333333333284827
0.3333333333139308
0.3333333332557231
0.3333333330228925
0.3333333320915699
0.3333333283662796
0.3333333134651184
0.33333325386047363
0.33333301544189453
0.3333320617675781
0.3333282470703125
0.33331298828125
0.333251953125
0.3330078125
0.33203125
0.328125
0.3125
0.25
0.0
-1.0
-5.0
-21.0
-85.0
-341.0
-1365.0
-5461.0
-21845.0
-87381.0
-349525.0
-1398101.0
-5592405.0
-22369621.0
-89478485.0
-357913941.0
-1431655765.0
-5726623061.0
-22906492245.0
-91625968981.0
-366503875925.0
-1466015503701.0
-5864062014805.0
-23456248059221.0

Calcul de $u_{n+1} = 3 u_n - 1$ avec $u_0 = 1/2$

x = 1/2

for i in range(50):
    x = 3*x - 1
    print(x)
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5

Problème mal conditionné

Résoudre un système linéaire $Ax = b$ avec $A$ quelconque

N = 40
A = np.random.rand(N, N)
b = np.arange(1, N+1)
x = np.linalg.solve(A, b)
x
array([  4.33620564,   1.62186228, -27.19954756,   4.36455062,
       -24.60731709, -11.28670069, -15.41964884,  12.18656733,
         0.45112402,  -3.87431386, -10.57541072,  -0.23093873,
         3.62740722,   8.90955365,   8.60412767,   7.34031902,
         5.9230333 ,   3.42822255,   1.41022622,  -6.54389414,
       -10.00094676,  -3.77363095,   7.87430301, -26.48354662,
        13.97865685,  27.29620051,   5.54632052,  -8.55102887,
         5.04585292,  -3.29214344, -21.88860293,   6.39257995,
        -1.94197313,   9.35981911,  25.8363687 ,  -3.36127592,
        15.9989246 ,  16.36192944, -24.42661185,  43.72604007])
np.max(np.abs(A @ x - b))
4.973799150320701e-14
np.linalg.cond(A)
584.8458633271287

Résoudre un système linéaire $Ax = b$ avec la matrice de Hilbert

$$A_{i,j}=\frac{1}{i+j-1}.$$

N = 40
A = np.fromfunction(lambda i, j: 1 / (i+j+1), (N, N)) # i+j+1 car i et j commencent à 0 en Python
b = np.arange(1, N+1)
x = np.linalg.solve(A, b)
x
array([ 4.87637519e+03, -7.72593266e+05,  3.03428996e+07, -5.17019809e+08,
        4.75675321e+09, -2.63265691e+10,  9.20658875e+10, -2.02259559e+11,
        2.50344415e+11, -9.34102481e+10, -9.14799063e+10, -1.69941076e+11,
        6.49204839e+11, -4.12056938e+11, -1.62136188e+11, -1.60962818e+10,
        2.02941397e+11,  8.08402238e+10, -1.13280864e+11,  2.25134855e+11,
       -1.83742135e+11,  1.35707012e+11, -6.90181450e+11,  6.03023373e+11,
       -5.93352918e+11,  6.69302159e+11,  1.90231397e+11, -5.04236249e+10,
       -4.17557593e+10, -4.54270557e+11,  1.89362489e+11, -1.20319327e+11,
       -2.92749525e+11,  2.34680270e+11,  1.96469712e+11,  1.97768878e+11,
        6.60265142e+11, -1.48402608e+12,  5.98386259e+11,  1.78115288e+10])
np.max(np.abs(A @ x - b))
2.288818359375e-05
np.linalg.cond(A)
9.220953338051718e+18

Résoudre un système linéaire $Ax = b$ avec la matrice du Laplacien 1D

Résoudre $$\Delta u = 0$$ avec $$\Delta u(x) \approx \frac{u(x-h) - 2u(x) + u(x+h)}{h^2}.$$

N = 40
A = np.diag(np.ones(N-1), 1) + np.diag(np.ones(N-1), -1) - 2 * np.eye(N, N)
b = np.arange(1, N+1)
x = np.linalg.solve(A, b)
x
array([ -280.,  -559.,  -836., -1110., -1380., -1645., -1904., -2156.,
       -2400., -2635., -2860., -3074., -3276., -3465., -3640., -3800.,
       -3944., -4071., -4180., -4270., -4340., -4389., -4416., -4420.,
       -4400., -4355., -4284., -4186., -4060., -3905., -3720., -3504.,
       -3256., -2975., -2660., -2310., -1924., -1501., -1040.,  -540.])
np.max(np.abs(A @ x - b))
1.8189894035458565e-12
np.linalg.cond(A)
680.6170700217184