Démos sur le calcul flottant

Sources

Performances des nombres sous-normaux

Pour tester la représentation en base 2 : https://www.h-schmidt.net/FloatConverter/IEEE754.html

import numpy as np
import time
import math
import matplotlib.pyplot as plt
%matplotlib inline 
# Python utilise des nombres flottants à double précision (64 bits) :
# - signe sur 1 bit
# - mantisse de 53 bits (52 stockés)
# - exposant sur 11 bits (de -1024 à +1023)
e_max = -(1024 - 100)
e_min = -(1024 + 60) # Pour légèrement dépasser la capacité des nombres sous-normaux
N = 100000

duration = np.empty(e_max - e_min + 1)

for e in range(e_min, e_max+1):
    values = (2**e) * np.ones(N)
    tic = time.process_time()
    values *= 3.14
    duration[e - e_min] = time.process_time() - tic;

plt.figure(figsize=(12, 6))
plt.semilogy(range(e_min, e_max+1), duration)
plt.xlabel('exposant')
plt.ylabel('temps');

png

Approximation des réels

a = 10000.0
b = 9999.5
c = 0.1
print("{:.20f}".format(c))
0.10000000000000000555
a1 = a + c
print("{:.20f}".format(a1))
10000.10000000000036379788
print("{:.20f}".format(a1 - b))
0.60000000000036379788
print("{:.20f}".format(0.6))
0.59999999999999997780

Les modes d’arrondi

Le code en C++ :

#include <iostream>
#include <iomanip>
#include <cfenv>

int main()
{
  double x = 1;

  std::cout << std::setprecision(50);

  std::fesetround(FE_TONEAREST); // Par défaut
  std::cout << "Au plus proche:  1/10  =  " << x/10 << std::endl;
  std::cout << "Au plus proche: -1/10  = " << -x/10 << std::endl;
  std::cout << std::endl;

  std::fesetround(FE_DOWNWARD);
  std::cout << "Vers -oo:        1/10  =  " << x/10 << std::endl;
  std::cout << "Vers -oo:       -1/10  = " << -x/10 << std::endl;
  std::cout << std::endl;

  std::fesetround(FE_UPWARD);
  std::cout << "Vers +oo:        1/10  =  " << x/10 << std::endl;
  std::cout << "Vers +oo:       -1/10  = " << -x/10 << std::endl;
  std::cout << std::endl;

  std::fesetround(FE_TOWARDZERO);
  std::cout << "Vers 0:          1/10  =  " << x/10 << std::endl;
  std::cout << "Vers 0:         -1/10  = " << -x/10 << std::endl;
  std::cout << std::endl;

  return 0;
}

Le résultat :

Au plus proche:  1/10  =  0.1000000000000000055511151231257827021181583404541
Au plus proche: -1/10  = -0.1000000000000000055511151231257827021181583404541

Vers -oo:        1/10  =  0.099999999999999991673327315311325946822762489318847
Vers -oo:       -1/10  = -0.10000000000000000555111512312578270211815834045411

Vers +oo:        1/10  =  0.10000000000000000555111512312578270211815834045411
Vers +oo:       -1/10  = -0.099999999999999991673327315311325946822762489318847

Vers 0:          1/10  =  0.099999999999999991673327315311325946822762489318847
Vers 0:         -1/10  = -0.099999999999999991673327315311325946822762489318847

Unit in the Last Place

def ulp(v):
    print("{}\t{}".format(v, v - np.nextafter(v, v-1)))

ulp(1000)
ulp(100)
ulp(10)
ulp(1)
ulp(0.1)
ulp(0.01)
ulp(0)
1000    1.1368683772161603e-13
100 1.4210854715202004e-14
10  1.7763568394002505e-15
1   1.1102230246251565e-16
0.1 1.3877787807814457e-17
0.01    1.734723475976807e-18
0   5e-324
(1000 + 1.5e-12) - 1000
1.4779288903810084e-12
(1000 + 1e-14) - 1000
0.0
(1 + 1e-16) - 1
0.0
(1 + 2e-16) - 1
2.220446049250313e-16
(1 + 3e-16) - 1
2.220446049250313e-16
(1 + 4e-16) - 1
4.440892098500626e-16

Non associativité des opérations

0.1 + (0.2 - 0.3)
2.7755575615628914e-17
(0.1 + 0.2) - 0.3
5.551115123125783e-17
1 + 1e-17 - 1
0.0
1 - 1 + 1e-17
1e-17

Annulation catastrophique

Discriminant

## Decimal floating point values with customizable precision
import decimal
from decimal import Decimal

decimal.getcontext().prec = 300 # Try after with 3 digits

a = Decimal('1.22')
b = Decimal('3.34')
c = Decimal('2.28')

print(f"b² = {b*b}")
print(f"4ac = {Decimal('4') * a * c}")
print(f"b² - 4ac = {b*b - Decimal('4') * a * c}")
b² = 11.1556
4ac = 11.1264
b² - 4ac = 0.0292

Calcul des racines d’un polynôme du second degré

a = 1.
b = 1e4
c = 1.
# Calcul des racines
delta = b*b - 4*a*c
x = (-b - math.sqrt(delta)) / (2*a)
y = (-b + math.sqrt(delta)) / (2*a)
print("x = {}".format(x))
print("y = {}".format(y))
x = -9999.999899999999
y = -0.00010000000111176632
# Vérification
print("ax² + bx + c = {}".format(a*x*x + b*x + c))
print("ay² + by + c = {}".format(a*y*y + b*y + c))
ax² + bx + c = 0.0
ay² + by + c = -1.1176630732023796e-09
b*b
100000000.0
a*c
1.0
math.sqrt(b*b - 4*a*c)
9999.999799999998

$y=\frac{-b+\sqrt{\delta}}{2a}$ est mal calculé (soustraction de $2$ grands nombres proches, mais $x$ est bien calculé.

On sait que $x \times y = c / a$, d’où :

y = c / (x*a)
print("y = {}".format(y))
y = -0.00010000000100000001
# Vérification
print("ax² + bx + c = {}".format(a*x*x + b*x + c))
print("ay² + by + c = {}".format(a*y*y + b*y + c))
ax² + bx + c = 0.0
ay² + by + c = 0.0

Exercice : envisager tous les cas possibles pour le choix de a, b et c : l’écriture d’un programme numériquement robuste pour le calcul des racines d’un trinôme du second degré est loin d’être simple.

Aire d’un triangle

Aire $A$ d’un triangle de longuers $a$, $b$ et $c$ :

$$A = \sqrt{s (s - a) (s - b) (s - c)}$$

avec

$$s = \frac{a + b + c}{2}$$

import decimal
from decimal import Decimal

decimal.getcontext().prec = 3 # Decimal digits

a = Decimal('9.0')
b = c = Decimal('4.55')

s = (a + b + c) / Decimal('2')
A = (s * (s - a) * (s - b) * (s - c)).sqrt()
print(f"s = {s}")
print(f"A = {A}")
s = 9.1
A = 4.34
# Kahan 1986
# Ensure a >= b >= c or rename variables accordingly
decimal.getcontext().prec = 3

a = Decimal('9.0')
b = c = Decimal('4.55')

A = (   (a + (b + c))
      * (c - (a - b))
      * (c + (a - b))
      * (a + (b - c))).sqrt() / Decimal('4')
print(A)
3.02

Différences finies

def f(x):
    return x**3

def df(x):
    return 3 * x**2

def deriv_fd1(f, x, h):
    return (f(x+h) - f(x)) / h

def deriv_fd2(f, x, h):
    return (f(x+h) - f(x-h)) / (2*h)

x = 1
h = np.logspace(-16, -1, 101)
err1 = abs(deriv_fd1(f, x, h) - df(x))
err2 = abs(deriv_fd2(f, x, h) - df(x))

plt.figure(figsize=(12, 6))
plt.loglog(h, err1)
plt.loglog(h, err2)
plt.legend(['Ordre 1', 'Ordre 2'])
plt.xlabel('h')
plt.ylabel('erreurr');

png

f(x) # = 1
1
f(x + 1e-9) # (1 + 1e-9)^3 = 1 + 3e-9 + 3e-18 + 1e-27
1.0000000030000002
f(x + 1e-9) - f(x - 1e-9) # = 6e-9 + 2e-27
6.000000163375319e-09
f(x + 1e-9) - f(x - 1e-9) - 6e-9 # = 2e-27
1.6337531864689604e-16
(f(x + 1e-9) - f(x - 1e-9)) / 2e-9 # = 3 + 1e-18
3.0000000816876593

Erreurs d’arrondi

Calcul des termes d’une suite :

Calcul de $u_{n+1} = 4 u_n - 1$ avec $u_0 = 1/3$

x = 1/3

for i in range(50):
    x = 4*x - 1
    print(x)
0.33333333333333326
0.33333333333333304
0.33333333333333215
0.3333333333333286
0.3333333333333144
0.33333333333325754
0.33333333333303017
0.3333333333321207
0.3333333333284827
0.3333333333139308
0.3333333332557231
0.3333333330228925
0.3333333320915699
0.3333333283662796
0.3333333134651184
0.33333325386047363
0.33333301544189453
0.3333320617675781
0.3333282470703125
0.33331298828125
0.333251953125
0.3330078125
0.33203125
0.328125
0.3125
0.25
0.0
-1.0
-5.0
-21.0
-85.0
-341.0
-1365.0
-5461.0
-21845.0
-87381.0
-349525.0
-1398101.0
-5592405.0
-22369621.0
-89478485.0
-357913941.0
-1431655765.0
-5726623061.0
-22906492245.0
-91625968981.0
-366503875925.0
-1466015503701.0
-5864062014805.0
-23456248059221.0

Calcul de $u_{n+1} = 3 u_n - 1$ avec $u_0 = 1/2$

x = 1/2

for i in range(50):
    x = 3*x - 1
    print(x)
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5

Epsilon

eps = 1.
while 1. + eps > 1.:
    eps = eps / 2.

print(eps*2.)
2.220446049250313e-16

Problème mal conditionné

Résoudre un système linéaire $Ax = b$ avec $A$ quelconque

N = 40
A = np.random.rand(N, N)
b = np.arange(1, N+1)
x = np.linalg.solve(A, b)
x
array([ -73.3868327 ,   35.67432243,   77.1533654 ,  175.40765276,
        170.77028239,  -69.44250334,  -57.86664728,   87.09067356,
        107.39445647,  -23.70290555, -113.56079487,  -95.410451  ,
        127.14302336,   29.15842657,   86.02583633,  -45.85817261,
        -37.91462834,   57.19365225, -157.04364942, -292.51573544,
        128.40473938,   12.44418047,   47.38783928,  121.50401725,
        -99.9062809 ,   52.35430073,   38.93485747,  -11.905154  ,
        -45.82993309, -119.28203053,   77.02136612,  142.27016536,
         36.02903745,  119.67430918,  -88.68281304, -192.38465227,
       -146.7645299 ,   10.72398435,  153.88318391, -170.95344894])
np.max(np.abs(A @ x - b))
3.410605131648481e-13
np.linalg.cond(A)
2135.965665762848

Résoudre un système linéaire $Ax = b$ avec la matrice de Hilbert

$$A_{i,j}=\frac{1}{i+j-1}.$$

N = 40
A = np.fromfunction(lambda i, j: 1 / (i+j+1), (N, N)) # i+j+1 car i et j commencent à 0 en Python
b = np.arange(1, N+1)
x = np.linalg.solve(A, b)
x
array([-7.27518147e+02,  3.09088878e+05, -2.13324442e+07,  5.51846356e+08,
       -7.10681820e+09,  5.19510679e+10, -2.29419337e+11,  6.23199316e+11,
       -1.01930959e+12,  9.54242503e+11, -5.86427620e+11,  5.79503577e+11,
       -4.05075650e+11, -4.78056373e+11,  4.45454231e+11,  5.51352795e+11,
        2.82695788e+10, -1.20602471e+12,  3.82971910e+11,  3.89205587e+11,
        1.07165357e+12, -1.55131334e+12, -3.17806967e+11,  4.99598659e+11,
       -2.70349873e+09,  6.74572887e+11,  7.08616235e+11, -1.62131350e+12,
        5.41932112e+11, -1.08743975e+11, -6.31695374e+11,  1.86068155e+11,
        9.14453952e+11, -5.05415524e+11, -1.16151764e+10,  1.52049700e+12,
       -2.25903886e+12,  2.73421652e+11,  8.69087434e+11, -3.25516595e+11])
np.max(np.abs(A @ x - b))
3.0517578125e-05
np.linalg.cond(A)
6.155603641678775e+18

Résoudre un système linéaire $Ax = b$ avec la matrice du Laplacien 1D

Résoudre $$\Delta u = 0$$ avec $$\Delta u(x) \approx \frac{u(x-h) - 2u(x) + u(x+h)}{h^2}.$$

N = 40
A = np.diag(np.ones(N-1), 1) + np.diag(np.ones(N-1), -1) - 2 * np.eye(N, N)
b = np.arange(1, N+1)
x = np.linalg.solve(A, b)
x
array([ -280.,  -559.,  -836., -1110., -1380., -1645., -1904., -2156.,
       -2400., -2635., -2860., -3074., -3276., -3465., -3640., -3800.,
       -3944., -4071., -4180., -4270., -4340., -4389., -4416., -4420.,
       -4400., -4355., -4284., -4186., -4060., -3905., -3720., -3504.,
       -3256., -2975., -2660., -2310., -1924., -1501., -1040.,  -540.])
np.max(np.abs(A @ x - b))
9.094947017729282e-13
np.linalg.cond(A)
680.6170700217226