/build/cargo-vendor-dir/libm-0.2.15/src/math/generic/scalbn.rs

Source
use crate::support::{CastFrom, CastInto, Float, IntTy, MinInt};

/// Scale the exponent.
///
/// From N3220:
///
/// > The scalbn and scalbln functions compute `x * b^n`, where `b = FLT_RADIX` if the return type
/// > of the function is a standard floating type, or `b = 10` if the return type of the function
/// > is a decimal floating type. A range error occurs for some finite x, depending on n.
/// >
/// > [...]
/// >
/// > * `scalbn(±0, n)` returns `±0`.
/// > * `scalbn(x, 0)` returns `x`.
/// > * `scalbn(±∞, n)` returns `±∞`.
/// >
/// > If the calculation does not overflow or underflow, the returned value is exact and
/// > independent of the current rounding direction mode.
#[inline]
pub fn scalbn<F: Float>(mut x: F, mut n: i32) -> F
where
    u32: CastInto<F::Int>,
    F::Int: CastFrom<i32>,
    F::Int: CastFrom<u32>,
{
    let zero = IntTy::<F>::ZERO;

    // Bits including the implicit bit
    let sig_total_bits = F::SIG_BITS + 1;

    // Maximum and minimum values when biased
    let exp_max = F::EXP_MAX;
    let exp_min = F::EXP_MIN;

    // 2 ^ Emax, maximum positive with null significand (0x1p1023 for f64)
    let f_exp_max = F::from_parts(false, F::EXP_BIAS << 1, zero);

    // 2 ^ Emin, minimum positive normal with null significand (0x1p-1022 for f64)
    let f_exp_min = F::from_parts(false, 1, zero);

    // 2 ^ sig_total_bits, moltiplier to normalize subnormals (0x1p53 for f64)
    let f_pow_subnorm = F::from_parts(false, sig_total_bits + F::EXP_BIAS, zero);

    /*
     * The goal is to multiply `x` by a scale factor that applies `n`. However, there are cases
     * where `2^n` is not representable by `F` but the result should be, e.g. `x = 2^Emin` with
     * `n = -EMin + 2` (one out of range of 2^Emax). To get around this, reduce the magnitude of
     * the final scale operation by prescaling by the max/min power representable by `F`.
     */

    if n > exp_max {
        // Worse case positive `n`: `x`  is the minimum subnormal value, the result is `F::MAX`.
        // This can be reached by three scaling multiplications (two here and one final).
        debug_assert!(-exp_min + F::SIG_BITS as i32 + exp_max <= exp_max * 3);

        x *= f_exp_max;
        n -= exp_max;
        if n > exp_max {
            x *= f_exp_max;
            n -= exp_max;
            if n > exp_max {
                n = exp_max;
            }
        }
    } else if n < exp_min {
        // When scaling toward 0, the prescaling is limited to a value that does not allow `x` to
        // go subnormal. This avoids double rounding.
        if F::BITS > 16 {
            // `mul` s.t. `!(x * mul).is_subnormal() ∀ x`
            let mul = f_exp_min * f_pow_subnorm;
            let add = -exp_min - sig_total_bits as i32;

            // Worse case negative `n`: `x`  is the maximum positive value, the result is `F::MIN`.
            // This must be reachable by three scaling multiplications (two here and one final).
            debug_assert!(-exp_min + F::SIG_BITS as i32 + exp_max <= add * 2 + -exp_min);

            x *= mul;
            n += add;

            if n < exp_min {
                x *= mul;
                n += add;

                if n < exp_min {
                    n = exp_min;
                }
            }
        } else {
            // `f16` is unique compared to other float types in that the difference between the
            // minimum exponent and the significand bits (`add = -exp_min - sig_total_bits`) is
            // small, only three. The above method depend on decrementing `n` by `add` two times;
            // for other float types this works out because `add` is a substantial fraction of
            // the exponent range. For `f16`, however, 3 is relatively small compared to the
            // exponent range (which is 39), so that requires ~10 prescale rounds rather than two.
            //
            // Work aroudn this by using a different algorithm that calculates the prescale
            // dynamically based on the maximum possible value. This adds more operations per round
            // since it needs to construct the scale, but works better in the general case.
            let add = -(n + sig_total_bits as i32).clamp(exp_min, sig_total_bits as i32);
            let mul = F::from_parts(false, (F::EXP_BIAS as i32 - add) as u32, zero);

            x *= mul;
            n += add;

            if n < exp_min {
                let add = -(n + sig_total_bits as i32).clamp(exp_min, sig_total_bits as i32);
                let mul = F::from_parts(false, (F::EXP_BIAS as i32 - add) as u32, zero);

                x *= mul;
                n += add;

                if n < exp_min {
                    n = exp_min;
                }
            }
        }
    }

    let scale = F::from_parts(false, (F::EXP_BIAS as i32 + n) as u32, zero);
    x * scale
}

Coverage Report

Created: 2025-11-20 14:58

Line	Count	Source
1		use crate::support::{CastFrom, CastInto, Float, IntTy, MinInt};
2
3		/// Scale the exponent.
4		///
5		/// From N3220:
6		///
7		/// > The scalbn and scalbln functions compute `x * b^n`, where `b = FLT_RADIX` if the return type
8		/// > of the function is a standard floating type, or `b = 10` if the return type of the function
9		/// > is a decimal floating type. A range error occurs for some finite x, depending on n.
10		/// >
11		/// > [...]
12		/// >
13		/// > * `scalbn(±0, n)` returns `±0`.
14		/// > * `scalbn(x, 0)` returns `x`.
15		/// > * `scalbn(±∞, n)` returns `±∞`.
16		/// >
17		/// > If the calculation does not overflow or underflow, the returned value is exact and
18		/// > independent of the current rounding direction mode.
19		#[inline]
20	0	pub fn scalbn<F: Float>(mut x: F, mut n: i32) -> F
21	0	where
22	0	u32: CastInto<F::Int>,
23	0	F::Int: CastFrom<i32>,
24	0	F::Int: CastFrom<u32>,
25	0	{
26	0	let zero = IntTy::<F>::ZERO;
27	0
28	0	// Bits including the implicit bit
29	0	let sig_total_bits = F::SIG_BITS + 1;
30	0
31	0	// Maximum and minimum values when biased
32	0	let exp_max = F::EXP_MAX;
33	0	let exp_min = F::EXP_MIN;
34	0
35	0	// 2 ^ Emax, maximum positive with null significand (0x1p1023 for f64)
36	0	let f_exp_max = F::from_parts(false, F::EXP_BIAS << 1, zero);
37	0
38	0	// 2 ^ Emin, minimum positive normal with null significand (0x1p-1022 for f64)
39	0	let f_exp_min = F::from_parts(false, 1, zero);
40	0
41	0	// 2 ^ sig_total_bits, moltiplier to normalize subnormals (0x1p53 for f64)
42	0	let f_pow_subnorm = F::from_parts(false, sig_total_bits + F::EXP_BIAS, zero);
43	0
44	0	/*
45	0	* The goal is to multiply `x` by a scale factor that applies `n`. However, there are cases
46	0	* where `2^n` is not representable by `F` but the result should be, e.g. `x = 2^Emin` with
47	0	* `n = -EMin + 2` (one out of range of 2^Emax). To get around this, reduce the magnitude of
48	0	* the final scale operation by prescaling by the max/min power representable by `F`.
49	0	*/
50	0
51	0	if n > exp_max {
52		// Worse case positive `n`: `x` is the minimum subnormal value, the result is `F::MAX`.
53		// This can be reached by three scaling multiplications (two here and one final).
54	0	debug_assert!(-exp_min + F::SIG_BITS as i32 + exp_max <= exp_max * 3);
55
56	0	x *= f_exp_max;
57	0	n -= exp_max;
58	0	if n > exp_max {
59	0	x *= f_exp_max;
60	0	n -= exp_max;
61	0	if n > exp_max {
62	0	n = exp_max;
63	0	}
64	0	}
65	0	} else if n < exp_min {
66		// When scaling toward 0, the prescaling is limited to a value that does not allow `x` to
67		// go subnormal. This avoids double rounding.
68	0	if F::BITS > 16 {
69		// `mul` s.t. `!(x * mul).is_subnormal() ∀ x`
70	0	let mul = f_exp_min * f_pow_subnorm;
71	0	let add = -exp_min - sig_total_bits as i32;
72	0
73	0	// Worse case negative `n`: `x` is the maximum positive value, the result is `F::MIN`.
74	0	// This must be reachable by three scaling multiplications (two here and one final).
75	0	debug_assert!(-exp_min + F::SIG_BITS as i32 + exp_max <= add * 2 + -exp_min);
76
77	0	x *= mul;
78	0	n += add;
79	0
80	0	if n < exp_min {
81	0	x *= mul;
82	0	n += add;
83	0
84	0	if n < exp_min {
85	0	n = exp_min;
86	0	}
87	0	}
88		} else {
89		// `f16` is unique compared to other float types in that the difference between the
90		// minimum exponent and the significand bits (`add = -exp_min - sig_total_bits`) is
91		// small, only three. The above method depend on decrementing `n` by `add` two times;
92		// for other float types this works out because `add` is a substantial fraction of
93		// the exponent range. For `f16`, however, 3 is relatively small compared to the
94		// exponent range (which is 39), so that requires ~10 prescale rounds rather than two.
95		//
96		// Work aroudn this by using a different algorithm that calculates the prescale
97		// dynamically based on the maximum possible value. This adds more operations per round
98		// since it needs to construct the scale, but works better in the general case.
99	0	let add = -(n + sig_total_bits as i32).clamp(exp_min, sig_total_bits as i32);
100	0	let mul = F::from_parts(false, (F::EXP_BIAS as i32 - add) as u32, zero);
101	0
102	0	x *= mul;
103	0	n += add;
104	0
105	0	if n < exp_min {
106	0	let add = -(n + sig_total_bits as i32).clamp(exp_min, sig_total_bits as i32);
107	0	let mul = F::from_parts(false, (F::EXP_BIAS as i32 - add) as u32, zero);
108	0
109	0	x *= mul;
110	0	n += add;
111	0
112	0	if n < exp_min {
113	0	n = exp_min;
114	0	}
115	0	}
116		}
117	0	}
118
119	0	let scale = F::from_parts(false, (F::EXP_BIAS as i32 + n) as u32, zero);
120	0	x * scale
121	0	}