
Half-precision floating point in Java


Is there a Java library that can perform computations on IEEE 754 half-precision numbers or convert them to and from double precision?

Either of these approaches would be suitable:

  • Keep the numbers in half-precision format and compute on them using integer arithmetic and bit-twiddling (as MicroFloat does for single and double precision)

  • Perform all computations in single or double precision, converting to/from half precision for transmission (in which case what I need are well-tested conversion functions.)


Edit: the conversion needs to be 100% accurate - there are a lot of NaNs, infinities and subnormals in the input files.


Related question, but for JavaScript: Decompressing Half Precision Floats in Javascript
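
Note: newer JDKs provide built-in conversions for this. A minimal sketch, assuming a JDK that includes Float.float16ToFloat(short) and Float.floatToFloat16(float) (added in recent JDK releases, around JDK 20), which handle NaNs, infinities and subnormals:

    public class Half16ViaJdk {
        public static void main(String[] args) {
            short halfBits = (short) 0x3C00;             // 1.0 in binary16
            float f = Float.float16ToFloat(halfBits);    // half -> float, exact for every input
            double d = f;                                // widening float -> double is always exact
            short back = Float.floatToFloat16(f);        // float -> half, rounds to nearest even
            System.out.println(f + " " + d + " " + Integer.toHexString(back & 0xffff));
        }
    }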

5 Answers

  • 51

    I created a java class named HalfPrecisionFloat that uses x4u's solution. The class has convenience methods and error checking. It goes a step further and has methods that return a Double and a Float from the two-byte half-precision value.

    Hopefully this will help someone.


    import java.nio.ByteBuffer;
    
    /**
     * Accepts various forms of a half-precision (2 byte) floating point number
     * and contains methods to convert it to a full-precision Float or Double
     * instance.
     * <p>
     * This implementation was inspired by x4u, a user contributing
     * to stackoverflow.com
     * (https://stackoverflow.com/users/237321/x4u).
     *
     * @author dougestep
     */
    public class HalfPrecisionFloat {
        private short halfPrecision;
        private Float fullPrecision;
    
        /**
         * Creates an instance of the class from the supplied
         * byte array.  The byte array must be exactly two bytes in length.
         *
         * @param bytes the two-byte byte array.
         */
        public HalfPrecisionFloat(byte[] bytes) {
            if (bytes.length != 2) {
                throw new IllegalArgumentException("The supplied byte array " +
                  "must be exactly two bytes in length");
            }
    
            final ByteBuffer buffer = ByteBuffer.wrap(bytes);
            this.halfPrecision = buffer.getShort();
        }
    
        /**
         * Creates an instance of this class from the supplied short number.
         *
         * @param number the number defined as a short.
         */
        public HalfPrecisionFloat(final short number) {
            this.halfPrecision = number;
            this.fullPrecision = toFullPrecision();
        }
    
        /**
         * Creates an instance of this class from the supplied 
         * full-precision floating point number.
         *
         * @param number the float number.
         */
        public HalfPrecisionFloat(final float number) {
            if (number > Short.MAX_VALUE) {
                throw new IllegalArgumentException("The supplied float is too "
                  + "large for a two byte representation");
            }
            if (number < Short.MIN_VALUE) {
                throw new IllegalArgumentException("The supplied float is too "
                  + "small for a two byte representation");
            }
    
            final int val = fromFullPrecision(number);
            this.halfPrecision = (short) val;
            this.fullPrecision = number;
        }
    
        /**
         * Returns the half-precision float as a number defined as a short.
         *
         * @return the short.
         */
        public short getHalfPrecisionAsShort() {
            return halfPrecision;
        }
    
        /**
         * Returns a full-precision floating-point number from the
         * half-precision value assigned on this instance.
         *
         * @return the full-precision floating-point number.
         */
        public float getFullFloat() {
            if (fullPrecision == null) {
                fullPrecision = toFullPrecision();
            }
            return fullPrecision;
        }
    
        /**
         * Returns a full-precision double floating point number from the 
         * half-precision value assigned on this instance.
         *
         * @return the full-precision double floating-point number.
         */
        public double getFullDouble() {
            return new Double(getFullFloat());
        }
    
        /**
         * Returns the full-precision float number from the half-precision 
         * value assigned on this instance.
         *
         * @return the full-precision floating-point number.
         */
        private float toFullPrecision() {
            int mantissa = halfPrecision & 0x03ff;
            int exponent = halfPrecision & 0x7c00;

            if (exponent == 0x7c00) {
                exponent = 0x3fc00;
            } else if (exponent != 0) {
                exponent += 0x1c000;
                if (mantissa == 0 && exponent > 0x1c400) {
                    return Float.intBitsToFloat(
                      (halfPrecision & 0x8000) << 16 | exponent << 13 | 0x3ff);
                }
            } else if (mantissa != 0) {
                exponent = 0x1c400;
                do {
                    mantissa <<= 1;
                    exponent -= 0x400;
                } while ((mantissa & 0x400) == 0);
                mantissa &= 0x3ff;
            }

            return Float.intBitsToFloat(
             (halfPrecision & 0x8000) << 16 | (exponent | mantissa) << 13);
        }
    
        /**
         * Returns the half-precision representation of the supplied
         * full-precision floating-point number.
         *
         * @param number the full-precision floating-point number.
         * @return the half-precision bits as an integer.
         */
        private int fromFullPrecision(final float number) {
            int fbits = Float.floatToIntBits(number);
            int sign = fbits >>> 16 & 0x8000;

            int val = (fbits & 0x7fffffff) + 0x1000;

            if (val >= 0x47800000) {
                if ((fbits & 0x7fffffff) >= 0x47800000) {
                    if (val < 0x7f800000) {
                        return sign | 0x7c00;
                    }
                    return sign | 0x7c00 | (fbits & 0x007fffff) >>> 13;
                }
                return sign | 0x7bff;
            }
            if (val >= 0x38800000) {
                return sign | val - 0x38000000 >>> 13;
            }
            if (val < 0x33000000) {
                return sign;
            }
            val = (fbits & 0x7fffffff) >>> 23;
            return sign | ((fbits & 0x7fffff | 0x800000)
                    + (0x800000 >>> val - 102) >>> 126 - val);
        }
    }

    Here is the unit test:

    import org.junit.Assert;
    import org.junit.Test;
    
    import java.nio.ByteBuffer;
    
    public class TestHalfPrecision {
    
      private byte[] simulateBytes(final float fullPrecision) {
        HalfPrecisionFloat halfFloat = new HalfPrecisionFloat(fullPrecision);
        short halfShort = halfFloat.getHalfPrecisionAsShort();
    
        ByteBuffer buffer = ByteBuffer.allocate(2);
        buffer.putShort(halfShort);
        return buffer.array();
      }
    
      @Test
      public void testHalfPrecisionToFloatApproach() {
        final float startingValue = 1.2f;
        final float closestValue = 1.2001953f;
        final short shortRepresentation = (short) 15565;
    
        byte[] bytes = simulateBytes(startingValue);
        HalfPrecisionFloat halfFloat = new HalfPrecisionFloat(bytes);
        final float retFloat = halfFloat.getFullFloat();
        Assert.assertEquals(new Float(closestValue), new Float(retFloat));
    
        HalfPrecisionFloat otherWay = new HalfPrecisionFloat(retFloat);
        final short shrtValue = otherWay.getHalfPrecisionAsShort();
        Assert.assertEquals(new Short(shortRepresentation), new Short(shrtValue));
    
        HalfPrecisionFloat backAgain = new HalfPrecisionFloat(shrtValue);
        final float backFlt = backAgain.getFullFloat();
        Assert.assertEquals(new Float(closestValue), new Float(backFlt));
    
        HalfPrecisionFloat dbl = new HalfPrecisionFloat(startingValue);
        final double retDbl = dbl.getFullDouble();
        Assert.assertEquals(new Double(startingValue), new Double(retDbl));
      }
    
      @Test(expected = IllegalArgumentException.class)
      public void testInvalidByteArray() {
        ByteBuffer buffer = ByteBuffer.allocate(4);
        buffer.putFloat(Float.MAX_VALUE);
        byte[] bytes = buffer.array();
    
        new HalfPrecisionFloat(bytes);
      }
    
      @Test(expected = IllegalArgumentException.class)
      public void testInvalidMaxFloat() {
        new HalfPrecisionFloat(Float.MAX_VALUE);
      }
    
      @Test(expected = IllegalArgumentException.class)
      public void testInvalidMinFloat() {
        new HalfPrecisionFloat(-35000);
      }
    
      @Test
      public void testCreateWithShort() {
        HalfPrecisionFloat sut = new HalfPrecisionFloat(Short.MAX_VALUE);
        Assert.assertEquals(Short.MAX_VALUE, sut.getHalfPrecisionAsShort());
      }
    }
    
  • 1

    I was interested in small positive floats, so I built this variant with 12 bits mantissa, no sign bit, and 4 bits exponent with bias 15, so that it can represent numbers between 0 and 1.00 (exclusive) very well. It has 2 extra bits of resolution in the mantissa, but the same exponents at the low end.

    public static float toFloat(int hbits) {
        int mant = hbits & 0x0fff;            // 12 bits mantissa
        int exp =  (hbits & 0xf000) >>> 12;   // 4 bits exponent
        if (exp == 0xf) {
            exp = 0xff;
        } else {
            if (exp != 0) { // normal value
                exp += 127 - 15;
            } else { // subnormal value
                if (mant != 0) { // not zero
                    exp += 127 - 15;
                // make it normal
                    exp++;
                    do {
                        mant <<= 1;
                        exp--;
                    } while ((mant & 0x1000) == 0);
                    mant &= 0x0fff;
                }
            }
        }
        return Float.intBitsToFloat(exp << 23 | mant << 11);
    }
    
    public static int fromFloat(float fval) {
        int fbits = Float.floatToIntBits( fval );
        int val = ( fbits & 0x7fffffff ) + 0x400; // rounded value
        if( val < 0x32000000 )                // too small for subnormal or negative
            return 0;                         // becomes 0
    
        if( val >= 0x47800000 )               // might be or become NaN/Inf
        {                                     // avoid Inf due to rounding
            if( ( fbits & 0x7fffffff ) >= 0x47800000 )
            {                                 // is or must become NaN/Inf
                if( val < 0x7f800000 )        // was value but too large
                    return 0xf000;            // make it +/-Inf
                return 0xf000 |               // remains +/-Inf or NaN
                    ( fbits & 0x007fffff ) >>> 11; // keep NaN (and Inf) bits
            }
            return 0x7fff;                    // unrounded not quite Inf
        }
        if( val >= 0x38800000 )               // remains normalized value
            return val - 0x38000000 >>> 11;   // exp - 127 + 15
    
        val = ( fbits & 0x7fffffff ) >>> 23;  // tmp exp for subnormal calc
        return ( ( fbits & 0x7f_ffff | 0x80_0000 ) // add subnormal bit
                + ( 0x800000 >>> val - 100 )     // round depending on cut off
                >>> 124 - val );   // div by 2^(1-(exp-127+15)) and >> 11 | exp=0
    }
    

    The test gives:

    Smallest subnormal float      : 0.0000000149
    Largest  subnormal float      : 0.0000610203
    Smallest    normal float      : 0.0000610352
    Smallest    normal float + ups: 0.0000610501
    E=1, M=fff (max)              : 0.0001220554
    Largest     normal float      : 0.0078115463
    

    Normals:

    0.9990000129  => 3f7fbe77 => eff8  => 0.9990234375  | error: 0.002%
    0.8991000056  => 3f662b6b => ecc5  => 0.8990478516  | error: 0.006%
    0.8091899753  => 3f4f2713 => e9e5  => 0.8092041016  | error: 0.002%
    0.7282709479  => 3f3a6ff7 => e74e  => 0.7282714844  | error: 0.000%
    0.6554438472  => 3f27cb2b => e4f9  => 0.6553955078  | error: 0.007%
    0.5898994207  => 3f1703a6 => e2e0  => 0.5898437500  | error: 0.009%
    0.5309094787  => 3f07e9af => e0fd  => 0.5308837891  | error: 0.005%
    0.4778185189  => 3ef4a4a1 => de95  => 0.4778442383  | error: 0.005%
    0.4300366640  => 3edc2dc4 => db86  => 0.4300537109  | error: 0.004%
    0.3870329857  => 3ec62930 => d8c5  => 0.3870239258  | error: 0.002%
    0.3483296633  => 3eb25844 => d64b  => 0.3483276367  | error: 0.001%
    0.3134966791  => 3ea082a3 => d410  => 0.3134765625  | error: 0.006%
    0.2821469903  => 3e907592 => d20f  => 0.2821655273  | error: 0.007%
    0.2539322972  => 3e82036a => d040  => 0.2539062500  | error: 0.010%
    0.2285390645  => 3e6a0625 => cd41  => 0.2285461426  | error: 0.003%
    0.2056851536  => 3e529f21 => ca54  => 0.2056884766  | error: 0.002%
    0.1851166338  => 3e3d8f37 => c7b2  => 0.1851196289  | error: 0.002%
    0.1666049659  => 3e2a9a7e => c553  => 0.1665954590  | error: 0.006%
    0.1499444693  => 3e198b0b => c331  => 0.1499328613  | error: 0.008%
    0.1349500120  => 3e0a3056 => c146  => 0.1349487305  | error: 0.001%
    0.1214550063  => 3df8bd67 => bf18  => 0.1214599609  | error: 0.004%
    0.1093095019  => 3ddfdda9 => bbfc  => 0.1093139648  | error: 0.004%
    0.0983785465  => 3dc97ab1 => b92f  => 0.0983734131  | error: 0.005%
    0.0885406882  => 3db554d2 => b6ab  => 0.0885467529  | error: 0.007%
    0.0796866193  => 3da332bd => b466  => 0.0796813965  | error: 0.007%
    0.0717179552  => 3d92e0dd => b25c  => 0.0717163086  | error: 0.002%
    0.0645461604  => 3d8430c7 => b086  => 0.0645446777  | error: 0.002%
    0.0580915436  => 3d6df166 => adbe  => 0.0580902100  | error: 0.002%
    0.0522823893  => 3d56260f => aac5  => 0.0522842407  | error: 0.004%
    0.0470541492  => 3d40bbda => a817  => 0.0470504761  | error: 0.008%
    0.0423487313  => 3d2d75dd => a5af  => 0.0423507690  | error: 0.005%
    0.0381138586  => 3d1c1d47 => a384  => 0.0381164551  | error: 0.007%
    0.0343024731  => 3d0c80c0 => a190  => 0.0343017578  | error: 0.002%
    0.0308722258  => 3cfce7c0 => 9f9d  => 0.0308723450  | error: 0.000%
    0.0277850032  => 3ce39d60 => 9c74  => 0.0277862549  | error: 0.005%
    0.0250065029  => 3cccda70 => 999b  => 0.0250053406  | error: 0.005%
    0.0225058515  => 3cb85e31 => 970c  => 0.0225067139  | error: 0.004%
    0.0202552658  => 3ca5ee5f => 94be  => 0.0202560425  | error: 0.004%
    0.0182297379  => 3c955688 => 92ab  => 0.0182304382  | error: 0.004%
    0.0164067633  => 3c86677a => 90cd  => 0.0164070129  | error: 0.002%
    0.0147660868  => 3c71ed75 => 8e3e  => 0.0147666931  | error: 0.004%
    0.0132894777  => 3c59bc1c => 8b38  => 0.0132904053  | error: 0.007%
    0.0119605297  => 3c43f619 => 887f  => 0.0119609833  | error: 0.004%
    0.0107644768  => 3c305d7d => 860c  => 0.0107650757  | error: 0.006%
    0.0096880291  => 3c1eba8a => 83d7  => 0.0096874237  | error: 0.006%
    0.0087192263  => 3c0edb16 => 81db  => 0.0087184906  | error: 0.008%
    0.0078473035  => 3c0091fa => 8012  => 0.0078468323  | error: 0.006%
    0.0070625730  => 3be76d28 => 7cee  => 0.0070629120  | error: 0.005%
    0.0063563157  => 3bd048a4 => 7a09  => 0.0063562393  | error: 0.001%
    0.0057206838  => 3bbb7493 => 776f  => 0.0057210922  | error: 0.007%
    0.0051486152  => 3ba8b5b7 => 7517  => 0.0051488876  | error: 0.005%
    0.0046337536  => 3b97d6be => 72fb  => 0.0046339035  | error: 0.003%
    0.0041703782  => 3b88a7ab => 7115  => 0.0041704178  | error: 0.001%
    0.0037533403  => 3b75fa9a => 6ebf  => 0.0037531853  | error: 0.004%
    0.0033780062  => 3b5d618a => 6bac  => 0.0033779144  | error: 0.003%
    0.0030402055  => 3b473e2f => 68e8  => 0.0030403137  | error: 0.004%
    0.0027361847  => 3b335190 => 666a  => 0.0027360916  | error: 0.003%
    0.0024625661  => 3b216301 => 642c  => 0.0024623871  | error: 0.007%
    0.0022163095  => 3b113f81 => 6228  => 0.0022163391  | error: 0.001%
    0.0019946785  => 3b02b927 => 6057  => 0.0019946098  | error: 0.003%
    0.0017952106  => 3aeb4d46 => 5d6a  => 0.0017952919  | error: 0.005%
    0.0016156895  => 3ad3c58b => 5a79  => 0.0016157627  | error: 0.005%
    0.0014541205  => 3abe9830 => 57d3  => 0.0014541149  | error: 0.000%
    0.0013087085  => 3aab88f8 => 5571  => 0.0013086796  | error: 0.002%
    0.0011778376  => 3a9a61ac => 534c  => 0.0011777878  | error: 0.004%
    0.0010600538  => 3a8af181 => 515e  => 0.0010600090  | error: 0.004%
    0.0009540484  => 3a7a191b => 4f43  => 0.0009540319  | error: 0.002%
    0.0008586436  => 3a611698 => 4c23  => 0.0008586645  | error: 0.002%
    0.0007727792  => 3a4a9455 => 4953  => 0.0007728338  | error: 0.007%
    0.0006955012  => 3a36524c => 46ca  => 0.0006954670  | error: 0.005%
    0.0006259511  => 3a2416de => 4483  => 0.0006259680  | error: 0.003%
    0.0005633560  => 3a13ae2e => 4276  => 0.0005633831  | error: 0.005%
    0.0005070204  => 3a04e990 => 409d  => 0.0005069971  | error: 0.005%
    0.0004563183  => 39ef3e03 => 3de8  => 0.0004563332  | error: 0.003%
    0.0004106865  => 39d75169 => 3aea  => 0.0004106760  | error: 0.003%
    0.0003696179  => 39c1c945 => 3839  => 0.0003696084  | error: 0.003%
    0.0003326561  => 39ae6857 => 35cd  => 0.0003326535  | error: 0.001%
    0.0002993904  => 399cf781 => 339f  => 0.0002993941  | error: 0.001%
    0.0002694514  => 398d4527 => 31a9  => 0.0002694726  | error: 0.008%
    0.0002425062  => 397e4946 => 2fc9  => 0.0002425015  | error: 0.002%
    0.0002182556  => 3964db8b => 2c9b  => 0.0002182424  | error: 0.006%
    0.0001964300  => 394df8ca => 29bf  => 0.0001964271  | error: 0.001%
    0.0001767870  => 39395fe9 => 272c  => 0.0001767874  | error: 0.000%
    0.0001591083  => 3926d651 => 24db  => 0.0001591146  | error: 0.004%
    0.0001431975  => 39162749 => 22c5  => 0.0001432002  | error: 0.002%
    0.0001288777  => 3907235b => 20e4  => 0.0001288652  | error: 0.010%
    0.0001159900  => 38f33fa3 => 1e68  => 0.0001159906  | error: 0.001%
    0.0001043910  => 38daec79 => 1b5e  => 0.0001043975  | error: 0.006%
    0.0000939519  => 38c50806 => 18a1  => 0.0000939518  | error: 0.000%
    0.0000845567  => 38b15405 => 162b  => 0.0000845641  | error: 0.009%
    0.0000761010  => 389f986b => 13f3  => 0.0000761002  | error: 0.001%
    0.0000684909  => 388fa2c6 => 11f4  => 0.0000684857  | error: 0.008%
    0.0000616418  => 388145b2 => 1029  => 0.0000616461  | error: 0.007%
    

    For the subnormals the test gives:

    0.0000554776  => 3868b0a6 => 0e8b  => 0.0000554770  | error: 0.001%
    0.0000499299  => 38516bc8 => 0d17  => 0.0000499338  | error: 0.008%
    0.0000449369  => 383c7a9a => 0bc8  => 0.0000449419  | error: 0.011%
    0.0000404432  => 3829a18a => 0a9a  => 0.0000404418  | error: 0.004%
    0.0000363989  => 3818aafc => 098b  => 0.0000364035  | error: 0.013%
    0.0000327590  => 380966af => 0896  => 0.0000327528  | error: 0.019%
    0.0000294831  => 37f7526e => 07bb  => 0.0000294894  | error: 0.021%
    0.0000265348  => 37de96fc => 06f5  => 0.0000265390  | error: 0.016%
    0.0000238813  => 37c854af => 0643  => 0.0000238866  | error: 0.022%
    0.0000214932  => 37b44c37 => 05a2  => 0.0000214875  | error: 0.026%
    0.0000193438  => 37a24498 => 0512  => 0.0000193417  | error: 0.011%
    0.0000174095  => 37920a89 => 0490  => 0.0000174046  | error: 0.028%
    0.0000156685  => 37836fe1 => 041b  => 0.0000156611  | error: 0.047%
    0.0000141017  => 376c962e => 03b2  => 0.0000140965  | error: 0.037%
    0.0000126915  => 3754ed8f => 0354  => 0.0000126958  | error: 0.034%
    0.0000114223  => 373fa29a => 02ff  => 0.0000114292  | error: 0.060%
    0.0000102801  => 372c78be => 02b2  => 0.0000102818  | error: 0.016%
    0.0000092521  => 371b3978 => 026d  => 0.0000092536  | error: 0.016%
    0.0000083269  => 370bb3b9 => 022f  => 0.0000083297  | error: 0.034%
    0.0000074942  => 36fb76b3 => 01f7  => 0.0000074953  | error: 0.014%
    0.0000067448  => 36e2513a => 01c5  => 0.0000067502  | error: 0.081%
    0.0000060703  => 36cbaf81 => 0197  => 0.0000060648  | error: 0.091%
    0.0000054633  => 36b75127 => 016f  => 0.0000054687  | error: 0.100%
    0.0000049169  => 36a4fc3c => 014a  => 0.0000049174  | error: 0.009%
    0.0000044253  => 36947c9c => 0129  => 0.0000044256  | error: 0.009%
    0.0000039827  => 3685a359 => 010b  => 0.0000039786  | error: 0.103%
    0.0000035845  => 36708c6d => 00f1  => 0.0000035912  | error: 0.188%
    0.0000032260  => 36587e62 => 00d8  => 0.0000032187  | error: 0.228%
    0.0000029034  => 3642d825 => 00c3  => 0.0000029057  | error: 0.080%
    0.0000026131  => 362f5c21 => 00af  => 0.0000026077  | error: 0.205%
    0.0000023518  => 361dd2ea => 009e  => 0.0000023544  | error: 0.112%
    0.0000021166  => 360e0a9f => 008e  => 0.0000021160  | error: 0.029%
    0.0000019049  => 35ffacb7 => 0080  => 0.0000019073  | error: 0.127%
    0.0000017144  => 35e61b71 => 0073  => 0.0000017136  | error: 0.047%
    0.0000015430  => 35cf18b2 => 0068  => 0.0000015497  | error: 0.436%
    0.0000013887  => 35ba6306 => 005d  => 0.0000013858  | error: 0.208%
    0.0000012498  => 35a7bf85 => 0054  => 0.0000012517  | error: 0.150%
    0.0000011248  => 3596f92b => 004b  => 0.0000011176  | error: 0.645%
    0.0000010124  => 3587e040 => 0044  => 0.0000010133  | error: 0.091%
    0.0000009111  => 357493a6 => 003d  => 0.0000009090  | error: 0.236%
    0.0000008200  => 355c1e7b => 0037  => 0.0000008196  | error: 0.054%
    0.0000007380  => 35461b6e => 0032  => 0.0000007451  | error: 0.955%
    0.0000006642  => 35324be3 => 002d  => 0.0000006706  | error: 0.955%
    0.0000005978  => 3520777f => 0028  => 0.0000005960  | error: 0.291%
    0.0000005380  => 35106b8c => 0024  => 0.0000005364  | error: 0.291%
    0.0000004842  => 3501fa64 => 0020  => 0.0000004768  | error: 1.522%
    0.0000004358  => 34e9f5e7 => 001d  => 0.0000004321  | error: 0.838%
    0.0000003922  => 34d29083 => 001a  => 0.0000003874  | error: 1.218%
    0.0000003530  => 34bd820f => 0018  => 0.0000003576  | error: 1.315%
    0.0000003177  => 34aa8ea7 => 0015  => 0.0000003129  | error: 1.499%
    0.0000002859  => 34998063 => 0013  => 0.0000002831  | error: 0.978%
    0.0000002573  => 348a26bf => 0011  => 0.0000002533  | error: 1.557%
    0.0000002316  => 3478ac24 => 0010  => 0.0000002384  | error: 2.947%
    0.0000002084  => 345fce20 => 000e  => 0.0000002086  | error: 0.087%
    0.0000001876  => 34496cb6 => 000d  => 0.0000001937  | error: 3.264%
    0.0000001688  => 3435483d => 000b  => 0.0000001639  | error: 2.914%
    0.0000001519  => 3423276a => 000a  => 0.0000001490  | error: 1.933%
    0.0000001368  => 3412d6ac => 0009  => 0.0000001341  | error: 1.933%
    0.0000001231  => 3404279b => 0008  => 0.0000001192  | error: 3.144%
    0.0000001108  => 33ede0e3 => 0007  => 0.0000001043  | error: 5.834%
    0.0000000997  => 33d61732 => 0007  => 0.0000001043  | error: 4.629%
    0.0000000897  => 33c0ae79 => 0006  => 0.0000000894  | error: 0.354%
    0.0000000808  => 33ad69d3 => 0005  => 0.0000000745  | error: 7.735%
    0.0000000727  => 339c1271 => 0005  => 0.0000000745  | error: 2.517%
    0.0000000654  => 338c76ff => 0004  => 0.0000000596  | error: 8.874%
    0.0000000589  => 337cd631 => 0004  => 0.0000000596  | error: 1.251%
    0.0000000530  => 33638d92 => 0004  => 0.0000000596  | error: 12.501%
    0.0000000477  => 334ccc36 => 0003  => 0.0000000447  | error: 6.249%
    0.0000000429  => 33385163 => 0003  => 0.0000000447  | error: 4.168%
    0.0000000386  => 3325e2d9 => 0003  => 0.0000000447  | error: 15.742%
    0.0000000348  => 33154c29 => 0002  => 0.0000000298  | error: 14.265%
    0.0000000313  => 33065e25 => 0002  => 0.0000000298  | error: 4.739%
    0.0000000282  => 32f1dca9 => 0002  => 0.0000000298  | error: 5.846%
    0.0000000253  => 32d9acfe => 0002  => 0.0000000298  | error: 17.606%
    0.0000000228  => 32c3e87e => 0002  => 0.0000000298  | error: 30.673%
    0.0000000205  => 32b0513e => 0001  => 0.0000000149  | error: 27.404%
    0.0000000185  => 329eaf84 => 0001  => 0.0000000149  | error: 19.337%
    0.0000000166  => 328ed12a => 0001  => 0.0000000149  | error: 10.375%
    0.0000000150  => 3280890c => 0001  => 0.0000000149  | error: 0.416%
    0.0000000135  => 32675d15 => 0001  => 0.0000000149  | error: 10.648%
    0.0000000121  => 32503a2c => 0001  => 0.0000000149  | error: 22.943%
    0.0000000109  => 323b678e => 0001  => 0.0000000149  | error: 36.603%
    0.0000000098  => 3228aa00 => 0001  => 0.0000000149  | error: 51.781%
    0.0000000088  => 3217cc33 => 0001  => 0.0000000149  | error: 68.646%
    0.0000000080  => 32089e2e => 0001  => 0.0000000149  | error: 87.384%
    0.0000000072  => 31f5e986 => 0000  => 0.0000000000  | error: 100.000%
    
  • 0

    You can convert them to and from raw float values using Float.intBitsToFloat() and Float.floatToIntBits(). If you can live with truncated precision (rather than rounding), the conversion should be possible with just a few bit shifts.
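
    For example, a minimal truncating sketch of that idea (a sketch only, for values in the normal half-precision range; NaN, infinities, subnormals and rounding are ignored, so it does not meet the accuracy requirement above):

    // Truncating (round-toward-zero) float -> half sketch, normal range only.
    public static int fromFloatTruncated(float fval) {
        int fbits = Float.floatToIntBits(fval);
        int sign = fbits >>> 16 & 0x8000;               // move the sign to bit 15
        int exp  = ((fbits >>> 23) & 0xff) - 127 + 15;  // rebias the 8-bit exponent to 5 bits
        int mant = (fbits >>> 13) & 0x3ff;              // drop the low 13 mantissa bits
        return sign | exp << 10 | mant;                 // assumes 1 <= exp <= 30
    }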

    I have now put some more effort into it and the result is not as simple as I expected when I started. This version is now tested and verified in every aspect I could imagine, and I'm very confident that it produces exact results for all possible input values. It supports exact rounding and subnormal conversion in either direction.

    // ignores the higher 16 bits
    public static float toFloat( int hbits )
    {
        int mant = hbits & 0x03ff;            // 10 bits mantissa
        int exp =  hbits & 0x7c00;            // 5 bits exponent
        if( exp == 0x7c00 )                   // NaN/Inf
            exp = 0x3fc00;                    // -> NaN/Inf
        else if( exp != 0 )                   // normalized value
        {
            exp += 0x1c000;                   // exp - 15 + 127
            if( mant == 0 && exp > 0x1c400 )  // smooth transition
                return Float.intBitsToFloat( ( hbits & 0x8000 ) << 16
                                                | exp << 13 | 0x3ff );
        }
        else if( mant != 0 )                  // && exp==0 -> subnormal
        {
            exp = 0x1c400;                    // make it normal
            do {
                mant <<= 1;                   // mantissa * 2
                exp -= 0x400;                 // decrease exp by 1
            } while( ( mant & 0x400 ) == 0 ); // while not normal
            mant &= 0x3ff;                    // discard subnormal bit
        }                                     // else +/-0 -> +/-0
        return Float.intBitsToFloat(          // combine all parts
            ( hbits & 0x8000 ) << 16          // sign  << ( 31 - 15 )
            | ( exp | mant ) << 13 );         // value << ( 23 - 10 )
    }
    

    // returns all higher 16 bits as 0 for all results
    public static int fromFloat( float fval )
    {
        int fbits = Float.floatToIntBits( fval );
        int sign = fbits >>> 16 & 0x8000;          // sign only
        int val = ( fbits & 0x7fffffff ) + 0x1000; // rounded value
    
        if( val >= 0x47800000 )               // might be or become NaN/Inf
        {                                     // avoid Inf due to rounding
            if( ( fbits & 0x7fffffff ) >= 0x47800000 )
            {                                 // is or must become NaN/Inf
                if( val < 0x7f800000 )        // was value but too large
                    return sign | 0x7c00;     // make it +/-Inf
                return sign | 0x7c00 |        // remains +/-Inf or NaN
                    ( fbits & 0x007fffff ) >>> 13; // keep NaN (and Inf) bits
            }
            return sign | 0x7bff;             // unrounded not quite Inf
        }
        if( val >= 0x38800000 )               // remains normalized value
            return sign | val - 0x38000000 >>> 13; // exp - 127 + 15
        if( val < 0x33000000 )                // too small for subnormal
            return sign;                      // becomes +/-0
        val = ( fbits & 0x7fffffff ) >>> 23;  // tmp exp for subnormal calc
        return sign | ( ( fbits & 0x7fffff | 0x800000 ) // add subnormal bit
             + ( 0x800000 >>> val - 102 )     // round depending on cut off
          >>> 126 - val );   // div by 2^(1-(exp-127+15)) and >> 13 | exp=0
    }
    

    Compared to the book algorithm, I implemented two small extensions, because the general precision of 16-bit floats is rather low, which can make the inherent anomalies of floating point formats visually perceptible, while they usually go unnoticed with larger float types due to their ample precision.

    The first one is these two lines in the toFloat() function:

    if( mant == 0 && exp > 0x1c400 )  // smooth transition
        return Float.intBitsToFloat( ( hbits & 0x8000 ) << 16 | exp << 13 | 0x3ff );
    

    Floating point numbers in the normal range of a type adopt the exponent, and thus the precision, to the magnitude of the value. But this adoption is not smooth; it happens in steps: switching to the next higher exponent halves the precision, which then stays the same for all values of the mantissa until the next jump to the next higher exponent. The extension code above makes these transitions smoother by returning a value that lies in the geometric center of the range of 32-bit float values covered by that particular half float value. Every normal half float value maps to exactly 8192 32-bit float values, and the returned value is supposed to lie exactly in the middle of them. At the transitions of the half float exponent, however, the lower 4096 values have twice the precision of the upper 4096 values and therefore cover only half as much of the number space as the other side.

    All of these 8192 32-bit float values map to the same half float value, so converting a half float to 32 bits and back yields the same half float value regardless of which of the 8192 intermediate 32-bit values was chosen. The extension results in smoother half steps by a factor of sqrt(2) at the transitions, as shown in the right image, while the left image is supposed to visualize the sharp steps of factor 2 without anti-aliasing. You can safely remove these two lines from the code to get the standard behavior.

    covered number space on either side of the returned value:
           6.0E-8             #######                  ##########
           4.5E-8             |                       #
           3.0E-8     #########               ########
    

    The second extension is in the fromFloat() function:

    {                                     // avoid Inf due to rounding
            if( ( fbits & 0x7fffffff ) >= 0x47800000 )
    ...
            return sign | 0x7bff;             // unrounded not quite Inf
        }
    

    This extension slightly enlarges the number range of the half float format by saving some 32-bit values from being rounded up to infinity. The affected values are those that were smaller than infinity before rounding and would become infinity only because of the rounding. You can safely remove the lines shown above if you don't want this extension.

    I tried to optimize the path for normal values in the fromFloat() function as much as possible, which made it a bit less readable due to the use of precomputed and unshifted constants. I didn't put as much effort into toFloat(), since it will never beat the performance of a lookup table anyway. So if speed really matters, you can use the toFloat() function just to fill a static lookup table with 0x10000 elements and then use that table for the actual conversion. This is about 3 times faster with a current x64 server VM and about 5 times faster with an x86 client VM.
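
    A minimal sketch of that table-based approach (assuming the toFloat() shown above; the other names are illustrative):

    // Build the table once; index it with the unsigned 16-bit half pattern.
    private static final float[] HALF_TO_FLOAT = new float[0x10000];
    static {
        for (int h = 0; h < 0x10000; h++) {
            HALF_TO_FLOAT[h] = toFloat(h);
        }
    }

    public static float toFloatFast(int hbits) {
        return HALF_TO_FLOAT[hbits & 0xffff];   // one array lookup instead of bit manipulation
    }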

    I place this code in the public domain.

  • 0

    x4u's code correctly encodes the value 1 as 0x3c00 (ref: https://en.wikipedia.org/wiki/Half-precision_floating-point_format). But the decoder with the smoothness improvement decodes it as 1.000122, even though the Wikipedia entry says the integer values 0..2048 can be represented exactly. Not so good...
    Removing the "| 0x3ff" from the toFloat code ensures that toFloat(fromFloat(k)) == k for integers k in the range -2048..2048, possibly at the cost of slightly worse smoothness.
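
    A small check of that claim might look like this (a sketch, assuming the toFloat()/fromFloat() pair from the answer above with the "| 0x3ff" smoothing removed):

    public static void main(String[] args) {
        for (int k = -2048; k <= 2048; k++) {
            float f = toFloat(fromFloat(k));     // encode the integer, then decode it again
            if (f != k) {
                System.out.println("mismatch at " + k + ": got " + f);
            }
        }
    }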

  • 0

    I had whipped up something simple before I saw the solutions posted here:

    public static float toFloat(int nHalf)
        {
        int S = (nHalf >>> 15) & 0x1;
        int E = (nHalf >>> 10) & 0x1F;
        int T = (nHalf       ) & 0x3FF;

        E = E == 0x1F
                ? 0xFF  // it's 2^w-1; it's all 1's, so keep it all 1's for the 32-bit float
                : E == 0
                ? 0     // +/-0 stays +/-0 (subnormal halves are not converted to their exact value by this simple version)
                : E - 15 + 127;     // adjust the exponent from the 16-bit bias to the 32-bit bias

        // sign S is now bit 31
        // exp E is from bit 30 to bit 23
        // scale T by 13 binary digits (it grew from 10 to 23 bits)
        return Float.intBitsToFloat(S << 31 | E << 23 | T << 13);
        }
    

    I do like the approach in the other posted solutions, though. For reference:

    // notes from the IEEE-754 specification:
    
        // left to right bits of a binary floating point number:
        // size        bit ids       name  description
        // ----------  ------------  ----  ---------------------------
        // 1 bit                       S   sign
        // w bits      E[0]..E[w-1]    E   biased exponent
        // t=p-1 bits  d[1]..d[p-1]    T   trailing significant field
    
        // The range of the encoding’s biased exponent E shall include:
        // ― every integer between 1 and 2^w − 2, inclusive, to encode normal numbers
        // ― the reserved value 0 to encode ±0 and subnormal numbers
        // ― the reserved value 2^w − 1 to encode +/-infinity and NaN
    
        // The representation r of the floating-point datum, and value v of the floating-point datum
        // represented, are inferred from the constituent fields as follows:
        // a) If E == 2^w−1 and T != 0, then r is qNaN or sNaN and v is NaN regardless of S
        // b) If E == 2^w−1 and T == 0, then r=v=(−1)^S * (+infinity)
        // c) If 1 <= E <= 2^w−2, then r is (S, (E−bias), (1 + 2^(1−p) * T))
        //    the value of the corresponding floating-point number is
        //        v = (−1)^S * 2^(E−bias) * (1 + 2^(1−p) * T)
        //    thus normal numbers have an implicit leading significand bit of 1
        // d) If E == 0 and T != 0, then r is (S, emin, (0 + 2^(1−p) * T))
        //    the value of the corresponding floating-point number is
        //        v = (−1)^S * 2^emin * (0 + 2^(1−p) * T)
        //    thus subnormal numbers have an implicit leading significand bit of 0
        // e) If E == 0 and T ==0, then r is (S, emin, 0) and v = (−1)^S * (+0)
    
        // parameter                                      bin16  bin32
        // --------------------------------------------   -----  -----
        // k, storage width in bits                         16     32
        // p, precision in bits                             11     24
        // emax, maximum exponent e                         15    127
        // bias, E-e                                        15    127
        // sign bit                                          1      1
        // w, exponent field width in bits                   5      8
        // t, trailing significant field width in bits      10     23
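        // illustrative worked example of case c) with the bin16 parameters:
        //   half pattern 0x3555: S = 0, E = 0b01101 = 13, T = 0b0101010101 = 341
        //   v = (−1)^0 * 2^(13−15) * (1 + 2^(1−11) * 341)
        //     = 0.25 * (1 + 341/1024) = 1365/4096 ≈ 0.333251953
        //   i.e. the closest binary16 value to 1/3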
    
