179,99999991618096828460693359375° (0x7FFFFFFF)
calc: -1,4629180792671596820949490891972e-9
VC-CRT: -1.4629183534635463e-009
ich: -1.4629180e-009
180° (0x80000000)
calc: 0
VC-CRT: -1.2246467991473532e-016
ich: 0.00000000
Es ist immer das gleiche Muster. Der Fehler tritt ausschließlich bei Winkeln >179 ° auf. Das bedeutet: Die VC-CRT benutzt für [0, 180[ ° eine ähnliche Annäherungsfunktion wie ich; darüber allerdings ausschließlich mit positiven Winkeln nach Modulo – und vertut sich dadurch (wie es wohl auch meine Implementierung bis heute abend getan hatte).
Sie ist an diesen Stellen nur auf 3 ULPs genau; und die liegen auch noch so ungünstig, dass selbst nach dem Runden zu float noch eine ULP Abweichung bestehen bleibt.
Hier ist mein Quelltext; die Funktion ist in dieser Form unter x64 doppelt so schnell wie das VC-CRT-tan():
Code: Alles auswählen
struct Angle { unsigned int unorm; };
typedef float Float4B;
typedef double Float8B;
Float4B tangentOf(
Angle angle
) {
// Tangent can be divided into four sectors:
// • 0: [ 0°, 45°[
// • 1: [ 45°, 90°[
// • 2: [ 90°, 135°[ (inversion of sector 1)
// • 3: [135°, 180°[ (inversion of sector 0)
// At 180°, the function repeats.
// Since the angle is given as an unsigned normalized integer, and each sector is exactly the 8th of the overall range,
// the sector can be determined from bits 30 and 29 (mask 0x60000000):
// • 0x00000000: 0
// • 0x20000000: 1
// • 0x40000000: 2
// • 0x60000000: 3
auto const sectorID = angle.unorm & 0x60000000;
// Sector 0 is computed via Green, Robin — "Faster Math Functions", pg. 15 (http://www.research.scea.com/research/pdfs/RGREENfastermath_GDC02.pdf):
//
// 0.999999986 x - 0.0958010197 x³
// tan(x) = ————————————————————————————————————
// 1 - 0.429135022 x² + 0.00971659383 x²²
//
// Sector 1 is computed by a similar term, but with numerator and denominator flipped after assigning
//
// x = pi/2 - x
//
// 1 - 0.429135022 x² + 0.00971659383 x²²
// tan(x) = ————————————————————————————————————
// 0.999999986 x - 0.0958010197 x³
//
// When converting the angle to a floating-point number, one must multiply by (2 pi) / 2³². This multiplication can be
// saved by applying it to the constant factors:
//
// 1.4629180587863065713111022695911e-9 x - 2.9993707578799427148695747771533e-28 x³
// tan(x) = ——————————————————————————————————————————————————————————————————————————————————————
// 1 - 9.1840443709068308636681822066321e-19 x² + 4.4503490744640484968825016740227e-38 x²²
//
// Gather all constants in the same memory block, which is aligned on a cache line boundary and does not exceed cache
// line size. Since constants will always be read from read-only memory (even trivial ones like 1.0), this will make
// them consume just a single cache line. If they were scattered all over the read-only section, the CPU would in worst
// case be touching one cache line per constant.
__declspec(align(64)) static struct {
Float8B one; // 1.0
Float8B a; // 0.999999986 * 1.4629180792671596810513378043098e-9,
Float8B b; // -0.0958010197 * 3.1308338546629715204060346522108e-27,
Float8B c; // -0.429135022 * 2.1401293066467156958511258973014e-18,
Float8B d; // 0.00971659383 * 4.5801534491681520631005954830806e-36
Float8B padding[3];
} const constants = {
1.0,
1.4629180587863065713111022695911e-9,
-2.9993707578799427148695747771533e-28,
-9.1840443709068308636681822066321e-19,
4.4503490744640484968825016740227e-38
};
// Modulate the range to 180° only AFTER the sector is determined. This will hopefully break any dependency of branching
// on modified angle, thus suppress mispredictions.
angle.unorm = angle.unorm & 0x7FFFFFFF;
// Adjust x for the computation according to the angle's sector.
Float8B x;
if(0x00000000u == sectorID) {
x = Float8B(int(angle.unorm));
} else if(0x20000000u == sectorID || 0x40000000u == sectorID) {
x = Float8B(int(0x40000000u - angle.unorm));
} else { assert("binary logic broke", 0x60000000u == sectorID);
x = Float8B(int(angle.unorm - 0x80000000));
}
auto const x² = x * x;
auto const x³ = x² * x;
auto const x²² = x² * x²;
auto const termA = constants.a * x + constants.b * x³;
auto const termB = constants.one + constants.c * x² + constants.d * x²²;
// Perform final division:
if(0x00000000u == sectorID || 0x60000000u == sectorID) {
return Float4B(termA / termB);
} else { assert("binary logic broke", 0x20000000u == sectorID || 0x40000000u == sectorID);
return Float4B(termB / termA);
}
}