Skydiver
Staff member
So here is a version of the CRC32 computation using
and here is the original code that used
On my ancient machine, here's the relative throughput I got:
Span<T>
:
Using Span and take out branches inside tight loop:
protected void LocalCrcCompute(uint[][] crcTable, byte[] data, int index, int length)
{
if (data == null) throw new ArgumentNullHashLibException(nameof(data));
Debug.Assert(index >= 0);
Debug.Assert(length >= 0);
Debug.Assert(index + length <= data.Length);
const int unroll = 4;
const int bytesAtOnce = 16 * unroll;
var crc = ~CurrentCRC;
var leftovers = BitConverter.IsLittleEndian ? ComputeLittleEndianBlocks()
: ComputeBigEndianBlocks();
// remaining 1 to 63 bytes (standard algorithm)
foreach (var b in leftovers)
crc = (crc >> 8) ^ crcTable[0][(crc & 0xFF) ^ b];
CurrentCRC = ~crc;
ReadOnlySpan<byte> ComputeLittleEndianBlocks()
{
var dataSpan = data.AsSpan(index, length);
while (dataSpan.Length >= bytesAtOnce)
{
var dataUints = MemoryMarshal.Cast<byte, uint>(dataSpan);
for (int unrolling = 0; unrolling < unroll; unrolling++, dataUints = dataUints.Slice(4))
{
var one = dataUints[0] ^ crc;
var two = dataUints[1];
var three = dataUints[2];
var four = dataUints[3];
crc = crcTable[0][(four >> 24) & 0xFF] ^
crcTable[1][(four >> 16) & 0xFF] ^
crcTable[2][(four >> 8) & 0xFF] ^
crcTable[3][four & 0xFF] ^
crcTable[4][(three >> 24) & 0xFF] ^
crcTable[5][(three >> 16) & 0xFF] ^
crcTable[6][(three >> 8) & 0xFF] ^
crcTable[7][three & 0xFF] ^
crcTable[8][(two >> 24) & 0xFF] ^
crcTable[9][(two >> 16) & 0xFF] ^
crcTable[10][(two >> 8) & 0xFF] ^
crcTable[11][two & 0xFF] ^
crcTable[12][(one >> 24) & 0xFF] ^
crcTable[13][(one >> 16) & 0xFF] ^
crcTable[14][(one >> 8) & 0xFF] ^
crcTable[15][one & 0xFF];
}
dataSpan = dataSpan.Slice(bytesAtOnce);
}
return dataSpan;
}
ReadOnlySpan<byte> ComputeBigEndianBlocks()
{
var dataSpan = data.AsSpan(index, length);
while (dataSpan.Length >= bytesAtOnce)
{
var dataUints = MemoryMarshal.Cast<byte, uint>(dataSpan);
for (int unrolling = 0; unrolling < unroll; unrolling++, dataUints = dataUints.Slice(4))
{
var one = dataUints[0] ^ Bits.ReverseBytesUInt32(crc);
var two = dataUints[1];
var three = dataUints[2];
var four = dataUints[3];
crc = crcTable[0][four & 0xFF] ^
crcTable[1][(four >> 8) & 0xFF] ^
crcTable[2][(four >> 16) & 0xFF] ^
crcTable[3][(four >> 24) & 0xFF] ^
crcTable[4][three & 0xFF] ^
crcTable[5][(three >> 8) & 0xFF] ^
crcTable[6][(three >> 16) & 0xFF] ^
crcTable[7][(three >> 24) & 0xFF] ^
crcTable[8][two & 0xFF] ^
crcTable[9][(two >> 8) & 0xFF] ^
crcTable[10][(two >> 16) & 0xFF] ^
crcTable[11][(two >> 24) & 0xFF] ^
crcTable[12][one & 0xFF] ^
crcTable[13][(one >> 8) & 0xFF] ^
crcTable[14][(one >> 16) & 0xFF] ^
crcTable[15][(one >> 24) & 0xFF];
}
dataSpan = dataSpan.Slice(bytesAtOnce);
}
return dataSpan;
}
}
and here is the original code that used
unsafe
pointers as well the branch inside the loop:
Original unsafe code with pointers and branch inside tight loop:
protected unsafe void LocalCrcCompute(uint[][] crcTable, byte[] data, int index,
int length)
{
if (data == null) throw new ArgumentNullHashLibException(nameof(data));
Debug.Assert(index >= 0);
Debug.Assert(length >= 0);
Debug.Assert(index + length <= data.Length);
const int unroll = 4;
const int bytesAtOnce = 16 * unroll;
var crc = ~CurrentCRC;
fixed (byte* dataPtr = data)
{
var srcPtr = (uint*) (dataPtr + index);
while (length >= bytesAtOnce)
{
var unrolling = 0;
while (unrolling < unroll)
{
var one = Converters.ReadPCardinalAsUInt32(srcPtr)
^ Converters.le2me_32(crc);
srcPtr++;
var two = Converters.ReadPCardinalAsUInt32(srcPtr);
srcPtr++;
var three = Converters.ReadPCardinalAsUInt32(srcPtr);
srcPtr++;
var four = Converters.ReadPCardinalAsUInt32(srcPtr);
srcPtr++;
if (BitConverter.IsLittleEndian)
{
crc = crcTable[0][(four >> 24) & 0xFF] ^ crcTable[1]
[(four >> 16) & 0xFF] ^ crcTable[2][(four >> 8) & 0xFF]
^ crcTable[3][four & 0xFF] ^ crcTable[4]
[(three >> 24) & 0xFF] ^ crcTable[5][(three >> 16) & 0xFF]
^ crcTable[6][(three >> 8) & 0xFF] ^ crcTable[7]
[three & 0xFF] ^ crcTable[8][(two >> 24) & 0xFF] ^ crcTable
[9][(two >> 16) & 0xFF] ^ crcTable[10][(two >> 8) & 0xFF]
^ crcTable[11][two & 0xFF] ^ crcTable[12][(one >> 24) & 0xFF]
^ crcTable[13][(one >> 16) & 0xFF] ^ crcTable[14]
[(one >> 8) & 0xFF] ^ crcTable[15][one & 0xFF];
}
else
{
crc = crcTable[0][four & 0xFF] ^ crcTable[1]
[(four >> 8) & 0xFF] ^ crcTable[2][(four >> 16) & 0xFF]
^ crcTable[3][(four >> 24) & 0xFF] ^ crcTable[4]
[three & 0xFF] ^ crcTable[5][(three >> 8) & 0xFF] ^ crcTable
[6][(three >> 16) & 0xFF] ^ crcTable[7][(three >> 24) & 0xFF]
^ crcTable[8][two & 0xFF] ^ crcTable[9][(two >> 8) & 0xFF]
^ crcTable[10][(two >> 16) & 0xFF] ^ crcTable[11]
[(two >> 24) & 0xFF] ^ crcTable[12][one & 0xFF] ^ crcTable
[13][(one >> 8) & 0xFF] ^ crcTable[14][(one >> 16) & 0xFF]
^ crcTable[15][(one >> 24) & 0xFF];
}
unrolling++;
}
length -= bytesAtOnce;
}
var srcPtr2 = (byte*) srcPtr;
// remaining 1 to 63 bytes (standard algorithm)
while (length != 0)
{
crc = (crc >> 8) ^ crcTable[0][(crc & 0xFF) ^ *srcPtr2];
srcPtr2++;
length--;
}
CurrentCRC = ~crc;
}
}
On my ancient machine, here's the relative throughput I got:
C#:
657 659
684 680
676 677
Timings on a old AMD system circa 2012:
Castagnoli PKZip (in MB/s)
Original 657 659
Original without branch 684 680
Span<T> without branch 676 677