diff --git a/include/bx/radixsort.h b/include/bx/radixsort.h index 05f3586..02ba753 100644 --- a/include/bx/radixsort.h +++ b/include/bx/radixsort.h @@ -15,18 +15,34 @@ namespace bx #define BX_RADIXSORT_BIT_MASK (BX_RADIXSORT_HISTOGRAM_SIZE-1) template - void radixSort32(uint32_t* _keys, uint32_t* _tempKeys, Ty* _values, Ty* _tempValues, uint32_t _size) + void radixSort32(uint32_t* __restrict _keys, uint32_t* __restrict _tempKeys, Ty* __restrict _values, Ty* __restrict _tempValues, uint32_t _size) { + uint32_t* __restrict keys = _keys; + uint32_t* __restrict tempKeys = _tempKeys; + Ty* __restrict values = _values; + Ty* __restrict tempValues = _tempValues; + uint16_t histogram[BX_RADIXSORT_HISTOGRAM_SIZE]; uint16_t shift = 0; - for (uint32_t pass = 0; pass < 3; ++pass) + uint32_t pass = 0; + for (; pass < 3; ++pass) { memset(histogram, 0, sizeof(uint16_t)*BX_RADIXSORT_HISTOGRAM_SIZE); - for (uint32_t ii = 0; ii < _size; ++ii) + + bool sorted = true; + uint32_t key = keys[0]; + uint32_t prevKey = key; + for (uint32_t ii = 0; ii < _size; ++ii, prevKey = key) { - uint32_t key = _keys[ii]; + key = keys[ii]; uint16_t index = (key>>shift)&BX_RADIXSORT_BIT_MASK; ++histogram[index]; + sorted &= prevKey <= key; + } + + if (sorted) + { + goto done; } uint16_t offset = 0; @@ -39,38 +55,65 @@ namespace bx for (uint32_t ii = 0; ii < _size; ++ii) { - uint32_t key = _keys[ii]; + uint32_t key = keys[ii]; uint16_t index = (key>>shift)&BX_RADIXSORT_BIT_MASK; uint16_t dest = histogram[index]++; - _tempKeys[dest] = key; - _tempValues[dest] = _values[ii]; + tempKeys[dest] = key; + tempValues[dest] = values[ii]; } - uint32_t* swapKeys = _tempKeys; - _tempKeys = _keys; - _keys = swapKeys; + uint32_t* swapKeys = tempKeys; + tempKeys = keys; + keys = swapKeys; - Ty* swapValues = _tempValues; - _tempValues = _values; - _values = swapValues; + Ty* swapValues = tempValues; + tempValues = values; + values = swapValues; shift += BX_RADIXSORT_BITS; } + +done: + if (0 != (pass&1) ) + { + // Odd number of passes needs to do copy to the destination. + memcpy(_keys, _tempKeys, _size*sizeof(uint32_t) ); + for (uint32_t ii = 0; ii < _size; ++ii) + { + _values[ii] = _tempValues[ii]; + } + } } template - void radixSort64(uint64_t* _keys, uint64_t* _tempKeys, Ty* _values, Ty* _tempValues, uint32_t _size) + void radixSort64(uint64_t* __restrict _keys, uint64_t* __restrict _tempKeys, Ty* __restrict _values, Ty* __restrict _tempValues, uint32_t _size) { + uint64_t* __restrict keys = _keys; + uint64_t* __restrict tempKeys = _tempKeys; + Ty* __restrict values = _values; + Ty* __restrict tempValues = _tempValues; + uint16_t histogram[BX_RADIXSORT_HISTOGRAM_SIZE]; uint16_t shift = 0; - for (uint32_t pass = 0; pass < 6; ++pass) + uint32_t pass = 0; + for (; pass < 6; ++pass) { memset(histogram, 0, sizeof(uint16_t)*BX_RADIXSORT_HISTOGRAM_SIZE); - for (uint32_t ii = 0; ii < _size; ++ii) + + bool sorted = true; + uint64_t key = keys[0]; + uint64_t prevKey = key; + for (uint32_t ii = 0; ii < _size; ++ii, prevKey = key) { - uint64_t key = _keys[ii]; + key = keys[ii]; uint16_t index = (key>>shift)&BX_RADIXSORT_BIT_MASK; ++histogram[index]; + sorted &= prevKey <= key; + } + + if (sorted) + { + goto done; } uint16_t offset = 0; @@ -83,23 +126,34 @@ namespace bx for (uint32_t ii = 0; ii < _size; ++ii) { - uint64_t key = _keys[ii]; + uint64_t key = keys[ii]; uint16_t index = (key>>shift)&BX_RADIXSORT_BIT_MASK; uint16_t dest = histogram[index]++; - _tempKeys[dest] = key; - _tempValues[dest] = _values[ii]; + tempKeys[dest] = key; + tempValues[dest] = values[ii]; } - uint64_t* swapKeys = _tempKeys; - _tempKeys = _keys; - _keys = swapKeys; + uint64_t* swapKeys = tempKeys; + tempKeys = keys; + keys = swapKeys; - Ty* swapValues = _tempValues; - _tempValues = _values; - _values = swapValues; + Ty* swapValues = tempValues; + tempValues = values; + values = swapValues; shift += BX_RADIXSORT_BITS; } + +done: + if (0 != (pass&1) ) + { + // Odd number of passes needs to do copy to the destination. + memcpy(_keys, _tempKeys, _size*sizeof(uint64_t) ); + for (uint32_t ii = 0; ii < _size; ++ii) + { + _values[ii] = _tempValues[ii]; + } + } } #undef BX_RADIXSORT_BITS