// fradix16-64.c
#define _GNU_SOURCE
#include <stdint.h>
#include <stdlib.h> // malloc
#include <string.h> // memset
#include "fradix16-64.h"
// plan
// - use 16 bit shifts, in 2 passes
// - compute all histograms in one pass
// - call repeatedly for additional keys, vs. building it into the API
// - allow reuse of allocated buffers
// - try computing negative space, with special final pass
// - try flopping negative bits
// - try reporting if the data is fully sorted
// can we do this as we go? if not, check on the way in, during histogram.
#define offsetSize (1 << 16)
static int *offset0;
static int *offset1;
#define NAN_FLOP
#ifdef NAN_FLOP
// nan, +inf, -inf
// s111 1111 1xxx xxxx xxxx xxxx xxxx xxxx
// +inf s is 0, high x is 0
// -inf s is 1, high x is 0
// nan: s is 0, high x is 1
// All other non-zero x are also nan, but 7FC is the canonical one we
// will see in the data.
// In order to sort negatives correctly, we have to 1) flip the sign bit
// 2) flip all the other bits if the number is negative. Additionally, since
// nan is bitwise above +inf, and we want to sort it as -inf, explicitly map
// it to the flop of -inf.
#define flop(x) ((x) == 0x7FC00000U ? 0x7FFFFFU : ((x) ^ (-((x) >> 31) | 0x80000000U)))
#else /* NAN_FLOP */
#define flop(x) ((x) ^ (-((x) >> 31) | 0x80000000U))
#endif
#if 0
// function form of flop, disabled; equivalent to the macro above.
uint32_t flop(uint32_t x) {
	return x == 0x7FC00000U ? 0x7FFFFFU : (x ^ (-(x >> 31) | 0x80000000U));
}
#endif
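// A worked sketch of the mapping (values hand-derived from the encodings
// described above):
//   -inf 0xFF800000 -> 0x007FFFFF (smallest)
//   nan  0x7FC00000 -> 0x007FFFFF (ties with -inf via the special case)
//   -1.0 0xBF800000 -> 0x407FFFFF
//   -0.0 0x80000000 -> 0x7FFFFFFF
//   +0.0 0x00000000 -> 0x80000000
//   +1.0 0x3F800000 -> 0xBF800000
//   +inf 0x7F800000 -> 0xFF800000 (largest)
// so unsigned order of the flopped bits matches numeric order of the floats.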
#define mask ((1 << 16) - 1)
#define part0(x) (mask & (x))
#define part1(x) (mask & ((x) >> 16))
static void zero(int *a, int n) {
	memset(a, 0, n * sizeof(int));
}
// XXX Consider caching these. They won't change, for a given
// column. If the size is prohibitive, consider more passes:
// cached histo + more passes might be faster than computing histo &
// fewer passes. Maybe look at some profiles to see how it breaks down.
static void computeHisto(uint32_t *vals, int n, direction dir) {
	zero(offset0, offsetSize);
	zero(offset1, offsetSize);
	for (int i = 0; i < n; ++i) {
		uint32_t flopped = flop(vals[i]);
		offset0[part0(flopped)]++;
		offset1[part1(flopped)]++;
	}
	if (dir == UP) {
		for (int i = 1; i < offsetSize; i++) {
			offset0[i] += offset0[i - 1];
			offset1[i] += offset1[i - 1];
		}
	} else {
		for (int i = offsetSize - 2; i >= 0; i--) {
			offset0[i] += offset0[i + 1];
			offset1[i] += offset1[i + 1];
		}
	}
}
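// A small worked example of how the offsets are consumed (assuming dir is
// UP): if the low-16 digits of five keys are {3, 1, 3, 0, 1}, the counts
// are offset0[0..3] = {1, 2, 0, 2}, and after the prefix sum they become
// {1, 3, 3, 5}. offset0[d] is then one past the last slot for digit d, so
// the placement loops below write to offset0[d]-- - 1, filling each bucket
// from its top end down. Scanning the input in reverse order makes each
// pass stable, which is what lets the high-16 pass (and later keys)
// preserve the order established by earlier passes.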
// Sorting indices is quite a bit slower than direct sorting. Is there
// any way to combine the two? Maybe merge the indices into a 64 bit value,
// and use 64 bit stores/loads?
// We start with a val array & an index array.
// We want to emit or update the index array.
// We make two passes over it here. If we have multiple
// columns, we will make more passes.
// [v], [i]
// copy together
// [v, i]
// Do passes over this.
// extract i
//
// Multi-key
// copy together
// do passes
// copy next key
// do passes
// copy next key
// do passes
// extract i
#ifndef NAN_FLOP
// method to move nans to the bottom; assumes at least one non-nan value.
void nans(uint32_t *vals, int *indicies, int n) {
	int *m = indicies + n;
	while (vals[*--m] == 0x7FC00000U) { }
	++m;
	if (m != indicies + n) {
		// buffer shift to m
		int *j = indicies;
		int *k = m;
		int *last = indicies + n;
		// can this be written as a set of memcpy, or memmove?
		while (j != k) {
			int v0 = *j;
			*j++ = *k;
			*k++ = v0;
			if (k == last) {
				k = m;
			} else if (j == m) {
				m = k;
			}
		}
	}
}
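// Worked sketch of the rotation above (a gcd-free block swap): with indices
// [a b c d N1 N2], where N1 and N2 refer to nan values, m lands on N1 and
// successive swaps give
//   [N1 b c d a N2] -> [N1 N2 c d a b] -> [N1 N2 a d c b] -> [N1 N2 a b c d]
// leaving the nan block at the front and the rest in their original order.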
#endif
void fradixSort16_64(uint32_t *vals, int n, int *indicies, uint64_t *scratch,
		direction dir) {
	computeHisto(vals, n, dir);
	for (int i = n - 1; i >= 0; i--) {
		int ii = indicies[i];
		uint32_t v = vals[ii];
		// concatenate index with value into 64 bits, to take advantage of
		// long word read/write, and avoid a 2nd array lookup later.
		// XXX we could store the flopped value, to avoid flopping it again.
		// Probably this will not be measurable, since flopping is very fast.
		// Weirdly, it seems to be slower if we store the flop. Maybe due to
		// an additional register load, or something?
		scratch[offset0[part0(flop(v))]-- - 1] = ((uint64_t)ii << 32) | v;
	}
	for (int i = n - 1; i >= 0; i--) {
		indicies[offset1[part1(flop(scratch[i] & 0xFFFFFFFFU))]-- - 1] = scratch[i] >> 32;
	}
#ifndef NAN_FLOP
	nans(vals, indicies, n);
#endif
}
// Sort by multiple keys: the stable single-key sort is applied from the
// last key to the first, so valsList[0] ends up as the most-significant key.
void fradixSortL16_64(uint32_t **valsList, direction *dirs, int m, int n, int *indicies) {
	uint64_t *scratch = malloc(n * sizeof(uint64_t));
	while (m) {
		int i = --m;
		fradixSort16_64(valsList[i], n, indicies, scratch, dirs[i]);
	}
	free(scratch);
}
static int initialized = 0;
// set up working space.
void fradixSort16_64_init() {
	if (!initialized) {
		offset0 = malloc(offsetSize * sizeof(int));
		offset1 = malloc(offsetSize * sizeof(int));
		initialized = 1;
	}
}
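#if 0
// Illustrative usage sketch, disabled like the other #if 0 blocks; not part
// of the build. The column data here is made up for the example, and the
// float-to-uint32_t cast is an assumption (it formally runs afoul of strict
// aliasing; memcpy into a uint32_t buffer is the portable route). valsList[0]
// is the primary key because the stable sort is applied last-key-first.
#include <stdio.h>
static void example(void) {
	float primary[] = {2.0f, 1.0f, 2.0f, 1.0f};
	float secondary[] = {0.5f, 4.0f, -3.0f, 1.0f};
	int n = 4;
	int indicies[] = {0, 1, 2, 3}; // start from the identity permutation
	uint32_t *valsList[] = {(uint32_t *)primary, (uint32_t *)secondary};
	direction dirs[] = {UP, UP};
	fradixSort16_64_init();
	fradixSortL16_64(valsList, dirs, 2, n, indicies);
	for (int i = 0; i < n; i++) // prints 1 1, 1 4, 2 -3, 2 0.5
		printf("%g %g\n", primary[indicies[i]], secondary[indicies[i]]);
}
#endif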