aboutsummaryrefslogtreecommitdiff
path: root/src/findmask.c
blob: 1b10988d85c6307f13f2e8c8c2ec816f794736e5 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
/* Copyright 2013 Google Inc. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


/*
 * This "tool" can be used to brute force the XOR bitmask that a memory
 * controller uses to interleave addresses onto its two channels. To use it,
 * you need to have a bunch of addresses that are known to go to only one
 * of the memory channels... easiest way to get these is to run stressapptest on
 * a machine while holding a soldering iron close to the chips of one channel.
 * Generate about a thousand failures and extract their physical addresses
 * from the output. Write them to findmask.inc in a way that forms a valid
 * definition for the addrs array. Make and run on a big machine.
 *
 * The program iterates over all possible bitmasks within the first NUM_BITS,
 * parallelizing execution over NUM_THREADS. Every integer is masked
 * onto all supplied addresses, counting the amount of times this results in
 * an odd or even amount of bits. If all but NOISE addresses fall on one side,
 * it will print that mask to stdout. Note that the script will always "find"
 * the mask 0x0, and may also report masks such as 0x100000000 depending on
 * your test machines memory size... you will need to use your own judgement to
 * interpret the results.
 *
 * As the program might run for a long time, you can send SIGUSR1 to it to
 * output the last mask that was processed and get a rough idea of the
 * current progress.
 */

#include <inttypes.h>
#include <pthread.h>
#include <signal.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>

#define NOISE 20
#define NUM_BITS 32
#define NUM_THREADS 128  // keep this a power of two

static uint64_t addrs[] = {
#include "findmask.inc"
};
static uint64_t lastmask;

__attribute__((optimize(3, "unroll-loops")))
void* thread_func(void* arg) {
  register uint64_t mask;
  register uintptr_t num = (uintptr_t)arg;

  for (mask = num; mask < (1ULL << (NUM_BITS + 1)); mask += NUM_THREADS) {
    register const uint64_t* cur;
    register int a = 0;
    register int b = 0;

    for (cur = addrs; (char*)cur < (char*)addrs + sizeof(addrs); cur++) {
#ifdef __x86_64__
      register uint64_t addr asm("rdx") = *cur & mask;
      register uint32_t tmp asm("ebx");

      // Behold: the dark bit counting magic!
      asm (
        // Fold high and low 32 bits onto each other
        "MOVl %%edx, %%ebx\n\t"
        "SHRq $32, %%rdx\n\t"
        "XORl %%ebx, %%edx\n\t"
        // Fold high and low 16 bits onto each other
        "MOVl %%edx, %%ebx\n\t"
        "SHRl $16, %%edx\n\t"
        "XORw %%bx, %%dx\n\t"
        // Fold high and low 8 bits onto each other
        "XORb %%dh, %%dl\n\t"
        // Invoke ancient 8086 parity flag (only counts lowest byte)
        "SETnp %%bl\n\t"
        "SETp %%dl\n\t"
        // Stupid SET instruction can only affect the lowest byte...
        "ANDl $1, %%ebx\n\t"
        "ANDl $1, %%edx\n\t"
        // Increment either 'a' or 'b' without needing another branch
        "ADDl %%ebx, %2\n\t"
        "ADDl %%edx, %1\n\t"
        : "=b" (tmp), "+r"(a), "+r"(b) : "d"(addr) : "cc");

#else  // generic processor
      register uint64_t addr = *cur & mask;
      register uint32_t low = (uint32_t)addr;
      register uint32_t high = (uint32_t)(addr >> 32);

      // Takes about twice as long as the version above... take that GCC!
      __builtin_parity(low) ^ __builtin_parity(high) ? a++ : b++;
#endif

      // Early abort: probably still the most valuable optimization in here
      if (a >= NOISE && b >= NOISE) break;
    }

    if (a < NOISE) b = a;
    if (b < NOISE) {
      printf("Found mask with just %d deviations: 0x%" PRIx64 "\n", b, mask);
      fflush(stdout);
    }

    // I'm a little paranoid about performance: don't write to memory too often
    if (!(mask & 0x7ff)) lastmask = mask;
  }

  return 0;
}

void signal_handler(int signum) {
  printf("Received signal... currently evaluating mask 0x%" PRIx64 "!\n",
         lastmask);
  fflush(stdout);
}

int main(int argc, char** argv) {
  uintptr_t i;
  pthread_t threads[NUM_THREADS];

  signal(SIGUSR1, signal_handler);

  for (i = 0; i < NUM_THREADS; i++)
    pthread_create(&threads[i], 0, thread_func, (void*)i);

  for (i = 0; i < NUM_THREADS; i++)
    pthread_join(threads[i], 0);

  return 0;
}