Skip to content

Acceleration

Backend modules for high-performance SC operations.

| Module | Purpose |
| --- | --- |
| vector_ops | Packed uint64 bitwise AND, popcount, pack/unpack |
| gpu_backend | CuPy GPU dispatch (transparent NumPy fallback) |
| jax_backend | JAX JIT-compiled LIF step for TPU/GPU scaling |
| jit_kernels | Numba-accelerated inner loops |
| mpi_driver | MPI-based distributed simulation |

Vector Operations

sc_neurocore.accel.vector_ops

pack_bitstream(bitstream)

Packs a uint8 bitstream (0s and 1s) into uint64 integers. This allows processing 64 time steps in parallel.

Parameters:

Name Type Description Default
bitstream ndarray[Any, Any]

Shape (N,) or (Batch, N) of uint8 {0,1}

required

Returns:

Name Type Description
packed ndarray[Any, Any]

Shape (ceil(N/64),) or (Batch, ceil(N/64)) of uint64

Source code in src/sc_neurocore/accel/vector_ops.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
def pack_bitstream(bitstream: np.ndarray[Any, Any]) -> np.ndarray[Any, Any]:
    """
    Packs a uint8 bitstream (0s and 1s) into uint64 integers.
    This allows processing 64 time steps in parallel.

    Args:
        bitstream: Shape (N,) or (Batch, N) of uint8 {0,1}

    Returns:
        packed: Shape (ceil(N/64),) or (Batch, ceil(N/64)) of uint64
    """
    bitstream = np.asarray(bitstream, dtype=np.uint8)

    if bitstream.ndim == 1:
        # 1D case: single bitstream
        length = bitstream.size
        pad_len = (64 - (length % 64)) % 64
        if pad_len > 0:
            bitstream = np.append(bitstream, np.zeros(pad_len, dtype=np.uint8))

        chunks = bitstream.reshape(-1, 64)
        powers = 1 << np.arange(64, dtype=np.uint64)
        packed = (chunks * powers).sum(axis=1, dtype=np.uint64)
        return packed

    elif bitstream.ndim == 2:
        # 2D case: batch of bitstreams
        batch_size, length = bitstream.shape
        pad_len = (64 - (length % 64)) % 64

        if pad_len > 0:
            padding = np.zeros((batch_size, pad_len), dtype=np.uint8)
            bitstream = np.concatenate([bitstream, padding], axis=1)

        # Reshape to (batch, num_chunks, 64)
        num_chunks = bitstream.shape[1] // 64
        chunks = bitstream.reshape(batch_size, num_chunks, 64)

        powers = 1 << np.arange(64, dtype=np.uint64)
        packed = (chunks * powers).sum(axis=2, dtype=np.uint64)
        return packed

    else:
        raise ValueError(f"Expected 1D or 2D array, got {bitstream.ndim}D")

unpack_bitstream(packed, original_length, original_shape=None)

Unpacks uint64 array back to uint8 bitstream.

Parameters:

Name Type Description Default
packed ndarray[Any, Any]

Packed uint64 array (1D or 2D)

required
original_length int

Total number of bits to extract

required
original_shape Optional[tuple[Any, ...]]

Optional tuple for reshaping output (batch, length)

None

Returns:

Type Description
ndarray[Any, Any]

Unpacked bitstream of shape (original_length,) or original_shape

Source code in src/sc_neurocore/accel/vector_ops.py
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
def unpack_bitstream(
    packed: np.ndarray[Any, Any],
    original_length: int,
    original_shape: Optional[tuple[Any, ...]] = None,
) -> np.ndarray[Any, Any]:
    """
    Unpack a uint64 word array back into a uint8 bitstream (LSB-first).

    Args:
        packed: Packed uint64 array (1D or 2D)
        original_length: Total number of bits to extract
        original_shape: Optional tuple for reshaping output (batch, length)

    Returns:
        Unpacked bitstream of shape (original_length,) or original_shape
    """
    masks = 1 << np.arange(64, dtype=np.uint64)

    if packed.ndim == 1:
        bits = ((packed[:, None] & masks) != 0).astype(np.uint8)
        return bits.ravel()[:original_length]

    if packed.ndim == 2:
        n_rows = packed.shape[0]
        bits = ((packed[..., None] & masks) != 0).astype(np.uint8)
        flat = bits.reshape(n_rows, -1)
        if original_shape is not None:
            # Caller supplied (batch, length): trim each row to that length.
            return flat[:, : original_shape[1]]
        # Otherwise original_length is the total bit count across the batch.
        return flat[:, : original_length // n_rows]

    raise ValueError(f"Expected 1D or 2D packed array, got {packed.ndim}D")

vec_and(a_packed, b_packed)

Bitwise AND on packed arrays. Simulates SC Multiplication.

Source code in src/sc_neurocore/accel/vector_ops.py
 99
100
101
102
103
def vec_and(a_packed: np.ndarray[Any, Any], b_packed: np.ndarray[Any, Any]) -> np.ndarray[Any, Any]:
    """Element-wise AND of packed words — simulates SC multiplication."""
    return a_packed & b_packed

vec_xnor(a_packed, b_packed)

Bitwise XNOR on packed arrays. SC bipolar multiplication: P(A XNOR B) = P(A)P(B) + (1-P(A))(1-P(B)).

Source code in src/sc_neurocore/accel/vector_ops.py
106
107
108
109
110
def vec_xnor(
    a_packed: np.ndarray[Any, Any], b_packed: np.ndarray[Any, Any]
) -> np.ndarray[Any, Any]:
    """Bitwise XNOR on packed arrays. SC bipolar multiplication: P(A XNOR B) = P(A)*P(B) + (1-P(A))*(1-P(B))."""
    # XNOR = complement of XOR.
    return np.invert(a_packed ^ b_packed)

vec_not(packed)

Bitwise NOT on packed arrays. SC complement: P(NOT A) = 1 - P(A).

Source code in src/sc_neurocore/accel/vector_ops.py
113
114
115
def vec_not(packed: np.ndarray[Any, Any]) -> np.ndarray[Any, Any]:
    """Bitwise NOT on packed arrays. SC complement: P(NOT A) = 1 - P(A)."""
    return np.invert(packed)

vec_mux(select_packed, a_packed, b_packed)

Bitwise MUX on packed arrays. SC scaled addition: P(out) = P(sel)P(A) + (1-P(sel))P(B).

When sel is a Bernoulli(0.5) stream, this computes the average (A+B)/2.

Source code in src/sc_neurocore/accel/vector_ops.py
118
119
120
121
122
123
124
125
126
127
def vec_mux(
    select_packed: np.ndarray[Any, Any],
    a_packed: np.ndarray[Any, Any],
    b_packed: np.ndarray[Any, Any],
) -> np.ndarray[Any, Any]:
    """Bitwise MUX on packed arrays. SC scaled addition: P(out) = P(sel)*P(A) + (1-P(sel))*P(B).

    When sel is a Bernoulli(0.5) stream, this computes the average (A+B)/2.
    """
    # Bit-select identity: b ^ ((a ^ b) & sel) picks a where sel=1, b where sel=0.
    return b_packed ^ ((a_packed ^ b_packed) & select_packed)

vec_popcount(packed)

Count total set bits (1s) in the packed array. Used for integration/accumulation.

Source code in src/sc_neurocore/accel/vector_ops.py
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
def vec_popcount(packed: np.ndarray[Any, Any]) -> int:
    """
    Count total set bits (1s) in the packed array.
    Used for integration/accumulation.

    Args:
        packed: Packed word array; values are interpreted as uint64.

    Returns:
        Total number of set bits across all elements, as a Python int.
    """
    # Vectorized SWAR popcount. Cast to uint64 up front so the shifts and
    # masks below are well-defined for any input dtype (mirrors the explicit
    # casts in gpu_backend.gpu_popcount and jax_backend.jax_popcount); the
    # copy keeps the in-place subtraction from mutating the caller's array.
    x = np.asarray(packed, dtype=np.uint64).copy()
    x -= (x >> 1) & 0x5555555555555555  # 2-bit partial sums
    x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333)  # 4-bit sums
    x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0F  # 8-bit sums
    x = (x * 0x0101010101010101) >> 56  # fold byte sums into the top byte
    # int() so the declared return type (a Python int) actually holds.
    return int(np.sum(x))

GPU Backend

sc_neurocore.accel.gpu_backend

to_device(arr)

Move a NumPy array to the active backend (GPU copy or no-op).

Source code in src/sc_neurocore/accel/gpu_backend.py
66
67
68
69
70
def to_device(arr: np.ndarray[Any, Any]) -> xp.ndarray:  # type: ignore
    """Copy a NumPy array onto the active backend device (no-op on CPU)."""
    if not HAS_CUPY:
        return arr
    return cp.asarray(arr)  # pragma: no cover

to_host(arr)

Bring an array back to host RAM as a NumPy array.

Source code in src/sc_neurocore/accel/gpu_backend.py
73
74
75
76
77
def to_host(arr) -> np.ndarray[Any, Any]:  # type: ignore
    """Return *arr* as a host-resident NumPy array."""
    if HAS_CUPY and isinstance(arr, cp.ndarray):  # pragma: no cover
        arr = cp.asnumpy(arr)
    return np.asarray(arr)

gpu_pack_bitstream(bits)

Pack uint8 {0,1} array into uint64 words.

Works on both CuPy and NumPy arrays.

Parameters:

Name Type Description Default
bits ndarray

Shape (N,) or (B, N) of uint8.

required

Returns:

Type Description
ndarray

Packed uint64 array, shape (ceil(N/64),) or (B, ceil(N/64)).

Source code in src/sc_neurocore/accel/gpu_backend.py
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
def gpu_pack_bitstream(bits: xp.ndarray) -> xp.ndarray:  # type: ignore
    """
    Pack uint8 {0,1} array into uint64 words (LSB-first).

    Works on both CuPy and NumPy arrays.

    Args:
        bits: Shape ``(N,)`` or ``(B, N)`` of uint8.

    Returns:
        Packed uint64 array, shape ``(ceil(N/64),)`` or ``(B, ceil(N/64))``.
    """
    _warn_cpu_fallback()
    bits = xp.asarray(bits, dtype=xp.uint8)
    # Bit j of each word carries sample j of its 64-sample chunk.
    weights = xp.uint64(1) << xp.arange(64, dtype=xp.uint64)

    if bits.ndim == 1:
        tail = (64 - bits.size % 64) % 64
        if tail:
            bits = xp.concatenate([bits, xp.zeros(tail, dtype=xp.uint8)])
        return (bits.reshape(-1, 64).astype(xp.uint64) * weights).sum(axis=1)

    if bits.ndim == 2:
        rows = bits.shape[0]
        tail = (64 - bits.shape[1] % 64) % 64
        if tail:
            bits = xp.concatenate(
                [bits, xp.zeros((rows, tail), dtype=xp.uint8)], axis=1
            )
        grouped = bits.reshape(rows, -1, 64).astype(xp.uint64)
        return (grouped * weights).sum(axis=2)

    raise ValueError(f"Expected 1-D or 2-D, got {bits.ndim}-D")

gpu_vec_and(a, b)

Bitwise AND on packed uint64 arrays (SC multiplication).

Source code in src/sc_neurocore/accel/gpu_backend.py
122
123
124
125
def gpu_vec_and(a: xp.ndarray, b: xp.ndarray) -> xp.ndarray:  # type: ignore
    """Bitwise AND on packed uint64 arrays (SC multiplication)."""
    _warn_cpu_fallback()
    return a & b

gpu_popcount(packed)

Vectorised SWAR popcount on uint64 arrays — returns per-element counts.

On CuPy this runs as a fused GPU kernel; on NumPy it uses the same SWAR bit-trick as vector_ops.vec_popcount but returns an array instead of a scalar.

Source code in src/sc_neurocore/accel/gpu_backend.py
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
def gpu_popcount(packed: xp.ndarray) -> xp.ndarray:  # type: ignore
    """
    Vectorised SWAR popcount on uint64 arrays — returns per-element counts.

    On CuPy this runs as a fused GPU kernel; on NumPy it uses the same
    SWAR bit-trick as ``vector_ops.vec_popcount`` but returns an array
    instead of a scalar.
    """
    _warn_cpu_fallback()
    # SWAR masks kept as uint64 scalars so no promotion happens on device.
    pair_mask = xp.uint64(0x5555555555555555)
    nibble_mask = xp.uint64(0x3333333333333333)
    byte_mask = xp.uint64(0x0F0F0F0F0F0F0F0F)
    byte_fold = xp.uint64(0x0101010101010101)

    x = packed.astype(xp.uint64).copy()
    x -= (x >> xp.uint64(1)) & pair_mask  # 2-bit partial sums
    x = (x & nibble_mask) + ((x >> xp.uint64(2)) & nibble_mask)  # 4-bit sums
    x = (x + (x >> xp.uint64(4))) & byte_mask  # 8-bit sums
    return (x * byte_fold) >> xp.uint64(56)  # fold into top byte

gpu_vec_mac(packed_weights, packed_inputs)

GPU-accelerated multiply-accumulate for a dense SC layer.

Parameters:

Name Type Description Default
packed_weights ndarray

(n_neurons, n_inputs, n_words) uint64

required
packed_inputs ndarray

(n_inputs, n_words) uint64

required

Returns:

Type Description
ndarray

(n_neurons,) total bit counts (= SC dot products).

Source code in src/sc_neurocore/accel/gpu_backend.py
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
def gpu_vec_mac(
    packed_weights: xp.ndarray,  # type: ignore
    packed_inputs: xp.ndarray,  # type: ignore
) -> xp.ndarray:  # type: ignore
    """
    GPU-accelerated multiply-accumulate for a dense SC layer.

    Args:
        packed_weights: ``(n_neurons, n_inputs, n_words)`` uint64
        packed_inputs:  ``(n_inputs, n_words)`` uint64

    Returns:
        ``(n_neurons,)`` total bit counts (= SC dot products).
    """
    _warn_cpu_fallback()
    # SC multiply: AND each neuron's weight streams against the shared
    # inputs via broadcasting, (N, I, W) & (1, I, W) -> (N, I, W).
    anded = xp.bitwise_and(packed_weights, xp.expand_dims(packed_inputs, 0))
    # Accumulate: per-word popcount, reduced over inputs and words.
    return gpu_popcount(anded).sum(axis=(1, 2))

JAX Backend

sc_neurocore.accel.jax_backend

JAX backend for SC-NeuroCore.

Provides JAX-accelerated primitives for stochastic computing, unlocking automatic differentiation, JIT compilation (XLA), and native TPU/GPU scaling.

Usage::

from sc_neurocore.accel.jax_backend import jnp, HAS_JAX, to_jax, to_host
from sc_neurocore.accel.jax_backend import jax_pack_bitstream, jax_vec_mac

if HAS_JAX:
    bits = jnp.array([1, 0, 1, 1], dtype=jnp.uint8)
    packed = jax_pack_bitstream(bits)

to_jax(arr)

Move a NumPy array to the JAX device.

Source code in src/sc_neurocore/accel/jax_backend.py
53
54
55
56
57
def to_jax(arr: Any) -> Any:
    """Move a NumPy array to the JAX device (identity when JAX is absent)."""
    return jnp.asarray(arr) if HAS_JAX else arr

to_host(arr)

Bring a JAX array back to host RAM as a NumPy array.

Source code in src/sc_neurocore/accel/jax_backend.py
60
61
62
63
64
def to_host(arr: Any) -> np.ndarray[Any, Any]:
    """Bring a JAX array back to host RAM as a NumPy array.

    ``np.asarray`` handles JAX arrays (via the ``__array__`` protocol) and
    plain array-likes alike, so no backend check is needed — the previous
    ``HAS_JAX``/``isinstance`` branch returned the identical expression on
    both paths and has been removed.
    """
    return np.asarray(arr)

jax_pack_bitstream(bits)

Pack uint8 {0,1} array into uint64 words using JAX.

Source code in src/sc_neurocore/accel/jax_backend.py
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
def jax_pack_bitstream(bits: Any) -> Any:
    """
    Pack uint8 {0,1} array into uint64 words using JAX.
    """
    if not HAS_JAX:
        from sc_neurocore.exceptions import SCDependencyError

        raise SCDependencyError("JAX is not available.")

    bits = jnp.asarray(bits, dtype=jnp.uint8)

    # Dispatch on dimensionality to the specialised JIT-compiled packers.
    packers = {1: _jax_pack_1d, 2: _jax_pack_2d}
    packer = packers.get(bits.ndim)
    if packer is None:
        from sc_neurocore.exceptions import SCEncodingError

        raise SCEncodingError(f"Expected 1-D or 2-D, got {bits.ndim}-D")
    return packer(bits)

jax_vec_and(a, b)

Bitwise AND on packed uint64 arrays (SC multiplication).

Source code in src/sc_neurocore/accel/jax_backend.py
118
119
120
121
@jax.jit
def jax_vec_and(a: jax.Array, b: jax.Array) -> jax.Array:
    """Bitwise AND on packed uint64 arrays (SC multiplication)."""
    return a & b

jax_popcount(packed)

Vectorised SWAR popcount on uint64 arrays using JAX.

Source code in src/sc_neurocore/accel/jax_backend.py
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
@jax.jit
def jax_popcount(packed: jax.Array) -> jax.Array:
    """
    Vectorised SWAR popcount on uint64 arrays using JAX.
    """
    # SWAR masks as uint64 scalars (requires jax x64 mode for full width).
    pair_mask = jnp.uint64(0x5555555555555555)
    nibble_mask = jnp.uint64(0x3333333333333333)
    byte_mask = jnp.uint64(0x0F0F0F0F0F0F0F0F)
    byte_fold = jnp.uint64(0x0101010101010101)

    x = packed.astype(jnp.uint64)
    x = x - ((x >> jnp.uint64(1)) & pair_mask)  # 2-bit partial sums
    x = (x & nibble_mask) + ((x >> jnp.uint64(2)) & nibble_mask)  # 4-bit sums
    x = (x + (x >> jnp.uint64(4))) & byte_mask  # 8-bit sums
    counts: jax.Array = (x * byte_fold) >> jnp.uint64(56)  # fold into top byte
    return counts

jax_vec_mac(packed_weights, packed_inputs)

JAX-accelerated multiply-accumulate for a dense SC layer.

Source code in src/sc_neurocore/accel/jax_backend.py
140
141
142
143
144
145
146
147
148
@jax.jit
def jax_vec_mac(packed_weights: jax.Array, packed_inputs: jax.Array) -> jax.Array:
    """
    JAX-accelerated multiply-accumulate for a dense SC layer.
    """
    # Broadcast AND over the neuron axis, then popcount and reduce.
    anded = jnp.bitwise_and(packed_weights, packed_inputs[jnp.newaxis, :, :])
    per_word = jax_popcount(anded)
    totals: jax.Array = jnp.sum(per_word, axis=(1, 2))
    return totals

jax_lif_step(v, I_t, v_rest, v_reset, v_threshold, alpha, resistance, noise)

Vectorized LIF step using JAX.

dv = (v_rest - v) * alpha + I_t * resistance + noise

Source code in src/sc_neurocore/accel/jax_backend.py
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
@jax.jit
def jax_lif_step(
    v: jax.Array,
    I_t: jax.Array,
    v_rest: float,
    v_reset: float,
    v_threshold: float,
    alpha: float,
    resistance: float,
    noise: jax.Array,
) -> tuple[jax.Array, jax.Array]:
    """
    Vectorized LIF step using JAX.

    dv = (v_rest - v) * alpha + I_t * resistance + noise
    """
    # Integrate, then hard-reset wherever the threshold was crossed.
    dv = alpha * (v_rest - v) + resistance * I_t + noise
    candidate = v + dv
    fired = candidate >= v_threshold
    return jnp.where(fired, v_reset, candidate), fired.astype(jnp.uint8)

jax_forward_pass(weights, x, n_steps, v_rest=0.0, v_reset=0.0, v_threshold=1.0, alpha=0.9)

Multi-layer SNN forward pass with LIF neurons.

Returns (spike_trains_per_layer, final_membrane_potentials). Each layer: s = Heaviside(v - threshold), v = alpha * v * (1-s) + W @ s_prev

Source code in src/sc_neurocore/accel/jax_backend.py
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
def jax_forward_pass(
    weights: list[jax.Array],
    x: jax.Array,
    n_steps: int,
    v_rest: float = 0.0,
    v_reset: float = 0.0,
    v_threshold: float = 1.0,
    alpha: float = 0.9,
) -> tuple[list[jax.Array], jax.Array]:
    """
    Multi-layer SNN forward pass with LIF neurons.

    Each layer integrates for ``n_steps``::

        v <- alpha * v + spikes_in @ W.T
        s  = Heaviside(v - v_threshold)
        v <- v_reset where s fired   (hard reset — this is the "(1-s)" term)

    The input passed to the next layer is the mean firing rate over time.

    Args:
        weights: Per-layer weight matrices, each ``(n_out, n_in)``.
        x: Input batch, ``(batch, n_in)``.
        n_steps: Integration time steps per layer.
        v_rest: Initial membrane potential.
        v_reset: Post-spike reset potential.
        v_threshold: Firing threshold.
        alpha: Leak/decay factor per step.

    Returns:
        (spike_trains_per_layer, final_membrane_potentials): each spike
        train has shape ``(n_steps, batch, n_out)``; the potentials are
        those of the last layer after the final step.
    """
    batch = x.shape[0]
    spikes = x
    all_spikes = []

    for W in weights:
        n_out = W.shape[0]
        v = jnp.full((batch, n_out), v_rest)
        layer_spikes = []

        for _t in range(n_steps):
            current = spikes @ W.T
            # Leaky integration. (Previously the decay was scaled by
            # (1 - v_reset), mixing the reset *value* into the leak factor;
            # the jnp.where hard reset below is what implements the reset.
            # Unchanged for the default v_reset=0.0.)
            v = alpha * v + current
            s = (v >= v_threshold).astype(jnp.float32)
            v = jnp.where(s > 0.5, v_reset, v)
            layer_spikes.append(s)

        # Output spikes = mean firing rate over time
        spikes = jnp.stack(layer_spikes, axis=0).mean(axis=0)
        all_spikes.append(jnp.stack(layer_spikes, axis=0))

    return all_spikes, v

jax_surrogate_gradient_step(weights, x, targets, n_steps=25, lr=0.001, beta=10.0)

One training step with surrogate gradient (fast sigmoid).

Uses jax.grad on a cross-entropy loss over mean output spike rates. Returns (updated_weights, loss_value).

Source code in src/sc_neurocore/accel/jax_backend.py
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
def jax_surrogate_gradient_step(
    weights: list[jax.Array],
    x: jax.Array,
    targets: jax.Array,
    n_steps: int = 25,
    lr: float = 1e-3,
    beta: float = 10.0,
) -> tuple[list[jax.Array], float]:
    """
    One training step with surrogate gradient (fast sigmoid).

    Uses jax.grad on a cross-entropy loss over mean output spike rates.
    Returns (updated_weights, loss_value).

    Args:
        weights: Per-layer weight matrices, each (n_out, n_in).
        x: Input batch — presumably (batch, n_in) rates; confirm against caller.
        targets: Target distribution over classes, (batch, n_classes);
            assumed one-hot or soft labels — TODO confirm.
        n_steps: LIF integration steps per layer.
        lr: SGD learning rate.
        beta: Sharpness of the fast-sigmoid surrogate.
    """

    def loss_fn(ws):
        # Full multi-layer SNN forward pass lives inside the loss so
        # jax.value_and_grad differentiates through every time step.
        batch = x.shape[0]
        spikes_in = x
        for W in ws:
            n_out = W.shape[0]
            v = jnp.zeros((batch, n_out))
            spike_sum = jnp.zeros((batch, n_out))
            for _t in range(n_steps):
                current = spikes_in @ W.T
                # Leaky integration with fixed decay 0.9 and threshold 1.0
                # (hard-coded here, unlike jax_forward_pass's parameters).
                v = 0.9 * v + current
                # Fast sigmoid surrogate: σ(β(v-θ)) / β
                sg = 1.0 / (1.0 + jnp.abs(beta * (v - 1.0)))
                # Accumulates the *surrogate* spike values, so the rate
                # below is differentiable.
                spike_sum = spike_sum + sg
                # Straight-through estimator: hard reset forward, surrogate backward
                spike_hard = (v >= 1.0).astype(v.dtype)
                spike_st = sg + jax.lax.stop_gradient(spike_hard - sg)
                # Soft reset: scale v down by (1 - spike); with a hard spike
                # of 1 this zeroes the potential on the forward pass.
                v = v * (1.0 - spike_st)
            spikes_in = spike_sum / n_steps
        logits = spikes_in
        # Stable log-softmax via logsumexp, then mean cross-entropy.
        log_softmax = logits - jax.nn.logsumexp(logits, axis=-1, keepdims=True)
        ce = -jnp.sum(targets * log_softmax) / batch
        return ce

    loss_val, grads = jax.value_and_grad(loss_fn)(weights)
    # Plain SGD step; float() pulls the loss scalar back to the host.
    updated = [w - lr * g for w, g in zip(weights, grads)]
    return updated, float(loss_val)

JIT Kernels

sc_neurocore.accel.jit_kernels

jit_pack_bits(bitstream, packed_arr)

Packs a uint8 bitstream into uint64 array. bitstream: (N,) uint8 {0, 1} packed_arr: (N//64,) uint64

Source code in src/sc_neurocore/accel/jit_kernels.py
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
@jit(nopython=True)
def jit_pack_bits(
    bitstream: np.ndarray[Any, Any], packed_arr: np.ndarray[Any, Any]
) -> None:  # pragma: no cover
    """
    Pack a uint8 {0,1} bitstream into a preallocated uint64 array, in place.

    bitstream: (N,) uint8 {0, 1}
    packed_arr: (N//64,) uint64
    """
    one = np.uint64(1)
    for word_idx in range(bitstream.size // 64):
        word = np.uint64(0)
        offset = word_idx * 64
        for bit in range(64):
            # Set bit `bit` of the current word for each nonzero sample.
            if bitstream[offset + bit] != 0:
                word |= one << np.uint64(bit)
        packed_arr[word_idx] = word

jit_vec_mac(packed_weights, packed_inputs, outputs)

Vectorized Multiply-Accumulate (MAC). Simulates: Output[i] = Sum(Weights[i] AND Inputs) weights: (n_neurons, n_inputs, n_words) inputs: (n_inputs, n_words) outputs: (n_neurons,)

Source code in src/sc_neurocore/accel/jit_kernels.py
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
@jit(nopython=True)
def jit_vec_mac(  # type: ignore
    packed_weights: np.ndarray[Any, Any],
    packed_inputs: np.ndarray[Any, Any],
    outputs: np.ndarray[Any, Any],
):  # pragma: no cover
    """
    Vectorized Multiply-Accumulate (MAC) for a dense SC layer.

    Simulates: Output[i] = Sum(Weights[i] AND Inputs)
    weights: (n_neurons, n_inputs, n_words)
    inputs: (n_inputs, n_words)
    outputs: (n_neurons,)
    """
    n_neurons = packed_weights.shape[0]
    n_inputs = packed_weights.shape[1]
    n_words = packed_weights.shape[2]

    # SWAR popcount constants, hoisted as uint64 (Numba nopython-safe).
    m1 = np.uint64(0x5555555555555555)
    m2 = np.uint64(0x3333333333333333)
    m4 = np.uint64(0x0F0F0F0F0F0F0F0F)
    h01 = np.uint64(0x0101010101010101)
    s1 = np.uint64(1)
    s2 = np.uint64(2)
    s4 = np.uint64(4)
    s56 = np.uint64(56)

    for neuron in range(n_neurons):
        acc = 0
        for inp in range(n_inputs):
            for word in range(n_words):
                # Bitwise AND = SC multiplication
                x = packed_weights[neuron, inp, word] & packed_inputs[inp, word]
                # SWAR popcount (Hamming weight) of the 64-bit word
                x = x - ((x >> s1) & m1)
                x = (x & m2) + ((x >> s2) & m2)
                x = (x + (x >> s4)) & m4
                x = (x * h01) >> s56
                acc += x
        outputs[neuron] = acc

MPI Driver

sc_neurocore.accel.mpi_driver

MPIDriver

Distributed SC-NeuroCore Driver using MPI. Handles partitioning and synchronization of bitstreams across cluster nodes.

Source code in src/sc_neurocore/accel/mpi_driver.py
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
class MPIDriver:
    """
    Distributed SC-NeuroCore Driver using MPI.
    Handles partitioning and synchronization of bitstreams across cluster nodes.

    When mpi4py is unavailable (or only one rank is running), every method
    degrades to a single-node no-op so callers need no special-casing.
    """

    def __init__(self) -> None:
        # comm: MPI communicator (None without MPI); rank/size default to
        # a single-node layout so the fallbacks below work uniformly.
        if HAS_MPI:  # pragma: no cover
            self.comm = MPI.COMM_WORLD
            self.rank = self.comm.Get_rank()
            self.size = self.comm.Get_size()
        else:
            self.comm = None
            self.rank = 0
            self.size = 1

    def scatter_workload(self, global_inputs: np.ndarray[Any, Any]) -> np.ndarray[Any, Any]:
        """
        Distributes a large input array across nodes.
        Splits along axis 0 (Batch or Neurons).

        Note: len(global_inputs) should be divisible by the node count;
        remainder rows are dropped by the integer division below.
        """
        if not HAS_MPI or self.size == 1:
            return global_inputs

        # MPI multi-node path  # pragma: no cover
        total_len = len(global_inputs)  # pragma: no cover
        chunk_size = total_len // self.size  # pragma: no cover
        # Per-rank receive buffer for this node's slice of the root array.
        local_input = np.zeros(chunk_size, dtype=global_inputs.dtype)  # pragma: no cover
        self.comm.Scatter(global_inputs, local_input, root=0)  # pragma: no cover
        return local_input  # pragma: no cover

    def gather_results(self, local_results: np.ndarray[Any, Any]) -> np.ndarray[Any, Any]:
        """
        Collects results from all nodes to Root.

        Rank 0 returns the concatenated results; other ranks return an
        empty array with the same dtype as their local results.
        """
        if not HAS_MPI or self.size == 1:
            return local_results

        # MPI multi-node path  # pragma: no cover
        total_len = len(local_results) * self.size  # pragma: no cover
        global_results = None  # pragma: no cover
        if self.rank == 0:  # pragma: no cover
            global_results = np.zeros(total_len, dtype=local_results.dtype)  # pragma: no cover
        self.comm.Gather(local_results, global_results, root=0)  # pragma: no cover
        if global_results is None:
            # Non-root ranks: empty result matching the input dtype
            # (previously this silently defaulted to float64).
            return np.zeros(0, dtype=local_results.dtype)
        return global_results

    def barrier(self) -> None:
        """Synchronize all nodes (no-op when MPI is unavailable)."""
        if HAS_MPI:  # pragma: no cover
            self.comm.Barrier()

scatter_workload(global_inputs)

Distributes a large input array across nodes. Splits along axis 0 (Batch or Neurons).

Source code in src/sc_neurocore/accel/mpi_driver.py
37
38
39
40
41
42
43
44
45
46
47
48
49
50
def scatter_workload(self, global_inputs: np.ndarray[Any, Any]) -> np.ndarray[Any, Any]:
    """
    Distributes a large input array across nodes.
    Splits along axis 0 (Batch or Neurons).

    Single-node path (no MPI, or world size 1) returns the input unchanged.
    NOTE(review): lengths not divisible by self.size lose the remainder
    rows via the integer division — confirm callers pad accordingly.
    """
    if not HAS_MPI or self.size == 1:
        return global_inputs

    # MPI multi-node path  # pragma: no cover
    total_len = len(global_inputs)  # pragma: no cover
    chunk_size = total_len // self.size  # pragma: no cover
    # Per-rank receive buffer for this node's slice of the root array.
    local_input = np.zeros(chunk_size, dtype=global_inputs.dtype)  # pragma: no cover
    self.comm.Scatter(global_inputs, local_input, root=0)  # pragma: no cover
    return local_input  # pragma: no cover

gather_results(local_results)

Collects results from all nodes to Root.

Source code in src/sc_neurocore/accel/mpi_driver.py
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
def gather_results(self, local_results: np.ndarray[Any, Any]) -> np.ndarray[Any, Any]:
    """
    Collects results from all nodes to Root.

    Rank 0 receives the concatenated array; non-root ranks get an empty
    array. NOTE(review): that empty array uses np.zeros(0)'s default
    float64 dtype rather than local_results.dtype — confirm intended.
    """
    if not HAS_MPI or self.size == 1:
        return local_results

    # MPI multi-node path  # pragma: no cover
    total_len = len(local_results) * self.size  # pragma: no cover
    global_results = None  # pragma: no cover
    if self.rank == 0:  # pragma: no cover
        # Root-only receive buffer; other ranks pass None to Gather.
        global_results = np.zeros(total_len, dtype=local_results.dtype)  # pragma: no cover
    self.comm.Gather(local_results, global_results, root=0)  # pragma: no cover
    if global_results is None:
        return np.zeros(0)
    return global_results

barrier()

Synchronize all nodes.

Source code in src/sc_neurocore/accel/mpi_driver.py
69
70
71
72
def barrier(self) -> None:
    """Synchronize all nodes (no-op when MPI is unavailable)."""
    if HAS_MPI:  # pragma: no cover
        self.comm.Barrier()