Skip to content

Transformers

SC-native transformer blocks built on stochastic attention.

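  • StochasticTransformerBlock — S-Former: spiking transformer with bitstream multi-head attention. Architecture: Input -> SC Multi-Head Attention -> Add & Norm -> SC Dense FF -> Add & Norm -> Output. Softmax approximated via CORDIV. Note: the implementation shown below currently builds a single attention head over the full d_model width; n_heads is accepted but not yet used.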
from sc_neurocore.transformers import StochasticTransformerBlock

block = StochasticTransformerBlock(d_model=64, n_heads=4, length=256)
output = block.forward(input_sequence)

See Tutorial 54: Spiking Transformers.

sc_neurocore.transformers.block

StochasticTransformerBlock dataclass

Spiking Transformer Block (S-Former). Structure: Input -> Multi-Head Attention -> Add & Norm -> Feed Forward -> Add & Norm -> Output

Source code in src/sc_neurocore/transformers/block.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
@dataclass
class StochasticTransformerBlock:
    """
    Spiking Transformer Block (S-Former).

    Structure:
    Input -> Attention -> Add & Norm -> Feed Forward -> Add & Norm -> Output

    Both "Add & Norm" stages are implemented as an equal-weight average of
    the residual and the sub-layer output; the first average is additionally
    clipped to [0, 1].

    Attributes:
        d_model: Feature width of each token.
        n_heads: Requested number of attention heads. Currently unused: the
            block builds a single attention head over the full ``d_model``.
        length: Forwarded as ``length`` to both feed-forward layers.
    """

    d_model: int
    n_heads: int
    length: int = 1024

    def __post_init__(self) -> None:
        """Build the attention and the two feed-forward sub-layers."""
        # We simplify Multi-Head to Single-Head for this demo, so ``n_heads``
        # is stored on the instance but does not influence the computation.
        self.attention = StochasticAttention(dim_k=self.d_model)

        # Feed Forward Network (FFN)
        # 2-layer MLP: d_model -> 4*d_model -> d_model
        # We use our Vectorized Layer for efficiency
        self.ffn_1 = VectorizedSCLayer(
            n_inputs=self.d_model, n_neurons=4 * self.d_model, length=self.length
        )
        self.ffn_2 = VectorizedSCLayer(
            n_inputs=4 * self.d_model, n_neurons=self.d_model, length=self.length
        )

    def forward(self, x: np.ndarray[Any, Any]) -> np.ndarray[Any, Any]:
        """
        Run one block pass over ``x``.

        x: (d_model,) or (Sequence_Length, d_model); array-likes such as
        lists are accepted and converted. Returns an array of the same shape.
        """
        # Accept array-likes; an ndarray (or subclass) passes through untouched.
        x = np.asanyarray(x)
        input_1d = x.ndim == 1
        attn_out = self.attention.forward(Q=x, K=x, V=x)

        # Match shapes for residual: attention may add a batch dim
        if input_1d and attn_out.ndim > 1:
            attn_out = attn_out.reshape(-1)[: x.shape[0]]

        # First residual: average of input and attention, kept inside [0, 1].
        res1 = np.clip(0.5 * x + 0.5 * attn_out, 0.0, 1.0)

        # Position-wise FFN: apply same weights to each token
        def _ffn(token: np.ndarray) -> np.ndarray:
            # The layers are fed plain Python lists; ``np.clip`` always yields
            # a NumPy object, so ``tolist`` is available on ``h``.
            h = np.clip(self.ffn_1.forward(token.tolist()), 0.0, 1.0)
            # Normalise the result to an array so the residual arithmetic
            # below works even when the layer hands back a plain list.
            return np.asanyarray(self.ffn_2.forward(h.tolist()))

        if res1.ndim > 1:
            ff_out = np.zeros_like(res1)
            for t, token in enumerate(res1):
                ff_out[t] = _ffn(token)
        else:
            ff_out = _ffn(res1)

        # Second residual: plain average (not clipped).
        return 0.5 * res1 + 0.5 * ff_out

forward(x)

x: (d_model,) or (Sequence_Length, d_model). Returns same shape.

Source code in src/sc_neurocore/transformers/block.py
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
def forward(self, x: np.ndarray[Any, Any]) -> np.ndarray[Any, Any]:
    """
    Run one block pass over ``x``.

    x: (d_model,) or (Sequence_Length, d_model); array-likes such as
    lists are accepted and converted. Returns an array of the same shape.
    """
    # Accept array-likes; an ndarray (or subclass) passes through untouched.
    x = np.asanyarray(x)
    input_1d = x.ndim == 1
    attn_out = self.attention.forward(Q=x, K=x, V=x)

    # Match shapes for residual: attention may add a batch dim
    if input_1d and attn_out.ndim > 1:
        attn_out = attn_out.reshape(-1)[: x.shape[0]]

    # First residual: average of input and attention, kept inside [0, 1].
    res1 = np.clip(0.5 * x + 0.5 * attn_out, 0.0, 1.0)

    # Position-wise FFN: apply same weights to each token
    def _ffn(token: np.ndarray) -> np.ndarray:
        # The layers are fed plain Python lists; ``np.clip`` always yields
        # a NumPy object, so ``tolist`` is available on ``h``.
        h = np.clip(self.ffn_1.forward(token.tolist()), 0.0, 1.0)
        # Normalise the result to an array so the residual arithmetic
        # below works even when the layer hands back a plain list.
        return np.asanyarray(self.ffn_2.forward(h.tolist()))

    if res1.ndim > 1:
        ff_out = np.zeros_like(res1)
        for t, token in enumerate(res1):
            ff_out[t] = _ffn(token)
    else:
        ff_out = _ffn(res1)

    # Second residual: plain average (not clipped).
    return 0.5 * res1 + 0.5 * ff_out