DALL-E generated image. It's on fire because it's so fast!

Neural Network in C Part 3: Assembly VS C

Our Neural Network is fast but we can make it even faster.

We will convert the code to assembly, then race the two side by side! This really brings back memories of classes at SFSU. I remember long hours of coding assembly for one of my elective classes. It can be fun to go line by line searching for that assembly bug.

Here is a command to generate assembly code from our neural_network.c file. Note it uses gcc.

gcc -S -O3 -o neural_network.s neural_network.c        

Oh no! The compiler printed some warnings.

(.venv) bot@botnet:~/PycharmProjects/neural$ gcc -S -O3 -o neural_network.s neural_network.c
neural_network.c: In function ‘read_uint32’:
neural_network.c:128:5: warning: ignoring return value of ‘fread’ declared with attribute ‘warn_unused_result’ [-Wunused-result]
  128 |     fread(&result, sizeof(result), 1, f);
      |     ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
neural_network.c: In function ‘load_mnist_images’:
neural_network.c:146:5: warning: ignoring return value of ‘fread’ declared with attribute ‘warn_unused_result’ [-Wunused-result]
  146 |     fread(images, *image_size, *num_images, f);
      |     ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
neural_network.c: In function ‘load_mnist_labels’:
neural_network.c:163:5: warning: ignoring return value of ‘fread’ declared with attribute ‘warn_unused_result’ [-Wunused-result]
  163 |     fread(labels, 1, *num_labels, f);
      |     ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
(.venv) bot@botnet:~/PycharmProjects/neural$ gc        

The return value of fread is being ignored. It should be checked to make sure the read operation actually succeeded, so we need to make some changes. First, we change uint32_t read_uint32(FILE *f).

/*
 * Read one 32-bit value from f and return it with its byte order swapped
 * (the file stores the value in the opposite byte order from the x86-64 host).
 *
 * f: open input stream positioned at the 4 bytes to read.
 * Returns the byte-swapped value. If fewer than 4 bytes can be read, prints
 * a message with perror and terminates the program with exit status 1.
 */
uint32_t read_uint32(FILE *f) {
    uint32_t result;
    /* fread returns the number of complete items read; anything but 1 is a failure. */
    if (fread(&result, sizeof(result), 1, f) != 1) {
        perror("Failed to read uint32_t");
        exit(1);
    }
    return __builtin_bswap32(result);
}

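In addition, the fread call that fills uint8_t *labels in load_mnist_labels must change. (The fread of the images in load_mnist_images gets the same kind of check, with its own "Failed to read images" message.)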

  /* Allocate one byte per label, then read them all in a single call. */
  uint8_t *labels = (uint8_t*)malloc(*num_labels);
  /* fread returns the number of items read; a short read means a bad or truncated file. */
  if (fread(labels, 1, *num_labels, f) != (size_t)(*num_labels)) {
      perror("Failed to read labels");
      exit(1);
  }

And that should be it.

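Let's save the new code as new_neural_network.c and regenerate the assembly with the same command as before (gcc -S -O3 -o new_neural_network.s new_neural_network.c). Then use the assembler (as) to assemble the .s file into an object file (.o).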

as -o new_neural_network.o new_neural_network.s        

Use gcc to link the object file and create the executable (gcc runs the linker, ld, for us and adds the C runtime; -lm links the math library).

gcc -o new_neural_network new_neural_network.o -lm        

Now for the race! Let's run the executables built from neural_network.c and new_neural_network.c side by side!

I have two terminal windows up for this. Let's make sure we have 10 epochs for each; we have to make the race somewhat fair.

I'm so confident in our new assembly build that I gave ./neural_network about a one-second head start!

About 3 minutes into the race, ./new_neural_network is working on its third epoch. Will ./neural_network catch up?

Oh boy, 6 minutes and 10 seconds in, ./new_neural_network is on its 8th epoch!

./new_neural_network wins! It took 7 minutes and 51 seconds! Accuracy is 94.05%!

./neural_network took way longer: 11 minutes and 45 seconds. Accuracy is 93.82%.

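Here is the file new_neural_network.s, our speed demon. Note that the listing below lost some lines in formatting: the label lines (for example initialize_layer: and .L6:, which the code still references) and the ret instructions are missing, so grab the file from the repo if you want to assemble it yourself.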

	# read_uint32.part.0 -- error path shared by the file readers: it is called
	# from read_uint32, load_mnist_images and load_mnist_labels. It calls
	# perror with the address .LC0 and then exit(1); neither call returns here.
	# NOTE(review): the label lines (".LC0:" for the string, presumably, and
	# "read_uint32.part.0:" for the code) are not present in this listing.
	.file	"new_neural_network.c"
	.section	.rodata.str1.1,"aMS",@progbits,1
	.string	"Failed to read uint32_t"
	.p2align 4
	.type	read_uint32.part.0, @function
	pushq	%rax
	.cfi_def_cfa_offset 16
	popq	%rax
	.cfi_def_cfa_offset 8
	# rdi = first argument of perror
	leaq	.LC0(%rip), %rdi
	# keeps rsp 16-byte aligned at the calls below
	subq	$8, %rsp
	.cfi_def_cfa_offset 16
	call	perror@PLT
	# exit(1)
	movl	$1, %edi
	call	exit@PLT
	.size	read_uint32.part.0, .-read_uint32.part.0
	# initialize_layer -- seeds the C random generator with srand(time(0)) and
	# fills two parameter blocks with values computed as
	#     2 * (double)rand() / .LC1 - .LC2
	# In:  rdi = first block  (802816 bytes of doubles, then 1024 more bytes)
	#      rsi = second block (10240 bytes of doubles, then 80 more bytes)
	# presumably .LC1 is RAND_MAX and .LC2 is 1.0, giving values in [-1, 1]
	# -- TODO confirm against the constant pool.
	# NOTE(review): the entry label, the loop labels .L5/.L6/.L8/.L9/.L10 and
	# the final ret are not present in this listing.
	.p2align 4
	.globl	initialize_layer
	.type	initialize_layer, @function
	pushq	%r14
	.cfi_def_cfa_offset 16
	.cfi_offset 14, -16
	pushq	%r13
	.cfi_def_cfa_offset 24
	.cfi_offset 13, -24
	pushq	%r12
	.cfi_def_cfa_offset 32
	.cfi_offset 12, -32
	# r12 = second block (rsi)
	movq	%rsi, %r12
	pushq	%rbp
	.cfi_def_cfa_offset 40
	.cfi_offset 6, -40
	# rbp = first block (rdi)
	movq	%rdi, %rbp
	# edi = 0, the argument of time()
	xorl	%edi, %edi
	pushq	%rbx
	.cfi_def_cfa_offset 48
	.cfi_offset 3, -48
	# r13 = end marker for the outer loop, rbx = end of the first 1024-byte row
	leaq	803840(%rbp), %r13
	leaq	1024(%rbp), %rbx
	call	time@PLT
	movl	%eax, %edi
	call	srand@PLT
	# first nest: rows of 1024 bytes (128 doubles), 784 rows in total
	leaq	-1024(%rbx), %r14
	.p2align 4,,10
	.p2align 3
	call	rand@PLT
	pxor	%xmm0, %xmm0
	addq	$8, %r14
	cvtsi2sdl	%eax, %xmm0
	divsd	.LC1(%rip), %xmm0
	# xmm0 = 2 * xmm0
	addsd	%xmm0, %xmm0
	subsd	.LC2(%rip), %xmm0
	movsd	%xmm0, -8(%r14)
	cmpq	%rbx, %r14
	jne	.L6
	leaq	1024(%r14), %rbx
	cmpq	%r13, %rbx
	jne	.L5
	# rbp now walks the 128 doubles stored after the first 802816 bytes
	addq	$802816, %rbp
	leaq	80(%r12), %rbx
	leaq	10320(%r12), %r13
	# second nest: per iteration one double at (rbp) plus one 80-byte row
	# (10 doubles) of the second block
	call	rand@PLT
	pxor	%xmm0, %xmm0
	leaq	-80(%rbx), %r14
	cvtsi2sdl	%eax, %xmm0
	divsd	.LC1(%rip), %xmm0
	addsd	%xmm0, %xmm0
	subsd	.LC2(%rip), %xmm0
	movsd	%xmm0, 0(%rbp)
	.p2align 4,,10
	.p2align 3
	call	rand@PLT
	pxor	%xmm0, %xmm0
	addq	$8, %r14
	cvtsi2sdl	%eax, %xmm0
	divsd	.LC1(%rip), %xmm0
	addsd	%xmm0, %xmm0
	subsd	.LC2(%rip), %xmm0
	movsd	%xmm0, -8(%r14)
	cmpq	%rbx, %r14
	jne	.L8
	leaq	80(%r14), %rbx
	addq	$8, %rbp
	cmpq	%r13, %rbx
	jne	.L9
	# last loop: the 10 doubles stored after the first 10240 bytes of the second block
	addq	$10240, %r12
	call	rand@PLT
	pxor	%xmm0, %xmm0
	addq	$8, %r12
	cvtsi2sdl	%eax, %xmm0
	divsd	.LC1(%rip), %xmm0
	addsd	%xmm0, %xmm0
	subsd	.LC2(%rip), %xmm0
	movsd	%xmm0, -8(%r12)
	cmpq	%r13, %r12
	jne	.L10
	popq	%rbx
	.cfi_def_cfa_offset 40
	popq	%rbp
	.cfi_def_cfa_offset 32
	popq	%r12
	.cfi_def_cfa_offset 24
	popq	%r13
	.cfi_def_cfa_offset 16
	popq	%r14
	.cfi_def_cfa_offset 8
	.size	initialize_layer, .-initialize_layer
	# sigmoid -- In: xmm0 = x (double). Out: xmm0 = .LC2 / (.LC2 + exp(x XOR .LC3)).
	# presumably .LC3 is the sign-bit mask and .LC2 is 1.0, i.e. 1 / (1 + exp(-x))
	# -- TODO confirm against the constant pool.
	# NOTE(review): the "sigmoid:" label and the final ret are not present in this listing.
	.p2align 4
	.globl	sigmoid
	.type	sigmoid, @function
	# keeps rsp 16-byte aligned for the call to exp
	subq	$8, %rsp
	.cfi_def_cfa_offset 16
	xorpd	.LC3(%rip), %xmm0
	call	exp@PLT
	movsd	.LC2(%rip), %xmm1
	addq	$8, %rsp
	.cfi_def_cfa_offset 8
	# xmm0 = exp(...) + .LC2
	addsd	%xmm1, %xmm0
	# xmm1 = .LC2 / xmm0, then moved to the return register
	divsd	%xmm0, %xmm1
	movapd	%xmm1, %xmm0
	.size	sigmoid, .-sigmoid
	# sigmoid_derivative -- In: xmm0 = x (double). Out: xmm0 = (.LC2 - x) * x.
	# presumably .LC2 is 1.0, i.e. x * (1 - x) -- TODO confirm.
	# NOTE(review): the entry label and the final ret are not present in this listing.
	.p2align 4
	.globl	sigmoid_derivative
	.type	sigmoid_derivative, @function
	movapd	%xmm0, %xmm1
	movsd	.LC2(%rip), %xmm0
	subsd	%xmm1, %xmm0
	mulsd	%xmm1, %xmm0
	.size	sigmoid_derivative, .-sigmoid_derivative
	# forward_pass -- computes two layers of activations.
	# In:  rdi = first parameter block (802816 bytes of doubles + 128 doubles)
	#      rsi = second parameter block (10240 bytes of doubles + 10 doubles)
	#      rdx = input bytes (784 are read)
	#      rcx = buffer receiving 128 doubles (first-layer results)
	#      r8  = buffer receiving 10 doubles (second-layer results)
	# Each result is .LC2 / (.LC2 + exp(sum XOR .LC3)); presumably the logistic
	# sigmoid of the weighted sum -- TODO confirm the constants.
	# NOTE(review): the entry label, the loop labels .L20-.L23 and the final
	# ret are not present in this listing.
	.p2align 4
	.globl	forward_pass
	.type	forward_pass, @function
	pushq	%r15
	.cfi_def_cfa_offset 16
	.cfi_offset 15, -16
	# r15 walks the 128 doubles that follow the first 802816 bytes of the first block
	leaq	802816(%rdi), %r15
	pushq	%r14
	.cfi_def_cfa_offset 24
	.cfi_offset 14, -24
	movq	%rcx, %r14
	pushq	%r13
	.cfi_def_cfa_offset 32
	.cfi_offset 13, -32
	movq	%rsi, %r13
	pushq	%r12
	.cfi_def_cfa_offset 40
	.cfi_offset 12, -40
	movq	%rcx, %r12
	pushq	%rbp
	.cfi_def_cfa_offset 48
	.cfi_offset 6, -48
	movq	%rdx, %rbp
	pushq	%rbx
	.cfi_def_cfa_offset 56
	.cfi_offset 3, -56
	movq	%rdi, %rbx
	# rbx = end marker for r15 (128 * 8 bytes further)
	addq	$803840, %rbx
	subq	$24, %rsp
	.cfi_def_cfa_offset 80
	# xmm2 = divisor applied to every input byte
	movsd	.LC4(%rip), %xmm2
	# the r8 argument is spilled; it is reloaded for the second layer
	movq	%r8, 8(%rsp)
	# first layer: xmm1 starts from the double at (r15), then accumulates
	# (input byte / .LC4) * parameter, with parameters 1024 bytes apart
	movsd	(%r15), %xmm1
	movq	%rbp, %rdx
	leaq	-802816(%r15), %rax
	.p2align 4,,10
	.p2align 3
	movzbl	(%rdx), %ecx
	pxor	%xmm0, %xmm0
	addq	$1024, %rax
	addq	$1, %rdx
	cvtsi2sdl	%ecx, %xmm0
	divsd	%xmm2, %xmm0
	mulsd	-1024(%rax), %xmm0
	addsd	%xmm0, %xmm1
	cmpq	%rax, %r15
	jne	.L20
	xorpd	.LC3(%rip), %xmm1
	addq	$8, %r15
	addq	$8, %r12
	movapd	%xmm1, %xmm0
	call	exp@PLT
	addsd	.LC2(%rip), %xmm0
	movsd	.LC2(%rip), %xmm1
	# xmm2 is caller-saved, so the divisor is reloaded after the call
	movq	.LC4(%rip), %rax
	divsd	%xmm0, %xmm1
	movq	%rax, %xmm2
	movsd	%xmm1, -8(%r12)
	cmpq	%r15, %rbx
	jne	.L21
	# second layer: r12 = output buffer, rdx walks the 10 doubles that follow
	# the first 10240 bytes of the second block
	movq	8(%rsp), %r12
	leaq	10240(%r13), %rdx
	addq	$10320, %r13
	movsd	(%rdx), %xmm1
	leaq	-10240(%rdx), %rbx
	movq	%r14, %rax
	.p2align 4,,10
	.p2align 3
	# two steps per iteration: parameters at (rbx) and 80(rbx) are paired
	# with two consecutive first-layer results at (rax)
	movsd	(%rbx), %xmm0
	movupd	(%rax), %xmm3
	addq	$160, %rbx
	addq	$16, %rax
	movhpd	-80(%rbx), %xmm0
	mulpd	%xmm3, %xmm0
	addsd	%xmm0, %xmm1
	unpckhpd	%xmm0, %xmm0
	addsd	%xmm0, %xmm1
	cmpq	%rdx, %rbx
	jne	.L22
	xorpd	.LC3(%rip), %xmm1
	addq	$8, %r12
	movapd	%xmm1, %xmm0
	call	exp@PLT
	addsd	.LC2(%rip), %xmm0
	leaq	8(%rbx), %rdx
	movsd	.LC2(%rip), %xmm1
	divsd	%xmm0, %xmm1
	movsd	%xmm1, -8(%r12)
	cmpq	%rdx, %r13
	jne	.L23
	addq	$24, %rsp
	.cfi_def_cfa_offset 56
	popq	%rbx
	.cfi_def_cfa_offset 48
	popq	%rbp
	.cfi_def_cfa_offset 40
	popq	%r12
	.cfi_def_cfa_offset 32
	popq	%r13
	.cfi_def_cfa_offset 24
	popq	%r14
	.cfi_def_cfa_offset 16
	popq	%r15
	.cfi_def_cfa_offset 8
	.size	forward_pass, .-forward_pass
	# backward_pass -- updates both parameter blocks in place from one sample.
	# In:  rdi = first parameter block, rsi = second parameter block,
	#      rdx = input bytes, rcx = 128 first-layer results,
	#      r8  = 10 second-layer results, r9b = label byte.
	# Stack frame: 10 doubles at 0(%rsp)..72(%rsp), 128 doubles starting at
	# 80(%rsp), stack-protector canary at 1112(%rsp).
	# NOTE(review): looks like standard backpropagation with sigmoid units --
	# the 10 doubles would be output deltas (target - out) * out * (1 - out)
	# with a one-hot target built from the label, the 128 doubles hidden
	# deltas, and .LC9 the learning rate. Confirm against the C source.
	# NOTE(review): the entry label, all .L30-.L54 labels and the ret are not
	# present in this listing, so branch targets cannot be read off directly.
	.p2align 4
	.globl	backward_pass
	.type	backward_pass, @function
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset 6, -16
	# r10 = first block, rdi = input bytes, r11 = first-layer results, rax = second-layer results
	movq	%rdi, %r10
	movq	%rdx, %rdi
	pxor	%xmm3, %xmm3
	pushq	%rbx
	.cfi_def_cfa_offset 24
	.cfi_offset 3, -24
	movdqa	%xmm3, %xmm5
	movq	%rcx, %r11
	movq	%r8, %rax
	subq	$1128, %rsp
	.cfi_def_cfa_offset 1152
	movapd	.LC7(%rip), %xmm1
	movsd	.LC2(%rip), %xmm8
	# store the stack-protector canary
	movq	%fs:40, %rdx
	movq	%rdx, 1112(%rsp)
	xorl	%edx, %edx
	# edx = label, broadcast to four lanes and compared with .LC6 / .LC8
	movzbl	%r9b, %edx
	movd	%edx, %xmm6
	movapd	%xmm1, %xmm7
	pshufd	$0, %xmm6, %xmm0
	movupd	16(%r8), %xmm6
	movdqa	%xmm0, %xmm2
	pcmpeqd	.LC8(%rip), %xmm0
	pcmpeqd	.LC6(%rip), %xmm2
	pcmpgtd	%xmm0, %xmm3
	pcmpgtd	%xmm2, %xmm5
	movdqa	%xmm2, %xmm4
	punpckldq	%xmm5, %xmm4
	punpckhdq	%xmm5, %xmm2
	movupd	(%r8), %xmm5
	andpd	%xmm1, %xmm4
	andpd	%xmm1, %xmm2
	subpd	%xmm5, %xmm7
	subpd	%xmm5, %xmm4
	subpd	%xmm6, %xmm2
	mulpd	%xmm5, %xmm7
	movapd	%xmm1, %xmm5
	subpd	%xmm6, %xmm5
	mulpd	%xmm6, %xmm5
	movupd	32(%r8), %xmm6
	mulpd	%xmm4, %xmm7
	movapd	%xmm1, %xmm4
	subpd	%xmm6, %xmm4
	mulpd	%xmm2, %xmm5
	movdqa	%xmm0, %xmm2
	punpckhdq	%xmm3, %xmm0
	mulpd	%xmm6, %xmm4
	punpckldq	%xmm3, %xmm2
	andpd	%xmm1, %xmm0
	# results for elements 0-1 stored at (%rsp); 2-3, 4-5, 6-7 follow below
	movaps	%xmm7, (%rsp)
	andpd	%xmm1, %xmm2
	movupd	48(%r8), %xmm3
	subpd	%xmm6, %xmm2
	subpd	%xmm3, %xmm0
	movaps	%xmm5, 16(%rsp)
	mulpd	%xmm2, %xmm4
	movapd	%xmm1, %xmm2
	subpd	%xmm3, %xmm2
	mulpd	%xmm3, %xmm2
	movaps	%xmm4, 32(%rsp)
	mulpd	%xmm0, %xmm2
	movsd	64(%r8), %xmm0
	movaps	%xmm2, 48(%rsp)
	# elements 8 and 9 are handled with scalar code, branching on the label
	cmpl	$8, %edx
	je	.L30
	movapd	%xmm8, %xmm3
	pxor	%xmm6, %xmm6
	subsd	%xmm0, %xmm3
	movapd	%xmm6, %xmm1
	subsd	%xmm0, %xmm1
	mulsd	%xmm0, %xmm3
	movapd	%xmm8, %xmm0
	mulsd	%xmm1, %xmm3
	movsd	%xmm3, 64(%rsp)
	cmpl	$9, %edx
	jne	.L53
	movsd	72(%rax), %xmm1
	movapd	%xmm8, %xmm9
	movapd	%xmm7, %xmm13
	movapd	%xmm5, %xmm12
	movapd	%xmm4, %xmm11
	movapd	%xmm2, %xmm10
	unpckhpd	%xmm7, %xmm7
	movq	%rsi, %r8
	subsd	%xmm1, %xmm9
	subsd	%xmm1, %xmm0
	unpckhpd	%xmm5, %xmm5
	movq	%rsi, %rax
	unpckhpd	%xmm4, %xmm4
	unpckhpd	%xmm2, %xmm2
	# rcx = start of the 128-double buffer on the stack
	leaq	80(%rsp), %rcx
	xorl	%edx, %edx
	mulsd	%xmm1, %xmm9
	mulsd	%xmm0, %xmm9
	movsd	%xmm9, 72(%rsp)
	# loop over 128 entries (rdx counts bytes up to 1024): dot product of one
	# 80-byte row of the second block with the 10 values computed above,
	# scaled by h * (.LC2 - h) where h = the first-layer result at (r11,rdx)
	movsd	(%rax), %xmm0
	movsd	8(%rax), %xmm14
	addq	$80, %rax
	movsd	(%r11,%rdx), %xmm1
	addq	$8, %rdx
	mulsd	%xmm13, %xmm0
	mulsd	%xmm7, %xmm14
	addsd	%xmm6, %xmm0
	addsd	%xmm14, %xmm0
	movsd	-64(%rax), %xmm14
	mulsd	%xmm12, %xmm14
	addsd	%xmm14, %xmm0
	movsd	-56(%rax), %xmm14
	mulsd	%xmm5, %xmm14
	addsd	%xmm14, %xmm0
	movsd	-48(%rax), %xmm14
	mulsd	%xmm11, %xmm14
	addsd	%xmm14, %xmm0
	movsd	-40(%rax), %xmm14
	mulsd	%xmm4, %xmm14
	addsd	%xmm14, %xmm0
	movsd	-32(%rax), %xmm14
	mulsd	%xmm10, %xmm14
	addsd	%xmm14, %xmm0
	movsd	-24(%rax), %xmm14
	mulsd	%xmm2, %xmm14
	addsd	%xmm14, %xmm0
	movsd	-16(%rax), %xmm14
	mulsd	%xmm3, %xmm14
	addsd	%xmm14, %xmm0
	movsd	-8(%rax), %xmm14
	mulsd	%xmm9, %xmm14
	addsd	%xmm14, %xmm0
	movapd	%xmm8, %xmm14
	subsd	%xmm1, %xmm14
	mulsd	%xmm14, %xmm1
	mulsd	%xmm1, %xmm0
	movsd	%xmm0, -8(%rdx,%rcx)
	cmpq	$1024, %rdx
	jne	.L32
	# update of the second block: xmm3 = .LC9 scales every step
	leaq	10320(%rsi), %rbp
	movq	%rcx, %rax
	movq	%rsp, %rbx
	movsd	.LC9(%rip), %xmm3
	leaq	10240(%rsi), %r9
	movq	%rbp, %rcx
	leaq	1024(%r11), %rsi
	movq	%rax, %rbp
	# (r9) += .LC9 * value at (rbx), for the doubles after the first 10240 bytes
	movsd	(%rbx), %xmm2
	movsd	(%r9), %xmm0
	leaq	-72(%r9), %rax
	mulsd	%xmm3, %xmm2
	addsd	%xmm2, %xmm0
	movsd	%xmm0, (%r9)
	# NOTE(review): these pointer comparisons look like a run-time overlap
	# check selecting between the vector loop below and the scalar loop
	# further down -- confirm.
	cmpq	%rax, %r11
	jnb	.L43
	cmpq	%r8, %rsi
	ja	.L33
	unpcklpd	%xmm2, %xmm2
	movq	%r11, %rdx
	movq	%r8, %rax
	.p2align 4,,10
	.p2align 3
	# vector loop: two entries 80 bytes apart are updated per iteration
	movupd	(%rdx), %xmm0
	movsd	(%rax), %xmm1
	addq	$160, %rax
	addq	$16, %rdx
	mulpd	%xmm2, %xmm0
	movhpd	-80(%rax), %xmm1
	addpd	%xmm1, %xmm0
	movlpd	%xmm0, -160(%rax)
	movhpd	%xmm0, -80(%rax)
	cmpq	%rax, %r9
	jne	.L35
	addq	$8, %r9
	addq	$8, %r8
	addq	$8, %rbx
	cmpq	%rcx, %r9
	jne	.L38
	# update of the first block, using the 128 doubles at rbp (= 80(%rsp))
	movsd	.LC4(%rip), %xmm2
	leaq	802816(%r10), %rsi
	movq	%rbp, %r8
	addq	$803840, %r10
	# (rsi) += .LC9 * value at (r8), for the doubles after the first 802816 bytes
	movsd	(%r8), %xmm1
	movsd	(%rsi), %xmm0
	leaq	-802816(%rsi), %rax
	movq	%rdi, %rdx
	mulsd	%xmm3, %xmm1
	addsd	%xmm1, %xmm0
	movsd	%xmm0, (%rsi)
	.p2align 4,,10
	.p2align 3
	# entry += (input byte / .LC4) * xmm1, entries 1024 bytes apart
	movzbl	(%rdx), %ecx
	pxor	%xmm0, %xmm0
	addq	$1024, %rax
	addq	$1, %rdx
	cvtsi2sdl	%ecx, %xmm0
	divsd	%xmm2, %xmm0
	mulsd	%xmm1, %xmm0
	addsd	-1024(%rax), %xmm0
	movsd	%xmm0, -1024(%rax)
	cmpq	%rax, %rsi
	jne	.L39
	addq	$8, %rsi
	addq	$8, %r8
	cmpq	%r10, %rsi
	jne	.L40
	# verify the stack-protector canary before tearing down the frame
	movq	1112(%rsp), %rax
	subq	%fs:40, %rax
	jne	.L54
	addq	$1128, %rsp
	.cfi_def_cfa_offset 24
	popq	%rbx
	.cfi_def_cfa_offset 16
	popq	%rbp
	.cfi_def_cfa_offset 8
	# scalar variant of the second-block update loop (one entry per iteration)
	movq	%r11, %rdx
	movq	%r8, %rax
	.p2align 4,,10
	.p2align 3
	movsd	(%rdx), %xmm0
	addq	$80, %rax
	addq	$8, %rdx
	mulsd	%xmm2, %xmm0
	addsd	-80(%rax), %xmm0
	movsd	%xmm0, -80(%rax)
	cmpq	%rax, %r9
	jne	.L37
	jmp	.L36
	# out-of-line code for the scalar handling of element 8
	movapd	%xmm8, %xmm1
	pxor	%xmm6, %xmm6
	subsd	%xmm0, %xmm1
	mulsd	%xmm1, %xmm0
	movapd	%xmm0, %xmm3
	movapd	%xmm6, %xmm0
	mulsd	%xmm1, %xmm3
	movsd	%xmm3, 64(%rsp)
	jmp	.L31
	movapd	%xmm6, %xmm0
	jmp	.L31
	call	__stack_chk_fail@PLT
	.size	backward_pass, .-backward_pass
	# train -- runs forward_pass and backward_pass over every sample, for a
	# given number of rounds, printing a progress line after each round.
	# In:  rdi = first parameter block, rsi = second parameter block,
	#      rdx = image bytes (784 per sample), rcx = label bytes (1 per sample),
	#      r8d = number of samples, r9d = number of rounds (epochs).
	# Stack frame: 10 doubles at 48(%rsp) and a buffer at 128(%rsp) are passed
	# to both passes; canary at 1160(%rsp).
	# NOTE(review): the ".LC10:" label for the string, the entry label, the
	# .L55-.L68 labels and the ret are not present in this listing.
	.section	.rodata.str1.1
	.string	"Epoch %d/%d completed\n"
	.p2align 4
	.globl	train
	.type	train, @function
	pushq	%r15
	.cfi_def_cfa_offset 16
	.cfi_offset 15, -16
	pushq	%r14
	.cfi_def_cfa_offset 24
	.cfi_offset 14, -24
	pushq	%r13
	.cfi_def_cfa_offset 32
	.cfi_offset 13, -32
	movq	%rdi, %r13
	pushq	%r12
	.cfi_def_cfa_offset 40
	.cfi_offset 12, -40
	movq	%rsi, %r12
	pushq	%rbp
	.cfi_def_cfa_offset 48
	.cfi_offset 6, -48
	pushq	%rbx
	.cfi_def_cfa_offset 56
	.cfi_offset 3, -56
	subq	$1176, %rsp
	.cfi_def_cfa_offset 1232
	# spill the arguments: images, labels, sample count, round count
	movq	%rdx, 32(%rsp)
	movq	%rcx, 40(%rsp)
	movl	%r8d, 24(%rsp)
	movl	%r9d, 28(%rsp)
	movq	%fs:40, %rax
	movq	%rax, 1160(%rsp)
	xorl	%eax, %eax
	# 8(%rsp) = labels + count, the end pointer for the inner loop;
	# 20(%rsp) = number of completed rounds
	leal	-1(%r8), %eax
	movl	$0, 20(%rsp)
	leaq	1(%rcx,%rax), %rax
	movq	%rax, 8(%rsp)
	# nothing to do when the round count is <= 0
	testl	%r9d, %r9d
	jle	.L55
	.p2align 4,,10
	.p2align 3
	# a round with a sample count <= 0 skips the inner loop
	movl	24(%rsp), %eax
	testl	%eax, %eax
	jle	.L60
	movq	32(%rsp), %r14
	movq	40(%rsp), %rbx
	leaq	48(%rsp), %r15
	leaq	128(%rsp), %rbp
	.p2align 4,,10
	.p2align 3
	# forward_pass(first, second, image, buffer at 128(%rsp), buffer at 48(%rsp))
	movq	%r14, %rdx
	movq	%r15, %r8
	movq	%rbp, %rcx
	movq	%r12, %rsi
	movq	%r13, %rdi
	addq	$1, %rbx
	call	forward_pass
	# backward_pass(same five arguments, label byte of this sample)
	movzbl	-1(%rbx), %r9d
	movq	%r14, %rdx
	movq	%r15, %r8
	movq	%rbp, %rcx
	movq	%r12, %rsi
	movq	%r13, %rdi
	# advance to the next 784-byte image
	addq	$784, %r14
	call	backward_pass
	cmpq	8(%rsp), %rbx
	jne	.L58
	# __printf_chk(1, .LC10, completed rounds, total rounds); eax = 0 because
	# no vector registers carry arguments
	addl	$1, 20(%rsp)
	movl	28(%rsp), %r15d
	xorl	%eax, %eax
	leaq	.LC10(%rip), %rsi
	movl	20(%rsp), %ebx
	movl	$1, %edi
	movl	%r15d, %ecx
	movl	%ebx, %edx
	call	__printf_chk@PLT
	cmpl	%r15d, %ebx
	jne	.L56
	# verify the stack-protector canary
	movq	1160(%rsp), %rax
	subq	%fs:40, %rax
	jne	.L68
	addq	$1176, %rsp
	.cfi_def_cfa_offset 56
	popq	%rbx
	.cfi_def_cfa_offset 48
	popq	%rbp
	.cfi_def_cfa_offset 40
	popq	%r12
	.cfi_def_cfa_offset 32
	popq	%r13
	.cfi_def_cfa_offset 24
	popq	%r14
	.cfi_def_cfa_offset 16
	popq	%r15
	.cfi_def_cfa_offset 8
	call	__stack_chk_fail@PLT
	.size	train, .-train
	# recognize -- runs forward_pass and returns in eax the index (0-9) of the
	# largest of the 10 resulting doubles.
	# In:  rdi, rsi, rdx are passed through unchanged to forward_pass
	#      (parameter blocks and input bytes).
	# Stack frame: 10 result doubles at 0(%rsp)..72(%rsp), scratch buffer for
	# forward_pass at 80(%rsp), canary at 1112(%rsp).
	# The comparison chain is laid out in two copies; eax holds the best index
	# so far and xmm0/xmm1 alternate as "current maximum".
	# NOTE(review): the entry label, the .L70-.L97 labels and the ret are not
	# present in this listing.
	.p2align 4
	.globl	recognize
	.type	recognize, @function
	subq	$1128, %rsp
	.cfi_def_cfa_offset 1136
	movq	%fs:40, %rax
	movq	%rax, 1112(%rsp)
	xorl	%eax, %eax
	# rcx = scratch buffer, r8 = 10-double result buffer
	leaq	80(%rsp), %rcx
	movq	%rsp, %r8
	call	forward_pass
	# compare element 1 with element 0
	movsd	(%rsp), %xmm1
	movsd	8(%rsp), %xmm0
	movl	$1, %eax
	comisd	%xmm1, %xmm0
	ja	.L70
	movapd	%xmm1, %xmm0
	xorl	%eax, %eax
	movsd	16(%rsp), %xmm1
	comisd	%xmm0, %xmm1
	ja	.L81
	movapd	%xmm0, %xmm1
	movsd	24(%rsp), %xmm0
	comisd	%xmm1, %xmm0
	ja	.L82
	movapd	%xmm1, %xmm0
	movsd	32(%rsp), %xmm1
	comisd	%xmm0, %xmm1
	ja	.L83
	movapd	%xmm0, %xmm1
	movsd	40(%rsp), %xmm0
	comisd	%xmm1, %xmm0
	ja	.L84
	movapd	%xmm1, %xmm0
	movsd	48(%rsp), %xmm1
	comisd	%xmm0, %xmm1
	ja	.L85
	movapd	%xmm0, %xmm1
	movsd	56(%rsp), %xmm0
	comisd	%xmm1, %xmm0
	ja	.L86
	movapd	%xmm1, %xmm0
	movsd	64(%rsp), %xmm1
	comisd	%xmm0, %xmm1
	ja	.L87
	movapd	%xmm0, %xmm1
	# last element: select 9 with a conditional move instead of a branch
	movsd	72(%rsp), %xmm0
	movl	$9, %edx
	ucomisd	%xmm1, %xmm0
	cmova	%edx, %eax
	# verify the stack-protector canary
	movq	1112(%rsp), %rdx
	subq	%fs:40, %rdx
	jne	.L91
	addq	$1128, %rsp
	.cfi_def_cfa_offset 8
	.p2align 4,,10
	.p2align 3
	# second copy of the chain
	movsd	24(%rsp), %xmm0
	movl	$2, %eax
	comisd	%xmm1, %xmm0
	jbe	.L92
	movsd	32(%rsp), %xmm1
	movl	$3, %eax
	comisd	%xmm0, %xmm1
	jbe	.L93
	movsd	40(%rsp), %xmm0
	movl	$4, %eax
	comisd	%xmm1, %xmm0
	jbe	.L94
	movsd	48(%rsp), %xmm1
	movl	$5, %eax
	comisd	%xmm0, %xmm1
	jbe	.L95
	movsd	56(%rsp), %xmm0
	movl	$6, %eax
	comisd	%xmm1, %xmm0
	jbe	.L96
	movsd	64(%rsp), %xmm1
	movl	$7, %eax
	comisd	%xmm0, %xmm1
	jbe	.L97
	movl	$8, %eax
	jmp	.L77
	call	__stack_chk_fail@PLT
	.size	recognize, .-recognize
	# read_uint32 -- reads 4 bytes from a stream with fread and returns them
	# byte-swapped in eax.
	# In:  rdi = FILE pointer.
	# If fread does not return 1, control goes to .L102; presumably that is the
	# call to read_uint32.part.0 below (perror + exit).
	# NOTE(review): the entry label, .L102/.L103 and the ret are not present in
	# this listing.
	.p2align 4
	.globl	read_uint32
	.type	read_uint32, @function
	subq	$24, %rsp
	.cfi_def_cfa_offset 32
	# fread(&tmp at 4(%rsp), 4, 1, stream)
	movq	%rdi, %rcx
	movl	$1, %edx
	movl	$4, %esi
	# stack-protector canary at 8(%rsp)
	movq	%fs:40, %rax
	movq	%rax, 8(%rsp)
	xorl	%eax, %eax
	leaq	4(%rsp), %rdi
	call	fread@PLT
	cmpq	$1, %rax
	jne	.L102
	# reverse the byte order of the value just read
	movl	4(%rsp), %eax
	bswap	%eax
	movq	8(%rsp), %rdx
	subq	%fs:40, %rdx
	jne	.L103
	addq	$24, %rsp
	.cfi_def_cfa_offset 8
	call	read_uint32.part.0
	call	__stack_chk_fail@PLT
	.size	read_uint32, .-read_uint32
	# load_mnist_images -- opens a file, reads four byte-swapped 32-bit header
	# words, allocates a buffer and reads the image data into it.
	# In:  rdi = file name, rsi = int* receiving the 2nd header word,
	#      rdx = int* receiving (3rd header word * 4th header word).
	# Out: rax = malloc'ed buffer of (*rsi) * (*rdx) bytes.
	# Failure paths call perror followed by exit(1), or read_uint32.part.0.
	# NOTE(review): presumably the header words are magic, image count, rows
	# and columns -- confirm against the file format.
	# NOTE(review): the return value of malloc is not checked before use.
	# NOTE(review): the string labels (presumably .LC11-.LC13, in order), the
	# entry label, .L107-.L113 and the ret are not present in this listing.
	.section	.rodata.str1.1
	.string	"rb"
	.string	"Failed to open file"
	.string	"Failed to read images"
	.p2align 4
	.globl	load_mnist_images
	.type	load_mnist_images, @function
	pushq	%r13
	.cfi_def_cfa_offset 16
	.cfi_offset 13, -16
	pushq	%r12
	.cfi_def_cfa_offset 24
	.cfi_offset 12, -24
	pushq	%rbp
	.cfi_def_cfa_offset 32
	.cfi_offset 6, -32
	movq	%rdx, %rbp
	pushq	%rbx
	.cfi_def_cfa_offset 40
	.cfi_offset 3, -40
	movq	%rsi, %rbx
	# fopen(file name, .LC11)
	leaq	.LC11(%rip), %rsi
	subq	$40, %rsp
	.cfi_def_cfa_offset 80
	movq	%fs:40, %rax
	movq	%rax, 24(%rsp)
	xorl	%eax, %eax
	call	fopen@PLT
	testq	%rax, %rax
	je	.L111
	# 1st header word: read into 20(%rsp) and not used afterwards; r12 = stream
	leaq	20(%rsp), %rdi
	movq	%rax, %rcx
	movl	$1, %edx
	movq	%rax, %r12
	movl	$4, %esi
	call	fread@PLT
	cmpq	$1, %rax
	jne	.L107
	# 2nd header word: byte-swapped and stored through rbx below
	leaq	16(%rsp), %rdi
	movq	%r12, %rcx
	movl	$1, %edx
	movl	$4, %esi
	call	fread@PLT
	cmpq	$1, %rax
	jne	.L107
	movl	16(%rsp), %eax
	leaq	12(%rsp), %rdi
	movq	%r12, %rcx
	movl	$1, %edx
	movl	$4, %esi
	bswap	%eax
	movl	%eax, (%rbx)
	# 3rd header word -> r13d
	call	fread@PLT
	cmpq	$1, %rax
	jne	.L107
	movl	12(%rsp), %r13d
	leaq	8(%rsp), %rdi
	movq	%r12, %rcx
	movl	$1, %edx
	movl	$4, %esi
	bswap	%r13d
	# 4th header word -> edi
	call	fread@PLT
	cmpq	$1, %rax
	jne	.L107
	movl	8(%rsp), %edi
	bswap	%edi
	# *rbp = 3rd * 4th word; malloc((*rbp) * (*rbx)) using 32-bit multiplies
	imull	%r13d, %edi
	movl	%edi, 0(%rbp)
	imull	(%rbx), %edi
	movslq	%edi, %rdi
	call	malloc@PLT
	# __fread_chk(buffer, -1, *rbp, *rbx, stream); r13 = buffer
	movslq	0(%rbp), %rdx
	movslq	(%rbx), %rcx
	movq	%r12, %r8
	movq	$-1, %rsi
	movq	%rax, %rdi
	movq	%rax, %r13
	call	__fread_chk@PLT
	# the item count returned must equal *rbx
	movslq	(%rbx), %rdx
	cmpq	%rax, %rdx
	jne	.L112
	movq	%r12, %rdi
	call	fclose@PLT
	movq	24(%rsp), %rax
	subq	%fs:40, %rax
	jne	.L113
	addq	$40, %rsp
	.cfi_def_cfa_offset 40
	# return the buffer
	movq	%r13, %rax
	popq	%rbx
	.cfi_def_cfa_offset 32
	popq	%rbp
	.cfi_def_cfa_offset 24
	popq	%r12
	.cfi_def_cfa_offset 16
	popq	%r13
	.cfi_def_cfa_offset 8
	# out-of-line failure paths
	call	read_uint32.part.0
	leaq	.LC12(%rip), %rdi
	call	perror@PLT
	movl	$1, %edi
	call	exit@PLT
	call	__stack_chk_fail@PLT
	leaq	.LC13(%rip), %rdi
	call	perror@PLT
	movl	$1, %edi
	call	exit@PLT
	.size	load_mnist_images, .-load_mnist_images
	# load_mnist_labels -- opens a file, reads two byte-swapped 32-bit header
	# words, allocates a buffer and reads one byte per label into it.
	# In:  rdi = file name, rsi = int* receiving the 2nd header word.
	# Out: rax = malloc'ed buffer of *rsi bytes.
	# Failure paths call perror followed by exit(1), or read_uint32.part.0.
	# NOTE(review): presumably the header words are magic and label count.
	# NOTE(review): the return value of malloc is not checked before use.
	# NOTE(review): the string label (presumably .LC14), the entry label,
	# .L117-.L123 and the ret are not present in this listing.
	.section	.rodata.str1.1
	.string	"Failed to read labels"
	.p2align 4
	.globl	load_mnist_labels
	.type	load_mnist_labels, @function
	pushq	%r12
	.cfi_def_cfa_offset 16
	.cfi_offset 12, -16
	pushq	%rbp
	.cfi_def_cfa_offset 24
	.cfi_offset 6, -24
	pushq	%rbx
	.cfi_def_cfa_offset 32
	.cfi_offset 3, -32
	movq	%rsi, %rbx
	# fopen(file name, .LC11)
	leaq	.LC11(%rip), %rsi
	subq	$16, %rsp
	.cfi_def_cfa_offset 48
	movq	%fs:40, %rax
	movq	%rax, 8(%rsp)
	xorl	%eax, %eax
	call	fopen@PLT
	testq	%rax, %rax
	je	.L121
	# 1st header word: read into 4(%rsp) and not used afterwards; rbp = stream
	leaq	4(%rsp), %rdi
	movq	%rax, %rcx
	movl	$1, %edx
	movq	%rax, %rbp
	movl	$4, %esi
	call	fread@PLT
	cmpq	$1, %rax
	jne	.L117
	# 2nd header word: byte-swapped and stored through rbx
	movq	%rsp, %rdi
	movq	%rbp, %rcx
	movl	$1, %edx
	movl	$4, %esi
	call	fread@PLT
	cmpq	$1, %rax
	jne	.L117
	movl	(%rsp), %edi
	bswap	%edi
	movl	%edi, (%rbx)
	# malloc(*rbx)
	movslq	%edi, %rdi
	call	malloc@PLT
	# fread(buffer, 1, *rbx, stream); r12 = buffer
	movslq	(%rbx), %rdx
	movq	%rbp, %rcx
	movl	$1, %esi
	movq	%rax, %rdi
	movq	%rax, %r12
	call	fread@PLT
	# the item count returned must equal *rbx
	movslq	(%rbx), %rdx
	cmpq	%rax, %rdx
	jne	.L122
	movq	%rbp, %rdi
	call	fclose@PLT
	movq	8(%rsp), %rax
	subq	%fs:40, %rax
	jne	.L123
	addq	$16, %rsp
	.cfi_def_cfa_offset 32
	# return the buffer
	movq	%r12, %rax
	popq	%rbx
	.cfi_def_cfa_offset 24
	popq	%rbp
	.cfi_def_cfa_offset 16
	popq	%r12
	.cfi_def_cfa_offset 8
	# out-of-line failure paths
	call	read_uint32.part.0
	leaq	.LC12(%rip), %rdi
	call	perror@PLT
	movl	$1, %edi
	call	exit@PLT
	call	__stack_chk_fail@PLT
	leaq	.LC14(%rip), %rdi
	call	perror@PLT
	movl	$1, %edi
	call	exit@PLT
	.size	load_mnist_labels, .-load_mnist_labels
	# calculate_accuracy -- calls recognize on every sample and returns in xmm0
	# (number of samples whose result equals the label byte) / (sample count).
	# In:  rdi = first parameter block, rsi = second parameter block,
	#      rdx = image bytes (784 per sample), rcx = label bytes,
	#      r8d = sample count.
	# For a sample count <= 0 the code branches to .L128; presumably that is
	# the "pxor %xmm0, %xmm0 / jmp .L125" pair at the end, making the match
	# count 0.0 before the division -- confirm.
	# NOTE(review): the entry label, .L125/.L127/.L128 and the ret are not
	# present in this listing.
	.p2align 4
	.globl	calculate_accuracy
	.type	calculate_accuracy, @function
	pushq	%r15
	.cfi_def_cfa_offset 16
	.cfi_offset 15, -16
	pushq	%r14
	.cfi_def_cfa_offset 24
	.cfi_offset 14, -24
	pushq	%r13
	.cfi_def_cfa_offset 32
	.cfi_offset 13, -32
	pushq	%r12
	.cfi_def_cfa_offset 40
	.cfi_offset 12, -40
	pushq	%rbp
	.cfi_def_cfa_offset 48
	.cfi_offset 6, -48
	pushq	%rbx
	.cfi_def_cfa_offset 56
	.cfi_offset 3, -56
	subq	$24, %rsp
	.cfi_def_cfa_offset 80
	# keep the sample count for the final division
	movl	%r8d, 12(%rsp)
	testl	%r8d, %r8d
	jle	.L128
	movl	%r8d, %eax
	movq	%rdi, %r12
	movq	%rsi, %r13
	movq	%rdx, %r14
	subl	$1, %eax
	movq	%rcx, %rbx
	# ebp = number of matches so far; r15 = labels + count (loop end)
	xorl	%ebp, %ebp
	leaq	1(%rcx,%rax), %r15
	.p2align 4,,10
	.p2align 3
	# recognize(first, second, current image)
	movq	%r14, %rdx
	movq	%r13, %rsi
	movq	%r12, %rdi
	call	recognize
	# count a match when the low byte of the result equals the label byte
	cmpb	%al, (%rbx)
	sete	%al
	addq	$1, %rbx
	addq	$784, %r14
	movzbl	%al, %eax
	addl	%eax, %ebp
	cmpq	%r15, %rbx
	jne	.L127
	pxor	%xmm0, %xmm0
	cvtsi2sdl	%ebp, %xmm0
	pxor	%xmm1, %xmm1
	cvtsi2sdl	12(%rsp), %xmm1
	addq	$24, %rsp
	.cfi_def_cfa_offset 56
	popq	%rbx
	.cfi_def_cfa_offset 48
	popq	%rbp
	.cfi_def_cfa_offset 40
	popq	%r12
	.cfi_def_cfa_offset 32
	popq	%r13
	.cfi_def_cfa_offset 24
	popq	%r14
	.cfi_def_cfa_offset 16
	popq	%r15
	.cfi_def_cfa_offset 8
	# xmm0 = matches / count
	divsd	%xmm1, %xmm0
	.p2align 4,,10
	.p2align 3
	pxor	%xmm0, %xmm0
	jmp	.L125
	.size	calculate_accuracy, .-calculate_accuracy
	# main -- loads images and labels, checks that both counts match,
	# initializes the two parameter blocks, trains for 10 rounds, prints the
	# training accuracy, prints the result for the first 10 images, frees both
	# buffers and returns 0.
	# Stack frame (815336 bytes below the saved registers):
	#   8(%rsp)  = label buffer      24(%rsp) = image buffer
	#   36(%rsp) = image count       40(%rsp) = image size   44(%rsp) = label count
	#   48(%rsp) = 10 result doubles 128(%rsp) = scratch buffer for forward_pass
	#   1152(%rsp) = second parameter block, 11472(%rsp) = first parameter block
	#   815320(%rsp) = stack-protector canary
	# NOTE(review): the string labels (presumably .LC15, .LC16, .LC17, .LC19,
	# .LC20, in order), the "main:" label, .LPSRL0, .L133-.L170 and the ret
	# are not present in this listing.
	.section	.rodata.str1.8,"aMS",@progbits,1
	.align 8
	.string	"mnist_data/train-images-idx3-ubyte"
	.align 8
	.string	"mnist_data/train-labels-idx1-ubyte"
	.align 8
	.string	"Number of images and labels do not match\n"
	.section	.rodata.str1.1
	.string	"Training accuracy: %.2f%%\n"
	.section	.rodata.str1.8
	.align 8
	.string	"Image %d: Recognized digit = %d, Actual digit = %d\n"
	.section	.text.startup,"ax",@progbits
	.p2align 4
	.globl	main
	.type	main, @function
	pushq	%r15
	.cfi_def_cfa_offset 16
	.cfi_offset 15, -16
	pushq	%r14
	.cfi_def_cfa_offset 24
	.cfi_offset 14, -24
	pushq	%r13
	.cfi_def_cfa_offset 32
	.cfi_offset 13, -32
	pushq	%r12
	.cfi_def_cfa_offset 40
	.cfi_offset 12, -40
	pushq	%rbp
	.cfi_def_cfa_offset 48
	.cfi_offset 6, -48
	pushq	%rbx
	.cfi_def_cfa_offset 56
	.cfi_offset 3, -56
	# the large frame is allocated 4096 bytes at a time, touching each page
	# with "orq $0" so the stack is extended page by page
	leaq	-815104(%rsp), %r11
	.cfi_def_cfa 11, 815160
	subq	$4096, %rsp
	orq	$0, (%rsp)
	cmpq	%r11, %rsp
	jne	.LPSRL0
	.cfi_def_cfa_register 7
	subq	$232, %rsp
	.cfi_def_cfa_offset 815392
	# load_mnist_images(.LC15, &count at 36(%rsp), &size at 40(%rsp))
	leaq	.LC15(%rip), %rdi
	movq	%fs:40, %rax
	movq	%rax, 815320(%rsp)
	xorl	%eax, %eax
	leaq	40(%rsp), %rdx
	leaq	36(%rsp), %rsi
	call	load_mnist_images
	# load_mnist_labels(.LC16, &count at 44(%rsp))
	leaq	44(%rsp), %rsi
	leaq	.LC16(%rip), %rdi
	movq	%rax, 24(%rsp)
	call	load_mnist_labels
	movq	%rax, 8(%rsp)
	# both counts must agree
	movl	44(%rsp), %eax
	cmpl	%eax, 36(%rsp)
	jne	.L169
	# initialize_layer(first block, second block)
	leaq	1152(%rsp), %r12
	leaq	11472(%rsp), %rbp
	xorl	%r13d, %r13d
	movq	%r12, %rsi
	movq	%rbp, %rdi
	call	initialize_layer
	# train(first, second, images, labels, count, 10)
	movq	8(%rsp), %r15
	movq	%r12, %rsi
	movq	%rbp, %rdi
	movq	24(%rsp), %rbx
	movl	36(%rsp), %r8d
	movl	$10, %r9d
	movq	%r15, %rcx
	movq	%rbx, %rdx
	call	train
	# calculate_accuracy(first, second, images, labels, count)
	movl	36(%rsp), %r8d
	movq	%r15, %rcx
	movq	%rbx, %rdx
	movq	%r12, %rsi
	movq	%rbp, %rdi
	leaq	48(%rsp), %r15
	call	calculate_accuracy
	# __printf_chk(1, .LC19, accuracy * .LC18); eax = 1 because one vector
	# register carries an argument
	movl	$1, %edi
	movl	$1, %eax
	leaq	.LC19(%rip), %rsi
	mulsd	.LC18(%rip), %xmm0
	call	__printf_chk@PLT
	leaq	128(%rsp), %rax
	movq	%rax, 16(%rsp)
	.p2align 4,,10
	.p2align 3
	# loop over the first 10 images (r13 = index, rbx = current image):
	# forward_pass, then find the index of the largest of the 10 results
	movq	16(%rsp), %rcx
	movq	%r15, %r8
	movq	%rbx, %rdx
	movq	%r12, %rsi
	movq	%rbp, %rdi
	movl	%r13d, %r14d
	call	forward_pass
	# ecx = best index so far, xmm0 = best value so far
	movsd	48(%rsp), %xmm0
	movsd	56(%rsp), %xmm1
	xorl	%ecx, %ecx
	comisd	%xmm0, %xmm1
	jbe	.L133
	movapd	%xmm1, %xmm0
	movl	$1, %ecx
	movsd	64(%rsp), %xmm1
	comisd	%xmm0, %xmm1
	jbe	.L135
	movapd	%xmm1, %xmm0
	movl	$2, %ecx
	movsd	72(%rsp), %xmm1
	comisd	%xmm0, %xmm1
	jbe	.L137
	movapd	%xmm1, %xmm0
	movl	$3, %ecx
	movsd	80(%rsp), %xmm1
	comisd	%xmm0, %xmm1
	jbe	.L139
	movapd	%xmm1, %xmm0
	movl	$4, %ecx
	movsd	88(%rsp), %xmm1
	comisd	%xmm0, %xmm1
	jbe	.L141
	movapd	%xmm1, %xmm0
	movl	$5, %ecx
	movsd	96(%rsp), %xmm1
	comisd	%xmm0, %xmm1
	jbe	.L143
	movapd	%xmm1, %xmm0
	movl	$6, %ecx
	movsd	104(%rsp), %xmm1
	comisd	%xmm0, %xmm1
	jbe	.L145
	movapd	%xmm1, %xmm0
	movl	$7, %ecx
	movsd	112(%rsp), %xmm1
	comisd	%xmm0, %xmm1
	jbe	.L147
	movapd	%xmm1, %xmm0
	movl	$8, %ecx
	movsd	120(%rsp), %xmm1
	# r8d = label byte of this image
	movq	8(%rsp), %rax
	comisd	%xmm0, %xmm1
	movzbl	(%rax,%r13), %r8d
	jbe	.L168
	movl	$9, %ecx
	# __printf_chk(1, .LC20, index, best index, label byte)
	movl	%r14d, %edx
	leaq	.LC20(%rip), %rsi
	xorl	%eax, %eax
	addq	$1, %r13
	movl	$1, %edi
	addq	$784, %rbx
	call	__printf_chk@PLT
	cmpq	$10, %r13
	jne	.L152
	# release both buffers
	movq	24(%rsp), %rdi
	call	free@PLT
	movq	8(%rsp), %rdi
	call	free@PLT
	movq	815320(%rsp), %rax
	subq	%fs:40, %rax
	jne	.L170
	addq	$815336, %rsp
	.cfi_def_cfa_offset 56
	# return 0
	xorl	%eax, %eax
	popq	%rbx
	.cfi_def_cfa_offset 48
	popq	%rbp
	.cfi_def_cfa_offset 40
	popq	%r12
	.cfi_def_cfa_offset 32
	popq	%r13
	.cfi_def_cfa_offset 24
	popq	%r14
	.cfi_def_cfa_offset 16
	popq	%r15
	.cfi_def_cfa_offset 8
	# count mismatch: fwrite(.LC17, 1, 41, stderr) then exit(1)
	movq	stderr(%rip), %rcx
	movl	$41, %edx
	movl	$1, %esi
	leaq	.LC17(%rip), %rdi
	call	fwrite@PLT
	movl	$1, %edi
	call	exit@PLT
	call	__stack_chk_fail@PLT
	.size	main, .-main
	# Constant pool and file trailer. Each pair of .long values below is one
	# little-endian IEEE-754 double (low word first).
	# NOTE(review): the labels naming each constant are not present in this
	# listing; the ".LCn" names given below are inferred from their order and
	# from how the code uses them -- confirm against the real file.
	.section	.rodata.cst8,"aM",@progbits,8
	.align 8
	# 0x41DFFFFFFFC00000 = 2147483647.0 (presumably .LC1)
	.long	-4194304
	.long	1105199103
	# .LC2 is an alias for the first element of .LC7
	.set	.LC2,.LC7
	.section	.rodata.cst16,"aM",@progbits,16
	.align 16
	# only bit 63 set in the low double: a sign-bit mask (presumably .LC3)
	.long	0
	.long	-2147483648
	.long	0
	.long	0
	.section	.rodata.cst8
	.align 8
	# 0x406FE00000000000 = 255.0 (presumably .LC4)
	.long	0
	.long	1081073664
	.section	.rodata.cst16
	.align 16
	# four 32-bit integers 0..3 (presumably .LC6)
	.long	0
	.long	1
	.long	2
	.long	3
	.align 16
	# two doubles 1.0, 1.0 (presumably .LC7)
	.long	0
	.long	1072693248
	.long	0
	.long	1072693248
	.align 16
	# four 32-bit integers 4..7 (presumably .LC8)
	.long	4
	.long	5
	.long	6
	.long	7
	.section	.rodata.cst8
	.align 8
	# 0x3F847AE147AE147B = 0.01 (presumably .LC9)
	.long	1202590843
	.long	1065646817
	.align 8
	# 0x4059000000000000 = 100.0 (presumably .LC18)
	.long	0
	.long	1079574528
	.ident	"GCC: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0"
	.section	.note.GNU-stack,"",@progbits
	# NOTE(review): the remaining lines reference local labels 0-4 that are
	# not present in this listing; they look like the body of a GNU property
	# note whose own .section directive is also missing -- confirm.
	.align 8
	.long	1f - 0f
	.long	4f - 1f
	.long	5
	.string	"GNU"
	.align 8
	.long	0xc0000002
	.long	3f - 2f
	.long	0x3
	.align 8

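In conclusion, the executable built from the -O3 assembly ran noticeably faster in this race. One caveat: gcc always translates C to assembly on its way to an executable, so the speedup most likely comes from the -O3 optimization flag rather than from the assembly step itself; compiling neural_network.c directly with -O3 should give a similar result. Let me know of any improvements to make. I was also able to update the GitHub repo. -Henry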

