; NASM syntax, 16-bit code assembled for load offset 0x0100.
; Passes two 32-bit constants to `multiply` on the stack and stores the
; dx:ax value it returns (the low 32 bits of the product) in `result`.
[org 0x0100]
        jmp     start                       ; hop over the data that follows

%define DOS_TERMINATE_OK 0x4c00             ; AH = 4Ch (terminate), AL = 0 (exit code)

multiplicand:   dd      123122              ; x = 0x0001E0F2
multiplier:     dd      66341               ; y = 0x00010325
result:         dd      0                   ; gets dx:ax; x*y = 8168036602 needs 33 bits,
                                            ; so only the low dword (0xE6DA58FA) lands here

start:
        ; Four argument words, pushed so that x sits nearest the return
        ; address: x low, x high, y low, y high (ascending addresses).
        push    word [multiplier + 2]       ; y high word
        push    word [multiplier]           ; y low word
        push    word [multiplicand + 2]     ; x high word
        push    word [multiplicand]         ; x low word
        call    multiply
        add     sp, 4 * 2                   ; caller discards the four argument words

        mov     word [result], ax           ; low word of the returned value
        mov     word [result + 2], dx       ; high word of the returned value

        mov     ax, DOS_TERMINATE_OK
        int     0x21
;-----------------------------------------------------------------------
; multiply -- low 32 bits of the product of two 32-bit values
; In (stack words, caller removes them):
;     [bp+4]  x low      [bp+6]  x high
;     [bp+8]  y low      [bp+10] y high
; Out:   dx:ax = (x * y) mod 2^32
; Clobb: cx, flags
;-----------------------------------------------------------------------
%define X_LO word [bp+4]
%define X_HI word [bp+6]
%define Y_LO word [bp+8]
%define Y_HI word [bp+10]

multiply:
        push    bp
        mov     bp, sp

        ; Only xl*yl contributes a full 32 bits; the two cross products
        ; are shifted up by 16, so just their low words reach dx.
        mov     ax, Y_LO
        mul     X_LO                    ; dx:ax = xl * yl

        mov     cx, Y_HI
        imul    cx, X_LO                ; cx = low word of xl * yh
        add     dx, cx

        mov     cx, Y_LO
        imul    cx, X_HI                ; cx = low word of xh * yl
        add     dx, cx

        pop     bp                      ; sp was never moved, so it still equals bp
        ret

%undef X_LO
%undef X_HI
%undef Y_LO
%undef Y_HI
It's not clear whether you need a 64-bit result; the code above produces only the low 32 bits of the product.
A 64-bit version may look like this:
; NASM syntax, 16-bit code assembled for load offset 0x0100.
; Passes two 32-bit constants plus the address of `result` to `multiply`,
; which writes the full 64-bit product into the four words at `result`.
[org 0x0100]
        jmp     start                       ; hop over the data that follows

%define DOS_TERMINATE_OK 0x4c00             ; AH = 4Ch (terminate), AL = 0 (exit code)

multiplicand:   dd      123122              ; x = 0x0001E0F2
multiplier:     dd      66341               ; y = 0x00010325
result:         dd      0, 0                ; 8 bytes: 64-bit product, low dword first

start:
        ; Five argument words; the result pointer is pushed last, so it is
        ; the one nearest the return address, followed by x low, x high,
        ; y low, y high at ascending addresses.
        push    word [multiplier + 2]       ; y high word
        push    word [multiplier]           ; y low word
        push    word [multiplicand + 2]     ; x high word
        push    word [multiplicand]         ; x low word
        push    word result                 ; address of the 8-byte destination
        call    multiply                    ; returns the same pointer in ax (unused here)
        add     sp, 5 * 2                   ; caller discards the five argument words

        mov     ax, DOS_TERMINATE_OK
        int     0x21
;-----------------------------------------------------------------------
; multiply -- full 64-bit product of two 32-bit unsigned values
; In (stack words, caller removes them):
;     [bp+4]  pointer to an 8-byte result buffer (addressed through bx)
;     [bp+6]  x low      [bp+8]  x high
;     [bp+10] y low      [bp+12] y high
; Out:   four words at the result pointer, least significant first;
;        ax = the result pointer
; Saved: bx, bp          Clobb: dx, flags
;-----------------------------------------------------------------------
%define RES_PTR word [bp+4]
%define X_LO    word [bp+6]
%define X_HI    word [bp+8]
%define Y_LO    word [bp+10]
%define Y_HI    word [bp+12]
%define RES_W0  word [bx]
%define RES_W1  word [bx+2]
%define RES_W2  word [bx+4]
%define RES_W3  word [bx+6]

multiply:
        push    bp
        mov     bp, sp
        push    bx
        mov     bx, RES_PTR

        ; xl * yl fills words 0 and 1.
        mov     ax, Y_LO
        mul     X_LO
        mov     RES_W0, ax
        mov     RES_W1, dx

        ; xl * yh lands on words 1 and 2. dx is at most 0xFFFE after a
        ; 16x16 multiply, so absorbing the carry cannot overflow it.
        mov     ax, Y_HI
        mul     X_LO
        add     RES_W1, ax
        adc     dx, 0
        mov     RES_W2, dx

        ; xh * yl also lands on words 1 and 2, and this time the sum can
        ; carry out of word 2; that carry becomes the initial word 3.
        mov     ax, Y_LO
        mul     X_HI
        add     RES_W1, ax
        adc     RES_W2, dx
        mov     RES_W3, 0               ; mov leaves CF intact
        adc     RES_W3, 0               ; word 3 = carry out of word 2

        ; xh * yh lands on words 2 and 3.
        mov     ax, Y_HI
        mul     X_HI
        add     RES_W2, ax
        adc     RES_W3, dx

        mov     ax, bx                  ; hand the result pointer back
        pop     bx                      ; after this pop sp equals bp again
        pop     bp
        ret

%undef RES_PTR
%undef X_LO
%undef X_HI
%undef Y_LO
%undef Y_HI
%undef RES_W0
%undef RES_W1
%undef RES_W2
%undef RES_W3
(It might be more efficient to keep the results of the last two multiplies in registers and add them there, which avoids storing them and then doing a memory-destination adc.)
Disclaimer: I have just backported the usual 32-bit convention, whereby an extra hidden argument points to a caller-reserved location for the result, and that pointer is also returned. This code works, but I have no idea whether 16-bit compilers really used this convention.