Verilog -- 改進的Booth乘法(基4)
@(verilog)
1. 背景
之前已經介紹過Booth乘法算法的基本原理以及代碼,實際上之前的算法是基2的booth算法,每次對乘數編碼都只考慮兩位。因此在實際實現時往往效率不高:考慮最壞情況,使用基2的booth算法計算兩個8位數據的乘法,除了編碼複雜之外,計算時仍需要累加8個部分積。可見最壞情況下需要累加的部分積個數跟普通陣列乘法器一樣,因此代價不低。
為了減少需要累加的部分積個數,改進的Booth乘法現在基本很少採用基2的形式,而是採用基4甚至基8的形式。下面主要介紹一下基4的booth算法。
2. 原理
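跟基2的算法一樣,假設A和B是乘數和被乘數(均為2n位補碼數),且有:
\[A = (a_{2n+1}a_{2n})\,a_{2n-1}a_{2n-2}\cdots a_{1}a_{0}\,(a_{-1})\]
\[B = b_{2n-1}b_{2n-2}\cdots b_{1}b_{0}\]
其中,\(a_{-1}\)是末尾補的0,\(a_{2n},a_{2n+1}\)是擴展的兩位符號位。可以將乘數A表示為:
\[A = -a_{2n+1}\cdot 2^{2n+1} + \sum_{i=0}^{2n} a_{i}\cdot 2^{i} = \sum_{k=0}^{n}\left(a_{2k-1}+a_{2k}-2a_{2k+1}\right)\cdot 4^{k}\]
同樣可以將兩數的積表示為:
\[A\times B = \sum_{k=0}^{n}\left(a_{2k-1}+a_{2k}-2a_{2k+1}\right)\cdot B\cdot 4^{k}\]
其中括號內的係數 \(\left(a_{2k-1}+a_{2k}-2a_{2k+1}\right)\)(原圖中以紅色標示的部分)即為基4booth的編碼方式,其取值只可能是 \(0,\pm1,\pm2\)。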
3. 算法實現
有了公式就可以比較方便地推導算法步驟了,首先給出基4booth的編碼表:
乘數位 \(a_{2k+1}a_{2k}a_{2k-1}\)(編碼值為 \(a_{2k-1}+a_{2k}-2a_{2k+1}\)) | 編碼操作 |
---|---|
000 | 0 |
001 | +B |
010 | +B |
011 | +2B |
100 | -2B |
101 | -B |
110 | -B |
111 | 0 |
每次操作過後,部分積都要右移兩位(即每次處理乘數的兩位)。
示例:
\(A = -7,B = -3\)
首先,計算編碼需要的操作數:
\(+B = 1111 1101\)
\(-B = 0000 0011\)
\(+2B = 1111 1010\)
\(-2B = 0000 0110\)
下面對\(A\)進行編碼:
\(A => (11) 1001 (0)=> (111) (100) (010)=> (0) (-2B) (+B)\)
計算過程:
```
    1111 1101      +B
+   0001 10        -2B << 2(左移兩位)
---------------
=   0001 0101      = 21
```
可以發現,對於8bit的乘法,基4的booth算法最多只需要計算4個部分積的累加,極大簡化了求和邏輯。
4. Verilog 代碼
verilog代碼參考的是fanhu大神寫的,鏈接: https://pan.baidu.com/s/1bR0SK0NeeaenLC73E1kKNg 提取碼: 4kat
下面的代碼針對上面的做了部分修改。
`timescale 1ns/1ps
// -----------------------------------------------------------------------------
// booth_radix4
//
// Sequential signed (two's-complement) multiplier using radix-4 (modified)
// Booth recoding. Each iteration inspects three bits of the multiplier
// (two new bits plus the previous top bit), adds 0, +/-B or +/-2B to the
// accumulator, and then arithmetically shifts the working register right
// by two bits.
//
// Parameters
//   WIDTH_M : width of `multiplicand`
//   WIDTH_R : width of `multiplier`. The FSM runs WIDTH_R/2 add/shift
//             iterations, each consuming two multiplier bits, so WIDTH_R
//             is expected to be even.
//
// Ports
//   clk, rstn    : clock and asynchronous active-low reset
//   vld_in       : must be held high for the whole operation; driving it
//                  low forces the FSM back to IDLE on the next clock edge
//   multiplicand : signed operand B, sampled while the FSM is in IDLE
//   multiplier   : signed operand A, sampled while the FSM is in IDLE
//   mul_out      : signed product, valid while `done` is high
//   done         : set by the OUTPUT state, cleared again by IDLE
//
// Working register layout (p_dct, WIDTH_M+WIDTH_R+3 bits):
//   [0]                        a(-1), the appended Booth bit
//   [WIDTH_R:1]                multiplier (shifted out) / low product bits
//   [WIDTH_M+WIDTH_R+2 : WIDTH_R+1]  accumulator, WIDTH_M+2 bits so that
//                              +/-2B and the running sum cannot overflow
// -----------------------------------------------------------------------------
module booth_radix4 #(
    parameter WIDTH_M = 8,
    parameter WIDTH_R = 8
)(
    input                             clk,
    input                             rstn,
    input                             vld_in,
    input      [WIDTH_M-1:0]          multiplicand,
    input      [WIDTH_R-1:0]          multiplier,
    output     [WIDTH_M+WIDTH_R-1:0]  mul_out,
    output reg                        done
);

    // FSM state encoding (local: not meant to be overridden by an instance).
    localparam IDLE   = 2'b00,
               ADD    = 2'b01,
               SHIFT  = 2'b11,
               OUTPUT = 2'b10;

    // Index of the MSB of the working register and its operand images.
    localparam P_MSB    = WIDTH_M + WIDTH_R + 2;
    // Number of add/shift iterations (two multiplier bits per iteration).
    localparam NUM_ITER = WIDTH_R / 2;

    reg [1:0]         current_state, next_state;
    reg [P_MSB:0]     add1;     // +B  aligned to the accumulator field
    reg [P_MSB:0]     sub1;     // -B  aligned to the accumulator field
    reg [P_MSB:0]     add_x2;   // +2B aligned to the accumulator field
    reg [P_MSB:0]     sub_x2;   // -2B aligned to the accumulator field
    reg [P_MSB:0]     p_dct;    // accumulator | multiplier | a(-1)
    reg [WIDTH_R-1:0] count;    // number of ADD steps performed so far

    // Multiplicand widened to WIDTH_M+2 bits. The sign bit is taken from
    // the multiplicand's own MSB (WIDTH_M-1), so the module also works when
    // WIDTH_M and WIDTH_R differ.
    wire [WIDTH_M+1:0] mcand_x1 = {{2{multiplicand[WIDTH_M-1]}}, multiplicand};
    wire [WIDTH_M+1:0] mcand_x2 = {multiplicand[WIDTH_M-1], multiplicand, 1'b0};

    // ------------------------------------------------------------------
    // State register. Non-blocking assignments throughout so the clocked
    // update cannot race with other processes reading current_state.
    // ------------------------------------------------------------------
    always @(posedge clk or negedge rstn)
        if (!rstn)        current_state <= IDLE;
        else if (!vld_in) current_state <= IDLE;   // abort when vld_in drops
        else              current_state <= next_state;

    // ------------------------------------------------------------------
    // Next-state logic.
    // ------------------------------------------------------------------
    always @* begin
        next_state = IDLE;
        case (current_state)
            IDLE   : next_state = vld_in ? ADD : IDLE;
            ADD    : next_state = SHIFT;
            SHIFT  : next_state = (count == NUM_ITER) ? OUTPUT : ADD;
            OUTPUT : next_state = IDLE;
            default: next_state = IDLE;
        endcase
    end

    // ------------------------------------------------------------------
    // Datapath.
    // ------------------------------------------------------------------
    always @(posedge clk or negedge rstn) begin
        if (!rstn) begin
            add1   <= 0;
            sub1   <= 0;
            add_x2 <= 0;
            sub_x2 <= 0;
            p_dct  <= 0;
            count  <= 0;
            done   <= 0;
        end else begin
            case (current_state)
                IDLE: begin
                    // Capture the operands and pre-compute the four
                    // possible addends, each placed above the multiplier
                    // field (WIDTH_R bits) and the a(-1) bit.
                    add1   <= { mcand_x1, {(WIDTH_R+1){1'b0}}};
                    sub1   <= {-mcand_x1, {(WIDTH_R+1){1'b0}}};
                    add_x2 <= { mcand_x2, {(WIDTH_R+1){1'b0}}};
                    sub_x2 <= {-mcand_x2, {(WIDTH_R+1){1'b0}}};
                    // Accumulator starts at zero; a(-1) starts at zero.
                    p_dct  <= {{(WIDTH_M+1){1'b0}}, multiplier, 1'b0};
                    count  <= 0;
                    done   <= 0;
                end

                ADD: begin
                    // Radix-4 Booth recoding of {a(2k+1), a(2k), a(2k-1)}.
                    case (p_dct[2:0])
                        3'b000, 3'b111: p_dct <= p_dct;           //  0
                        3'b001, 3'b010: p_dct <= p_dct + add1;    // +B
                        3'b101, 3'b110: p_dct <= p_dct + sub1;    // -B
                        3'b100:         p_dct <= p_dct + sub_x2;  // -2B
                        3'b011:         p_dct <= p_dct + add_x2;  // +2B
                        default:        p_dct <= p_dct;
                    endcase
                    count <= count + 1;
                end

                SHIFT: begin
                    // Arithmetic shift right by two bits.
                    p_dct <= {{2{p_dct[P_MSB]}}, p_dct[P_MSB:2]};
                end

                OUTPUT: begin
                    done <= 1;
                end

                default: ;
            endcase
        end
    end

    // Drop the a(-1) bit and the two guard bits at the top.
    assign mul_out = p_dct[WIDTH_M+WIDTH_R:1];

endmodule
testbench:
`timescale 1ns/1ps
// -----------------------------------------------------------------------------
// booth_radix4_tb
//
// Exhaustive self-checking testbench for booth_radix4. Both operands are
// `TEST_WIDTH bits wide; every (multiplier, multiplicand) pair is applied in
// turn and the DUT product is compared with a signed reference multiply.
// The run stops at the first mismatch; otherwise the number of matching
// products is printed at the end.
// -----------------------------------------------------------------------------
module booth_radix4_tb();

    `define TEST_WIDTH 4
    parameter WIDTH_M = `TEST_WIDTH;
    parameter WIDTH_R = `TEST_WIDTH;

    // DUT stimulus
    reg                         clk;
    reg                         rstn;
    reg                         vld_in;
    reg  [WIDTH_M-1:0]          multiplicand;
    reg  [WIDTH_R-1:0]          multiplier;

    // DUT responses
    wire [WIDTH_M+WIDTH_R-1:0]  mul_out;
    wire                        done;

    // Inputs: must be declared signed so the reference multiply is signed;
    // output: no such requirement.
    wire signed [`TEST_WIDTH-1:0] m1_in = multiplier[`TEST_WIDTH-1:0];
    wire signed [`TEST_WIDTH-1:0] m2_in = multiplicand[`TEST_WIDTH-1:0];

    // Reference products (signed view and raw-bits view).
    reg signed [2*`TEST_WIDTH-1:0] product_ref;
    reg        [2*`TEST_WIDTH-1:0] product_ref_u;

    integer r_idx, m_idx;   // outer (multiplier) / inner (multiplicand) loop counters
    integer num_good;       // number of products that matched the reference

    // Device under test.
    booth_radix4 #(
        .WIDTH_M      ( WIDTH_M      ),
        .WIDTH_R      ( WIDTH_R      )
    ) U_BOOTH_RADIX4_0 (
        .clk          ( clk          ),
        .rstn         ( rstn         ),
        .vld_in       ( vld_in       ),
        .multiplicand ( multiplicand ),
        .multiplier   ( multiplier   ),
        .mul_out      ( mul_out      ),
        .done         ( done         )
    );

    // Free-running clock, period 2 time units.
    always begin
        #1 clk = ~clk;
    end

    // Stimulus and checking.
    initial begin
        clk          = 0;
        vld_in       = 0;
        multiplicand = 0;
        multiplier   = 0;
        num_good     = 0;

        // Active-low reset pulse.
        rstn = 1;
        #4;
        rstn = 0;
        #2;
        rstn = 1;
        repeat (2) @(posedge clk);

        for (r_idx = 0; r_idx < (1<<`TEST_WIDTH); r_idx = r_idx + 1) begin
            for (m_idx = 0; m_idx < (1<<`TEST_WIDTH); m_idx = m_idx + 1) begin
                vld_in = 1;
                // Wait for the previous done pulse to clear, then for the
                // new result.
                wait (done == 0);
                wait (done == 1);

                product_ref   = m1_in * m2_in;
                product_ref_u = m1_in * m2_in;

                if (product_ref != mul_out) begin
                    $display("multiplier = %d multiplicand = %d proudct =%d",m1_in,m2_in,mul_out);
                    @(posedge clk);
                    $stop;
                end
                else begin
                    num_good = num_good + 1;
                end

                // Next multiplicand; wraps to 0 after the last value.
                multiplicand = multiplicand + 1;
            end
            multiplier = multiplier + 1;
        end

        $display("sim done. num good = %d",num_good);
        $finish;
    end

    // Waveform dumping (FSDB tasks need the Verdi/Novas PLI to be loaded).
    initial begin
        $fsdbDumpvars();
        $fsdbDumpMDA();
        $dumpvars();
    end

endmodule
仿真波形圖:
首先num_good表示正確的計算數目,因為上面我只測試了4位寬度的所有有符號乘法,因此總的計算個數為16*16=256個,這邊顯示全部正確。

下面是波形圖:

PS:跟之前寫的基2的算法相比,這裡如果位寬改為10,經過仿真得到的計算周期為12,周期幾乎比基2減少了一半。(之前寫的基2在計算10bit時需要21個周期)