RAM-Based Shift Register (ALTSHIFT_TAPS) IP Core-實現3X3像素陣列存儲


最近想要實現CNN的FPGA加速處理。首先要明確,在CNN的計算過程中,卷積運算是最耗時間的,因此只要將卷積運算在FPGA上並行實現,即可完成部分運算的加速。

那麼對於卷積的FPGA實現,首先要考慮的是卷積子模板具體如何實現。我們在matlab或者c中實現比如3X3的子模板時,只要用一個數組即可將模板的數據存儲起來,而在FPGA中則有以下三種方法:

  1. 用2個或3個RAM存儲3X3像素陣列
  2. 用2個或3個FIFO存儲3X3像素陣列
  3. 用shift_RAM移位存儲3X3像素陣列

而shift_RAM就好像是為陣列的實現量身定做的一般。

shift_RAM的配置參數主要有以下幾個:

 

手冊中可以參考理解的一個非常形象的圖如下:

 

進一步對單獨一個IP核進行仿真後得到:

其中上述參數設置分別為8,2,3。在上述仿真圖中,相當於把一個矩陣A以移位寄存的方式通過row3_data送入RAM,然後分三行輸出,在游標所示處就可以開始輸出3X3矩陣:

    0,56,-122

  92,50,-57

-58,-13,-61

以下部分是加入了對視頻信號處理控制后的代碼實現過程:

/*-----------------------------------------------------------------------

CONFIDENTIAL IN CONFIDENCE
This confidential and proprietary software may be only used as authorized
by a licensing agreement from CrazyBingo (Thereturnofbingo).
In the event of publication, the following notice is applicable:
Copyright (C) 2011-20xx CrazyBingo Corporation
The entire notice above must be reproduced on all authorized copies.
Author                :        CrazyBingo
Technology blogs     :         http://blog.chinaaet.com/crazybingo
Email Address         :         thereturnofbingo@gmail.com
Filename            :        VIP_Matrix_Generate_3X3_8Bit.v
Data                :        2014-03-19
Description            :        Generate 8Bit 3X3 Matrix for Video Image Processor.
                            Give up the 1th and 2th row edge data caculate for simple process
                            Give up the 1th and 2th point of 1 line for simple process
Modification History    :
Data            By            Version            Change Description
=========================================================================
13/05/26        CrazyBingo    1.0                Original
14/03/16        CrazyBingo    2.0                Modification
-*/ 

`timescale 1ns/1ns
//-----------------------------------------------------------------------------
// VIP_Matrix_Generate_3X3_8Bit
//
// Builds a sliding 3x3 window of 8-bit pixels from a pixel stream.
//   * The incoming pixel is registered into row3_data on every enabled pixel.
//   * row3_data is pushed into a two-tap line shift RAM (Line_Shift_RAM_8Bit,
//     defined outside this file) whose taps provide row2_data and row1_data.
//   * Three 3-stage shift registers (one per row) turn the three row streams
//     into the nine window outputs matrix_p11 .. matrix_p33.
//   * vsync / href / clken are delayed by two clk cycles and driven out next
//     to the window.
//
// NOTE(review): the exact line-to-tap relationship depends on how
// Line_Shift_RAM_8Bit is configured (tap distance = RAM_Length); that module
// is not visible here, so the "previous line" wording below follows the
// original author's comments and should be confirmed against the IP settings.
//-----------------------------------------------------------------------------
module VIP_Matrix_Generate_3X3_8Bit
#(
    parameter    [9:0]    IMG_HDISP = 10'd640,    //default 640; forwarded to the line shift RAM as RAM_Length
    parameter    [9:0]    IMG_VDISP = 10'd480     //default 480; not referenced inside this module
)
(
    //global clock / reset
    input                clk,                    //pixel clock
    input                rst_n,                  //asynchronous reset, active low

    //incoming pixel stream
    input                per_frame_vsync,        //input vsync
    input                per_frame_href,         //input href (line valid)
    input                per_frame_clken,        //input pixel enable
    input        [7:0]   per_img_Y,              //input 8-bit pixel (brightness)

    //outgoing sync signals and 3x3 window
    output               matrix_frame_vsync,     //per_frame_vsync delayed by two clk cycles
    output               matrix_frame_href,      //per_frame_href  delayed by two clk cycles
    output               matrix_frame_clken,     //per_frame_clken delayed by two clk cycles
    output    reg [7:0]  matrix_p11, matrix_p12, matrix_p13,    //window row 1 (fed from row1_data)
    output    reg [7:0]  matrix_p21, matrix_p22, matrix_p23,    //window row 2 (fed from row2_data)
    output    reg [7:0]  matrix_p31, matrix_p32, matrix_p33     //window row 3 (fed from row3_data)
);

//-----------------------------------------------------------------------------
// Row data sources
//-----------------------------------------------------------------------------
wire    [7:0]    row1_data;    //tap taps1x of the line shift RAM
wire    [7:0]    row2_data;    //tap taps0x of the line shift RAM
reg     [7:0]    row3_data;    //registered copy of the current input pixel

// Capture the input pixel whenever the pixel enable is high; the register
// simply keeps its value on all other cycles.
always @(posedge clk or negedge rst_n)
begin
    if (!rst_n)
        row3_data <= 0;
    else if (per_frame_clken)
        row3_data <= per_img_Y;
end

//-----------------------------------------------------------------------------
// Line shift RAM: advances only on enabled pixels and exposes two taps.
// The optional aclr port of the IP is intentionally left unconnected.
//-----------------------------------------------------------------------------
Line_Shift_RAM_8Bit
#(
    .RAM_Length    (IMG_HDISP)
)
u_Line_Shift_RAM_8Bit
(
    .clock       (clk),
    .clken       (per_frame_clken),    //shift only on valid pixels

    .shiftin     (row3_data),          //current row data in
    .taps0x      (row2_data),          //original note: previous row
    .taps1x      (row1_data),          //original note: row before the previous one
    .shiftout    ()
);

//-----------------------------------------------------------------------------
// Two-stage delay line for the sync signals.
//   bit [0] : input delayed by one clk  -> gates the window shift registers
//   bit [1] : input delayed by two clks -> drives the module outputs
// The original author's note: storing the data costs one clock, so the
// window read enable has to be offset accordingly.
//-----------------------------------------------------------------------------
reg    [1:0]    vsync_dly;
reg    [1:0]    href_dly;
reg    [1:0]    clken_dly;

always @(posedge clk or negedge rst_n)
begin
    if (!rst_n)
        {vsync_dly, href_dly, clken_dly} <= 6'b0;
    else
        begin
        vsync_dly <= {vsync_dly[0], per_frame_vsync};
        href_dly  <= {href_dly[0],  per_frame_href};
        clken_dly <= {clken_dly[0], per_frame_clken};
        end
end

wire    href_d1  = href_dly[0];     //one-cycle delayed href
wire    clken_d1 = clken_dly[0];    //one-cycle delayed pixel enable

assign    matrix_frame_vsync = vsync_dly[1];
assign    matrix_frame_href  = href_dly[1];
assign    matrix_frame_clken = clken_dly[1];

//-----------------------------------------------------------------------------
// 3x3 window registers.
// Each window row is a 3-deep shift register: on an enabled pixel inside a
// line the row shifts left by one column and the newest sample of that row
// enters at column 3. Outside a line (href_d1 low) the whole window is
// cleared to zero. When href_d1 is high but clken_d1 is low, nothing is
// assigned and the window holds its value.
//-----------------------------------------------------------------------------
always @(posedge clk or negedge rst_n)
begin
    if (!rst_n)
        begin
        {matrix_p11, matrix_p12, matrix_p13,
         matrix_p21, matrix_p22, matrix_p23,
         matrix_p31, matrix_p32, matrix_p33} <= 72'h0;
        end
    else if (href_d1)
        begin
        if (clken_d1)
            begin
            //row 1 <- row1_data
            matrix_p11 <= matrix_p12;
            matrix_p12 <= matrix_p13;
            matrix_p13 <= row1_data;
            //row 2 <- row2_data
            matrix_p21 <= matrix_p22;
            matrix_p22 <= matrix_p23;
            matrix_p23 <= row2_data;
            //row 3 <- row3_data
            matrix_p31 <= matrix_p32;
            matrix_p32 <= matrix_p33;
            matrix_p33 <= row3_data;
            end
        end
    else
        begin
        {matrix_p11, matrix_p12, matrix_p13,
         matrix_p21, matrix_p22, matrix_p23,
         matrix_p31, matrix_p32, matrix_p33} <= 72'h0;
        end
end

endmodule
//注意:這裡得到的每一行的第一、第二個像素都沒有用到,而且最後一行的像素沒有被運算。

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM