Last active
June 22, 2017 07:45
-
-
Save pervognsen/d62c6aac440ac85d94bd25b7c6d0f2ed to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// This was mostly done as a fun experiment to see how much I could minimize LUT count and combinational delays.
// Vivado synthesizes this to 17-18 LUTs for the default parameters (32-bit addresses, 8 KB cache, 32-byte cachelines).
// About half those LUTs are for the tag comparator. Note that the LUT count is carefully designed to be independent
// of the cacheline width by avoiding any output muxes (e.g. rather than forwarding the fill response, which would
// create an output mux between the fill response and the cache array, it just repeats the cache lookup). The downside
// of this general kind of design is that it takes 3 cycles to report a hit. You could easily do a design with combinational
// outputs that produces a result in 1 cycle, albeit with very restrictive setup times and combinational output delays.
//
// See if you can reduce the LUT count further! (Yes, it's silly, since the cacheline-to-word shifter is going to dominate.)

// Assertions are compiled out by default; redefine to a real check for simulation.
`define ASSERT(x)

// Direct-mapped, read-only cache with a valid/ready-style handshake on both the
// CPU-facing request/response interface and the memory-facing fill interface.
// One request is in flight at a time; a hit takes 3 cycles to report.
module cache#(
    parameter ADDR_WIDTH = 32 - 5,              // cacheline address width (byte address minus line-offset bits)
    parameter DATA_WIDTH = 32*8,                // cacheline width in bits
    parameter CACHE_SIZE = 256,                 // number of cachelines
    parameter INDEX_WIDTH = $clog2(CACHE_SIZE),
    parameter TAG_WIDTH = ADDR_WIDTH - INDEX_WIDTH
)(
    input clk,
    input cache_req_enable,
    input [ADDR_WIDTH-1:0] cache_req_addr,
    output reg cache_req_ready,
    output reg cache_resp_enable,
    output [DATA_WIDTH-1:0] cache_resp_data,
    output reg mem_req_enable,
    output [ADDR_WIDTH-1:0] mem_req_addr,
    input mem_req_ready,
    input mem_resp_enable,
    input [DATA_WIDTH-1:0] mem_resp_data,
    output mem_resp_ready
);
    localparam IDLE = 0, START_READ = 1, READ = 2, HIT = 3, START_FILL = 4, FILL = 5;

    reg [DATA_WIDTH-1:0] data_array[0:CACHE_SIZE-1];
    reg [TAG_WIDTH-1:0] tag_array[0:CACHE_SIZE-1];
    // BUGFIX: without per-line valid bits, a cold cacheline whose uninitialized
    // (or BRAM-default-zero) tag happens to equal the request tag is reported as
    // a hit and serves garbage data instead of triggering a fill.
    reg valid_array[0:CACHE_SIZE-1];

    reg [2:0] state;
    reg [ADDR_WIDTH-1:0] req_addr;      // latched address of the request in flight
    reg [DATA_WIDTH-1:0] read_data;     // registered data-array read; doubles as the response data
    reg [TAG_WIDTH-1:0] read_tag;       // registered tag-array read
    reg read_valid;                     // registered valid bit for the indexed line

    wire [TAG_WIDTH-1:0] req_tag = req_addr[ADDR_WIDTH-1:INDEX_WIDTH];
    wire [INDEX_WIDTH-1:0] req_index = req_addr[INDEX_WIDTH-1:0];

    assign cache_resp_data = read_data;
    assign mem_req_addr = req_addr;
    assign mem_resp_ready = 1;          // fill responses are always accepted immediately

    integer i;
    initial begin
        cache_resp_enable = 0;
        mem_req_enable = 0;
        cache_req_ready = 1;
        state = IDLE;
        // Start with every line invalid so cold lookups always miss.
        for (i = 0; i < CACHE_SIZE; i = i + 1)
            valid_array[i] = 0;
    end

    always @(posedge clk) begin
        case (state)
        // Wait for a request, then latch its address.
        IDLE: begin
            `ASSERT(cache_req_ready)
            if (cache_req_enable) begin
                req_addr <= cache_req_addr;
                cache_req_ready <= 0;
                state <= START_READ;
            end
        end
        // Registered lookup of the tag/data/valid arrays for the latched index.
        START_READ: begin
            read_data <= data_array[req_index];
            read_tag <= tag_array[req_index];
            read_valid <= valid_array[req_index];
            state <= READ;
        end
        // Tag compare: a valid match responds directly; otherwise start a fill.
        READ: begin
            if (read_valid && read_tag == req_tag) begin
                cache_resp_enable <= 1;
                state <= HIT;
            end else begin
                mem_req_enable <= 1;
                state <= START_FILL;
            end
        end
        // One-cycle response pulse, then become ready for the next request.
        HIT: begin
            cache_resp_enable <= 0;
            cache_req_ready <= 1;
            state <= IDLE;
        end
        // Hold the fill request asserted until memory accepts it.
        START_FILL: begin
            `ASSERT(mem_req_enable)
            if (mem_req_ready) begin
                mem_req_enable <= 0;
                state <= FILL;
            end
        end
        // Install the filled line, then repeat the lookup rather than forwarding
        // the response (this avoids an output mux between fill data and the array).
        FILL: begin
            `ASSERT(mem_resp_ready)
            if (mem_resp_enable) begin
                data_array[req_index] <= mem_resp_data;
                tag_array[req_index] <= req_tag;
                valid_array[req_index] <= 1;
                state <= START_READ;
            end
        end
        endcase
    end
endmodule
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks for the replies!
What I was thinking was a cache module which includes the serializer and deserializer you described in it. I.e. a memory address calculator (row start + offset) + a counter for the current word. The cache would stay in the FILL state for N cycles, where N = cache-line size / word size. The larger the difference between the two, the larger the required counter so more LUTs. That's why I asked if the LUT count will change depending on those :)
Is there a reason to have a separate component do the serialization and deserialization? Or is it just a matter of keeping things as simple as possible? Would this particular serializer instance in front of the cache be able to be used for something else other than this cache?
I forgot about the critical-word-first fill optimization. I guess having the address calculator inside the cache and fetching individual words from memory, as I mentioned above, would help in this case. Just have a special FILL_CRITICAL_WORD state which will be executed first and then switch to the existing FILL state (or something like that; I guess it depends on how your RAM behaves, if it supports burst reads and the size of those bursts).
Thanks again for the explanations. Looking forward to your streams/course to learn new stuff :)