all.c
2.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
/*
C-code sketch of how RSP microcode will process a macroblock.
Different versions will make different I/O and computational
assumptions.
Version #1 assumptions:
Macroblocks are un-tokenized into a mbdata[NBLKS][64] area of 16-bit
words in DMEM. Which blocks are non-zero is in CBP in header.
iquant and idct routines work on single blocks per call.
No longer true:
motion-compensation and reference combining (F+B) occur
all at once in a separate routine. DMAs are assumed to
already be complete (results are inputs).
*/
/*
 * Per-macroblock header: eight `short` fields, laid out in the order
 * declared below.
 */
typedef struct mb_hdr_struct
{
    short mbtype;    /* type flags; proc_mb() masks this with MBTYPE_INTRA */
    short cbp;       /* coded block pattern: which blocks are non-zero */
    short quant;     /* quantizer value passed to iquant_calc_new_qmat() */
    short mvfx;      /* presumably forward motion vector, x -- TODO confirm */
    short mvfy;      /* presumably forward motion vector, y -- TODO confirm */
    short mvbx;      /* presumably backward motion vector, x -- TODO confirm */
    short mvby;      /* presumably backward motion vector, y -- TODO confirm */
    short block_num; /* packed as sMMMMMMMMMMMbbbb:
                          s = sign (<0 implies new MB)
                          M = Macroblock number (0-2047)
                          b = block number within MB */
} MB_header;
/*
 * One macroblock's working set: its header, its coefficient data, and four
 * byte buffers sized by the MC_*TILE_SIZE constants.
 * NBLKS, MC_YTILE_SIZE and MC_UVTILE_SIZE are not defined in the visible
 * part of this file.
 */
typedef struct mb_struct
{
/* Header for this macroblock (type, cbp, quant, motion vectors, block_num). */
MB_header header;
/* NBLKS blocks of 64 shorts each; member order fixes the struct layout. */
short mbdata[NBLKS][64];
/* Presumably the forward-reference luma (Y) tile -- TODO confirm. */
unsigned char fyref[MC_YTILE_SIZE];
/* Presumably the forward-reference chroma (UV) tile -- TODO confirm. */
unsigned char fuvref[MC_UVTILE_SIZE];
/* Presumably the backward-reference luma (Y) tile -- TODO confirm. */
unsigned char byref[MC_YTILE_SIZE];
/* Presumably the backward-reference chroma (UV) tile -- TODO confirm. */
unsigned char buvref[MC_UVTILE_SIZE];
} MacroBlock;
/* Compared with mbh.quant in proc_mb() to decide whether the quant matrix
   has to be recalculated. */
static short prev_quant;
/* Compared (MBTYPE_INTRA bit only) with mbh.mbtype in proc_mb() for the
   same decision. */
static short prev_mbtype;
/* Header of the macroblock that proc_mb() processes. */
static MB_header mbh;
/*
 * Process a Macroblock
 *
 * Works on the macroblock whose header is in the file-scope `mbh`; takes no
 * arguments and returns nothing.
 *
 * Steps:
 *   1. If the quantizer or the intra/non-intra class differs from the values
 *      remembered in prev_quant / prev_mbtype, recalculate the quant matrix
 *      and remember the new values.
 *   2. For a non-intra macroblock, call mc_calc_dma().
 *   3. For every block i whose bit is set in the header's cbp:
 *      mc_check_dma, iquant, idct, then mc_combine(i) if mc_dma_done_flag is
 *      set, otherwise idct_save(i).
 *   4. Keep calling mc_check_dma() until mc_dma_done_flag == 1, then
 *      recon_save().
 *
 * mc_dma_done_flag, NBLKS, MBTYPE_INTRA and the iquant / idct / idct_save /
 * recon_save helpers are not declared in the visible part of this file.
 */
void proc_mb(void)
{
    short quant  = mbh.quant;
    short mbtype = mbh.mbtype;
    short cbp    = mbh.cbp;   /* coded block pattern lives in the header */
    int   i;

    if (((mbtype & MBTYPE_INTRA) != (prev_mbtype & MBTYPE_INTRA)) ||
        (quant != prev_quant)) {
        iquant_calc_new_qmat(quant, mbtype & MBTYPE_INTRA);
        /* Remember what the matrix was built for, so the next macroblock
           with the same quant and intra class skips the recalculation. */
        prev_quant  = quant;
        prev_mbtype = mbtype;
    }

    if ((mbtype & MBTYPE_INTRA) == 0) {
        mc_calc_dma();
    }

    for (i = 0; i < NBLKS; i++) {
        if (!(cbp & (1 << i))) {
            continue;   /* block i is not flagged in cbp */
        }

        /* NOTE(review): for i == 0 the first argument is -1 -- confirm that
           mc_check_dma() expects that. */
        mc_check_dma(i - 1, cbp);
        iquant(i);
        idct(i);

        if (mc_dma_done_flag) {
            mc_combine(i);
        } else {
            idct_save(i);
        }
    }

    /* i == NBLKS here, so the first argument is the last block index.
       NOTE(review): mc_calc_dma() is skipped for intra macroblocks; this loop
       ends only if mc_dma_done_flag still becomes 1 in that case -- confirm. */
    while (mc_dma_done_flag != 1) {
        mc_check_dma(i - 1, cbp);
    }

    recon_save();
}
/*
MC can be done in many different ways. Two key characteristics
to try to optimize are: (1) minimize DMEM use, and (2) minimize
time (typically DMA xfers, VReg I/O)
To minimize DMA xfers, load horizontally adjacent tiles in 1 DMA
(but this transfers more than the necessary amount of data). Also,
UV MB data could be interleaved with Y MB data. This gets 2 DMAs
per reference direction instead of 8. Actually, UV doesn't need
to be interleaved, it could just follow Y. But then, ALL of the
UV tiles will be fetched, instead of just the necessary subsets.
To Minimize DMEM use, DMA should be scheduled to fit into recently
vacated areas of DMEM. For example, after For_Luma_mc is done, 18*16
bytes are freed up, so load the Bak_Luma_ref_tiles here.
To minimize DMEM-Vregs I/O, do For_mc and Bak_mc 4-lines at a time
alternatively, average, then write out to DMEM. After MC is done,
IDCT results (still in Vregs) can be added to reference.
*/
/*
 * Placeholder: the body is empty.
 * proc_mb() calls this with no arguments, once per non-intra macroblock,
 * before its per-block loop.
 * Presumably it will work out and start the reference-tile DMAs -- TODO
 * confirm once implemented.
 */
mc_calc_dma()
{
}
/*
 * Placeholder: the body is empty and no parameters are declared yet.
 * proc_mb() calls this as mc_check_dma(i-1, cbp) and afterwards tests
 * mc_dma_done_flag.
 * Presumably it will poll the DMA state and set that flag -- TODO confirm
 * once implemented.
 */
mc_check_dma()
{
}
/*
 * Placeholder: the body is empty and no parameters are declared yet.
 * proc_mb() calls this as mc_combine(i) right after idct(i) when
 * mc_dma_done_flag is non-zero.
 * Presumably it will add the IDCT result for block i to the motion-
 * compensated reference -- TODO confirm once implemented.
 */
mc_combine()
{
}