Changeset 3142 in CLRX


Ignore:
Timestamp:
Jun 5, 2017, 2:52:35 PM (2 years ago)
Author:
matszpk
Message:

CLRadeonExtender: CLRXDocs: Small fixes in VOP_SDWA. Update and fix VOP_DPP encoding.
Add examples to VOP_DPP encoding and operation code.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • CLRadeonExtender/trunk/doc/GcnSdwaDpp.md

    r3138 r3142  
    5555selected whole dword (SDWA_DWORD not chosen).
    5656
    57 Examples:
     57Examples: 
    5858```
    5959v_xor_b32 v1,v2,v3 dst_sel:byte_1 src0_sel:byte1 src1_sel:word1
     
    6969// SRC0_DST = dest. SRC0, SRC1_DST = dest. SRC1, DST_DST = VDST dest.
    7070// OPERATION(SRC0, SRC1) - instruction operation, VDST - VDST register before instruction
    71 if (HAVE_SRC0)
    72 {
    73     switch(SRC0_SEL)
    74     {
    75         case SDWA_BYTE_0:
    76             SRC0_DST = (SRC0_SEXT) ? INT32(INT8(SRC0_SRC & 0xff)) : SRC0_SRC & 0xff
    77             break;
    78         case SDWA_BYTE_1:
    79             SRC0_DST = (SRC0_SEXT) ? INT32(INT8((SRC0_SRC>>8) & 0xff)) :
    80                         (SRC0_SRC>>8) & 0xff
    81             break;
    82         case SDWA_BYTE_2:
    83             SRC0_DST = (SRC0_SEXT) ? INT32(INT8((SRC0_SRC>>16) & 0xff)) :
    84                         (SRC0_SRC>>16) & 0xff
    85             break;
    86         case SDWA_BYTE_3:
    87             SRC0_DST = (SRC0_SEXT) ? INT32(INT8(SRC0_SRC>>24)) : SRC0_SRC>>24
    88             break;
    89         case SDWA_WORD_0:
    90             SRC0_DST = (SRC0_SEXT) ? INT32(INT16(SRC0_SRC & 0xffff)) : SRC0_SRC & 0xffff
    91             break;
    92         case SDWA_WORD_1:
    93             SRC0_DST = (SRC0_SEXT) ? INT32(INT16(SRC0_SRC >> 16)) : SRC0_SRC >> 16
    94             break;
    95         case SDWA_DWORD:
    96             SRC0_DST = SRC0_SRC
    97             break;
    98     }
     71switch(SRC0_SEL)
     72{
     73    case SDWA_BYTE_0:
     74        SRC0_DST = (SRC0_SEXT) ? INT32(INT8(SRC0_SRC & 0xff)) : SRC0_SRC & 0xff
     75        break;
     76    case SDWA_BYTE_1:
     77        SRC0_DST = (SRC0_SEXT) ? INT32(INT8((SRC0_SRC>>8) & 0xff)) :
     78                    (SRC0_SRC>>8) & 0xff
     79        break;
     80    case SDWA_BYTE_2:
     81        SRC0_DST = (SRC0_SEXT) ? INT32(INT8((SRC0_SRC>>16) & 0xff)) :
     82                    (SRC0_SRC>>16) & 0xff
     83        break;
     84    case SDWA_BYTE_3:
     85        SRC0_DST = (SRC0_SEXT) ? INT32(INT8(SRC0_SRC>>24)) : SRC0_SRC>>24
     86        break;
     87    case SDWA_WORD_0:
     88        SRC0_DST = (SRC0_SEXT) ? INT32(INT16(SRC0_SRC & 0xffff)) : SRC0_SRC & 0xffff
     89        break;
     90    case SDWA_WORD_1:
     91        SRC0_DST = (SRC0_SEXT) ? INT32(INT16(SRC0_SRC >> 16)) : SRC0_SRC >> 16
     92        break;
     93    case SDWA_DWORD:
     94        SRC0_DST = SRC0_SRC
     95        break;
    9996}
    10097if (HAVE_SRC1)
     
    127124    }
    128125}
    129 DST_SRC = OPERATION(SRC0,SRC1)
     126DST_SRC = OPERATION(SRC0_DST,SRC1_DST)
    130127UINT32 tmp
    131128switch(DST_SEL)
     
    193190### VOP_DPP
    194191
    195 The VOP_DPP encoding is enabled by setting 0xfa in VSRC0 field in VOP1/VOP2/VOPC encoding.
     192The VOP_DPP encoding is enabled by setting 0xfa in SRC0 field in VOP1/VOP2/VOPC encoding.
    196193List of fields:
    197194
     
    20820528-31 | ROW_MASK   | Row enable mask
    209206
    210 The operation on wavefronts applied to VSRC0 operand in VOP instruction.
     207The operation on wavefronts is applied to the SRC0 operand in the VOP instruction.
    211208The wavefront contains 4 rows (16 threads), and each row contains 4 banks (4 threads).
    212 The DPP_CTRL choose which operation will be applied to VSRC0.
     209The DPP_CTRL field chooses which operation will be applied to SRC0.
    213210List of data parallel operations:
    214211
     
    2262230x141        | DPP_ROW_HALF_MIRROR  | row_half_mirror | Mirror threads within half row
    2272240x142        | DPP_ROW_BCAST15      | row_bcast:15 | Broadcast 15 thread of each row to next row
    228 0x143        | DPP_ROW_BCAST15      | row_bcast:15 | Broadcast 31 thread to row 2 and row 3
     2250x143        | DPP_ROW_BCAST31      | row_bcast:31 | Broadcast 31 thread to row 2 and row 3
    229226
    230227The BOUND_CTRL flag (modifier `bound_ctrl` or `bound_ctrl:0`) controls how to fill invalid
    231228threads (for example, the last threads after left shifting). Zero value (no modifier)
    232 sets invalid threads by original VSRC0 value for particular thread. One value (with modifier)
    233 fills invalid threads by 0 thread VSRC0 value.
     229does not perform the operation in threads whose source threads are invalid.
     230One value (with modifier) fills invalid threads with the value 0.
    234231
    235232The field BANK_MASK (modifier `bank_mask:value`) chooses which banks will be enabled during
    236233data parallel operation in each enabled row. The Nth bit represents Nth bank in each row.
    237 Disabled bank will be filled by original VSRC0 value for particular thread
     234Threads in disabled banks do not perform operation.
    238235
    239236The field ROW_MASK (modifier `row_mask:value`) chooses which rows will be enabled during
    240237data parallel operation. The Nth bit represents Nth row.
    241 Disabled row will be filled by original VSRC0 value for particular thread.
    242 
     238Threads in disabled rows do not perform operation.
     239
     240Examples: 
     241```
     242v_xor_b32 v1,v2,v3 quad_perm:[2,3,0,1]
     243v_xor_b32 v1,v2,v3 row_shl:5
     244v_xor_b32 v1,v2,v3 row_shr:7
     245v_xor_b32 v1,v2,v3 row_ror:8
     246v_xor_b32 v1,v2,v3 wave_shl:1
     247v_xor_b32 v1,v2,v3 wave_shl
     248v_xor_b32 v1,v2,v3 wave_shr:1
     249v_xor_b32 v1,v2,v3 wave_shr
     250v_xor_b32 v1,v2,v3 wave_rol:1
     251v_xor_b32 v1,v2,v3 wave_rol
     252v_xor_b32 v1,v2,v3 wave_ror:1
     253v_xor_b32 v1,v2,v3 wave_ror
     254v_xor_b32 v1,v2,v3 row_mirror
     255v_xor_b32 v1,v2,v3 row_half_mirror
     256v_xor_b32 v1,v2,v3 row_bcast:15
     257v_xor_b32 v1,v2,v3 row_bcast:31
     258v_xor_b32 v1,v2,v3 row_shr:7 bound_ctrl
     259v_xor_b32 v1,v2,v3 row_shr:7 bound_ctrl:0
     260v_xor_b32 v1,v2,v3 row_shl:5 row_mask:0b1100
     261v_xor_b32 v1,v2,v3 row_shl:5 bank_mask:0b0101
     262```
     263
     264Operation code: 
     265```
     266// SRC0_SRC[X] - original VSRC0 value from thread X
     267// SRC0_DST[X] - destination VSRC0 value from thread X
     268// OPERATION(SRC0, SRC1) - instruction operation, VDST - VDST register before instruction
     269BYTE invalid = 0
     270BYTE srcLane
     271if (DPP_CTRL>=DPP_QUAD_PERM00 && DPP_CTRL<=DPP_QUAD_PERMFF)
     272{
     273    BYTE p0 = DPP_CTRL&3
     274    BYTE p1 = (DPP_CTRL>>2)&3
     275    BYTE p2 = (DPP_CTRL>>4)&3
     276    BYTE p3 = (DPP_CTRL>>6)&3
     277    BYTE curL4 = LANEID&~3
     278    if ((LANEID&3)==0)
     279        srcLane = curL4 + p0
     280    else if ((LANEID&3)==1)
     281        srcLane = curL4 + p1
     282    else if ((LANEID&3)==2)
     283        srcLane = curL4 + p2
     284    else if ((LANEID&3)==3)
     285        srcLane = curL4 + p3   
     286}
     287else if (DPP_CTRL>=DPP_ROW_SL1 && DPP_CTRL<=DPP_ROW_SL15)
     288{
     289    BYTE shift = DPP_CTRL&15
     290    BYTE slid = LANEID&15
     291    BYTE curR = LANEID&~15
     292    if (slid+shift<=15)
     293        srcLane = curR + slid + shift
     294    else
     295        srcLane = LANESNUM
     296}
     297else if (DPP_CTRL>=DPP_ROW_SR1 && DPP_CTRL<=DPP_ROW_SR15)
     298{
     299    BYTE shift = DPP_CTRL&15
     300    BYTE slid = LANEID&15
     301    BYTE curR = LANEID&~15
     302    if (slid>=shift)
     303        srcLane = curR + slid - shift
     304    else
     305        srcLane = LANESNUM
     306}
     307else if (DPP_CTRL>=DPP_ROW_RR1 && DPP_CTRL<=DPP_ROW_RR15)
     308{
     309    BYTE shift = DPP_CTRL&15
     310    BYTE slid = LANEID&15
     311    BYTE curR = LANEID&~15
     312    srcLane = curR + ((16+slid - shift)&15)
     313}
     314else if (DPP_CTRL==DPP_WF_SL1)
     315    srcLane = LANEID+1
     316else if (DPP_CTRL==DPP_WF_SR1)
     317    srcLane = LANEID-1
     318else if (DPP_CTRL==DPP_WF_RL1)
     319    srcLane = (LANEID+1)&63
     320else if (DPP_CTRL==DPP_WF_RR1)
     321    srcLane = (LANEID-1)&63
     322else if (DPP_CTRL==DPP_ROW_MIRROR)
     323{
     324    BYTE curR = LANEID&~15
     325    srcLane = curR + ((LANEID&15)^15)
     326}
     327else if (DPP_CTRL==DPP_ROW_HALF_MIRROR)
     328{
     329    BYTE curR = LANEID&~7
     330    srcLane = curR + ((LANEID&7)^7)
     331}
     332else if (DPP_CTRL==DPP_ROW_BCAST15)
     333{
     334    BYTE curR = LANEID&~15
     335    if (LANEID<16)
     336        srcLane = LANEID
     337    else
     338        srcLane = ((LANEID-16)&~15)+15
     339}
     340else if (DPP_CTRL==DPP_ROW_BCAST31)
     341{
     342    BYTE curR = LANEID&~31
     343    if (LANEID<32)
     344        srcLane = LANEID
     345    else
     346        srcLane = ((LANEID-32)&~31)+31
     347}
     348if (srcLane < LANESNUM)
     349    SRC0_DST[LANEID] = SRC0_SRC[srcLane]
     350else if (BOUND_CTRL==0)
     351    SRC0_DST[LANEID] = 0
     352else
     353    invalid = 1
     354if ((ROW_MASK & (1U<<(LANEID>>4)))==0)
     355    invalid = 1
     356if ((BANK_MASK & (1U<<((LANEID>>2)&3)))==0)
     357    invalid = 1
     358if (!invalid)
     359    VDST = OPERATION(SRC0_DST,SRC1)
     360```
Note: See TracChangeset for help on using the changeset viewer.