Code/Resource
Windows Develop
Linux-Unix program
Internet-Socket-Network
Web Server
Browser Client
Ftp Server
Ftp Client
Browser Plugins
Proxy Server
Email Server
Email Client
WEB Mail
Firewall-Security
Telnet Server
Telnet Client
ICQ-IM-Chat
Search Engine
Sniffer Package capture
Remote Control
xml-soap-webservice
P2P
WEB(ASP,PHP,...)
TCP/IP Stack
SNMP
Grid Computing
SilverLight
DNS
Cluster Service
Network Security
Communication-Mobile
Game Program
Editor
Multimedia program
Graph program
Compiler program
Compress-Decompress algorithms
Crypt_Decrypt algorithms
Mathematics-Numerical algorithms
MultiLanguage
Disk/Storage
Java Develop
assembly language
Applications
Other systems
Database system
Embedded-SCM Develop
FlashMX/Flex
source in ebook
Delphi VCL
OS Develop
MiddleWare
MPI
MacOS develop
LabView
ELanguage
Software/Tools
E-Books
Article/Document
miint.cpp
Package: shell.rar [view]
Upload User: xhy777
Upload Date: 2007-02-14
Package Size: 24088k
Code Size: 16k
Category:
Windows Kernel
Development Platform:
Visual C++
- /***************************************************************************
- *
- * INTEL Corporation Proprietary Information
- *
- *
- * Copyright (c) 1996 Intel Corporation.
- * All rights reserved.
- *
- ***************************************************************************
- AUTHOR: Kumar Balasubramanian
- ***************************************************************************
- ** MMX version of the "integer LLM mode" within IJG decompressor code.
- ** The following is an MMX implementation of the integer slow mode
- ** IDCT within the IJG code.
- */
- #define JPEG_INTERNALS
- #include "jinclude.h"
- #include "jpeglib.h"
- #include "jdct.h" /* Private declarations for DCT subsystem */
- #ifdef DCT_ISLOW_SUPPORTED
- /*
- * This module is specialized to the case DCTSIZE = 8.
- */
- #if DCTSIZE != 8
- Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
- #endif
- /*
- * CONST_BITS / PASS1_BITS describe the fixed-point scaling that the MMX code
- * below hard-codes: multipliers are scaled by 2^13, the column pass descales
- * by CONST_BITS-PASS1_BITS = 11 bits and the row pass descales by
- * CONST_BITS+PASS1_BITS+3 = 18 bits. The assembly never reads these macros;
- * it also masks table indices to 10 bits and stores one byte per output
- * sample, so only the 8-bit sample configuration can produce correct
- * output. Any other setting is rejected at compile time rather than being
- * allowed to build and silently compute wrong results.
- */
- #if BITS_IN_JSAMPLE == 8
- #define CONST_BITS 13
- #define PASS1_BITS 2
- #else
- Sorry, this MMX code only copes with 8-bit samples. /* deliberate syntax err */
- #endif
- /* Define the constants for the case BITS_IN_JSAMPLE = 8.
- *
- * The multipliers are 16-bit fixed-point values scaled by 2^13 (for example
- * 0x098E = 2446 ~= 0.2986 * 8192). Every constant is read as a 64-bit memory
- * operand by the MMX code in midct8x8llm, so the position of each 16-bit word
- * inside the 64-bit literal is significant: pmaddwd multiplies word-by-word
- * and sums each pair of products into one 32-bit result.
- */
- static const __int64 const_0_2986 = 0x0000098E0000098E ; // 2446 (~0.2986 * 8192) in the low word of each 32-bit half
- static const __int64 const_0_3901 = 0x00000c7c00000c7c; // 3196 (~0.3901 * 8192) in the low word of each 32-bit half
- static const __int64 const_0_54119 = 0x0000115100001151; // 4433 (~0.54119 * 8192) in the low word of each 32-bit half
- static const __int64 const_0_7653 = 0x0000187E0000187E; // 6270 (~0.7653 * 8192) in the low word of each 32-bit half
- static const __int64 const_0_899 = 0x00001ccd00001ccd; // 7373 (~0.899 * 8192) in the low word of each 32-bit half
- static const __int64 const_1_175 = 0x000025a1000025a1; // 9633 (~1.175 * 8192) in the low word of each 32-bit half
- static const __int64 const_1_501 = 0x0000300b0000300b; // 12299 (~1.501 * 8192) in the low word of each 32-bit half
- static const __int64 const_1_8477 = 0x00003b2100003b21; // 15137 (~1.8477 * 8192) in the low word of each 32-bit half
- static const __int64 const_1_961 = 0x00003ec500003ec5 ; // 16069 (~1.961 * 8192) in the low word of each 32-bit half
- static const __int64 const_2_053 = 0x000041b3000041b3 ; // 16819 (~2.053 * 8192) in the low word of each 32-bit half
- static const __int64 const_2_562 = 0x0000520300005203 ; // 20995 (~2.562 * 8192) in the low word of each 32-bit half
- static const __int64 const_3_072 = 0x0000625400006254 ; // 25172 (~3.072 * 8192) in the low word of each 32-bit half
- static const __int64 const_all_ones = 0x0ffffffffffffffff; // all 64 bits set; XORed with a value to form its ones' complement
- static const __int64 const_0_1_0_1 = 0x0000000100000001 ; // +1 in each 32-bit half; added after the XOR to complete a two's-complement negation
- static const __int64 const_zero = 0x0000000000000000; // not referenced by the code visible in this file
- static const __int64 const_1_0 = 0x0000000100000001 ; // same bit pattern as const_0_1_0_1; used as a pmaddwd multiplier (1,0,1,0) to sign-extend words 0 and 2 to 32 bits
- static const __int64 const_round = 0x0000040000000400; // 1 << 10 in each 32-bit half: rounding term for the 11-bit right shift of the column pass
- static const __int64 const_round_two = 0x0002000000020000; // 1 << 17 in each 32-bit half: rounding term for the 18-bit right shift of the row pass
- static const __int64 const_mask = 0x000003ff000003ff; // keeps the low 10 bits of each 32-bit half (index into range_limit)
- static const __int64 const_00_1_84_00_0_765 = 0x00003b210000187E; // row pass: word 0 = 6270 (~0.7653), word 2 = 15137 (~1.8477), words 1 and 3 = 0
- static const __int64 const_00_0_5411_00_00 = 0x0000115100000000; // row pass: word 2 = 4433 (~0.5411), all other words 0
- static const __int64 const_3_072_00_1_501_00 = 0x62540000300b0000; // row pass: word 1 = 12299 (~1.501), word 3 = 25172 (~3.072), words 0 and 2 = 0
- static const __int64 const_0_2986_00_2_053_00 = 0x098E000041b30000; // row pass: word 1 = 16819 (~2.053), word 3 = 2446 (~0.2986), words 0 and 2 = 0
- static const __int64 const_0_899_00_2_562_00 = 0x1ccd000052030000; // row pass: word 1 = 20995 (~2.562), word 3 = 7373 (~0.899), words 0 and 2 = 0
- static const __int64 const_1_96_00_0_3901_00 = 0x3ec500000c7c0000; // row pass: word 1 = 3196 (~0.3901), word 3 = 16069 (~1.961), words 0 and 2 = 0
- static const __int64 const_1_175_00_00_00 = 0x25a1000000000000; // row pass: word 3 = 9633 (~1.175), all other words 0
- /*
- * Perform dequantization and inverse DCT on one block of coefficients.
- *
- * Parameters (as used by the assembly below):
- *   inptr       - coefficient block, read as 8 rows of eight 16-bit words
- *                 (row stride 16 bytes).
- *   quantptr    - multiplier table with the same layout; every coefficient is
- *                 multiplied by the matching entry with pmullw (16-bit result).
- *   wsptr       - caller-supplied scratch area of 8 x 8 16-bit words; receives
- *                 the output of the column pass and is the input of the row pass.
- *   output_buf  - array of row pointers, stepped through 4 bytes at a time
- *                 (i.e. the code assumes 4-byte pointers).
- *   output_col  - byte offset added to each row pointer before the store.
- *   range_limit - byte lookup table, indexed with the low 10 bits of each
- *                 descaled row-pass result.
- *
- * Pass 1 (label idct_column) runs twice and handles four columns per
- * iteration: it dequantizes, computes the even and odd parts in 32-bit
- * precision, adds const_round, shifts right by 11 and stores the low 16 bits
- * of each result into wsptr.
- * Pass 2 (label idct_row) runs eight times, once per row of wsptr: it adds
- * const_round_two, shifts right by 18, masks with const_mask, looks each
- * value up in range_limit and writes 8 bytes to the output row.
- *
- * The block uses eax, ebx, ecx, edx, esi, edi and mm0-mm7 and ends with emms.
- * The nop instructions are presumably scheduling padding for the target
- * CPU's MMX pipelines -- TODO confirm before removing any of them.
- */
- GLOBAL(void)
- midct8x8llm (JCOEFPTR inptr, short *quantptr, short *wsptr,
- JSAMPARRAY output_buf, JDIMENSION output_col, JSAMPLE *range_limit )
- {
- INT32 locdwinptr, locdwqptr, locdwwsptr, locdwcounter, locdwrowctr ; // saved pointers and loop counters, kept in memory because esi, edi and eax are reused
- __int64 locqwtmp0e,locqwtmp0o, locqwtmp1e, locqwtmp1o, locqwtmp2e ; // 64-bit spill slots for MMX values; per the store comments below, suffix "e" = columns 0 & 2 and "o" = columns 1 & 3
- __int64 locqwtmp10e , locqwtmp10o ,locqwtmp11e ,
- locqwtmp11o , locqwtmp12e , locqwtmp12o ,
- locqwtmp13e , locqwtmp13o ,locqwtmp0 ,
- locqwtmp1 ,locqwtmp2 ,locqwtmp3 ,
- locqwz5e ,locqwz5o ,locqwz1e ,locqwz1o ,
- locqwz13e ,locqwz13o ,locqwz14e ,
- locqwz14o ,locqwz23e ,locqwz23o ,
- locqwz24e ,locqwz24o ;
- // Inline assembly to do the IDCT and store the result
- __asm {
- mov esi, inptr ; load the input pointer
- mov edi, quantptr ; load the quant table pointer
- mov locdwinptr, esi ; to be used in the idct_column loop
- mov locdwqptr, edi ; to be used in the idct_column loop
- mov esi, wsptr
- mov locdwcounter, 2 ; idct_column loop counter
- mov locdwwsptr, esi
- ;; do the idct on all the columns. Do four columns per
- ;; iteration of the loop.
- idct_column:
- mov esi, locdwinptr ; get the source pointer
- mov edi, locdwqptr ; get the quantzn. pointer
- ;; fetch C2 and Q2
- movq mm0, [esi+16*2] ; get C2
- movq mm1, [edi+16*2] ; get Q2
- movq mm2, [esi+16*6] ; get C6
- pmullw mm0, mm1 ; dequantized C2 = z2
- movq mm3, [edi+16*6] ; get Q6
- movq mm6, const_0_7653
- pmullw mm2, mm3 ; dequant. C6 = z3
- movq mm7, const_1_8477
- movq mm4, mm0 ; copy z2
- pmaddwd mm4, mm6 ; tmp3 - z1 for columns 0 & 2
- movq mm5, mm0 ; copy z2
- movq mm3, mm2 ; z3 copy
- psrlq mm5, 16 ; move z2 columns 1 & 3 to 0 & 2
- movq mm1, const_0_54119
- pmaddwd mm5, mm6 ; tmp3 - z1 for columns 1 & 3
- psrlq mm3, 16 ; move z3 columns 1 & 3 to 0 & 2
- paddw mm0, mm2 ; z2 + z3
- pmaddwd mm2, mm7 ; tmp2 - z1 for columns 0 & 2
- movq mm6, mm0 ; z2 + z3 copy
- psrlq mm6, 16 ; z2 + z3 columns 1 & 3 in 0 & 2
- pmaddwd mm3, mm7 ; tmp2 - z1 for columns 1 & 3
- movq mm7, const_all_ones
- pmaddwd mm0, mm1 ; z1 columns 0 & 2
- pmaddwd mm6, mm1 ; z1 columns 1 & 3
- pxor mm2, mm7 ; 1s complement of tmp2 - z1
- movq mm1, const_0_1_0_1
- pxor mm3, mm7 ; 1s complement of tmp2 - z1
- paddd mm2, mm1 ; 2s complement of tmp2 - z1(col 0 &2)
- paddd mm3, mm1 ; 2s complement of tmp2 - z1(col 1 & 3)
- paddd mm2, mm0 ; tmp2 (columns 0 & 2)
- paddd mm4, mm0 ; tmp2 (cols. 1 & 3) // NOTE: the comment to the left is wrong -- mm4 held (tmp3 - z1) for columns 0 & 2 and mm0 is z1 for columns 0 & 2, so this is tmp3 (cols 0 & 2)
- ;; get C0 and Q0
- movq mm0, [esi+16*0] ; get C0
- paddd mm3, mm6 ; tmp3 // NOTE: mm3 held the negated z3 product for columns 1 & 3, so this is tmp2 (cols 1 & 3), not tmp3
- movq mm1, [edi+16*0] ; getQ0
- paddd mm5, mm6 ; tmp3 // tmp3 for columns 1 & 3
- movq mm6, [esi+16*4] ; get C4
- pmullw mm0, mm1 ; dequant C0 = z2
- movq mm7, [edi+16*4] ; get Q4
- nop
- movq locqwtmp2e, mm2 ; store tmp2 even part
- pmullw mm6, mm7 ; dequant C4 = z3
- movq mm7, const_1_0
- movq mm1, mm0 ; copy of z2
- paddw mm0, mm6 ; z2+z3
- nop
- psubw mm1, mm6 ; z2-z3
- movq mm6, mm0 ; z2+z3 copy
- pmaddwd mm0, mm7 ; get 0 & 2 cols // multiplying by (1,0,1,0) sign-extends words 0 and 2 to 32 bits
- psrlq mm6, 16 ; get the other two cols.
- pmaddwd mm6, mm7 ;
- movq mm2, mm1 ; copy of z2-z3
- pmaddwd mm1, mm7
- psrlq mm2, 16
- pmaddwd mm2, mm7
- pslld mm0, 13 ; tmp0 cols 0&2
- movq mm7, mm4
- pslld mm6, 13 ; tmp0 cols 1 & 3
- paddd mm4, mm0 ; // tmp10 = tmp0 + tmp3 (cols 0 & 2)
- psubd mm0, mm7 ; // tmp13 = tmp0 - tmp3 (cols 0 & 2)
- movq mm7, mm5
- pslld mm2, 13 // tmp1 = (z2 - z3) << 13 (cols 1 & 3)
- movq locqwtmp13e, mm0 ; store tmp13 cols 0&2
- paddd mm5, mm6 // tmp10 = tmp0 + tmp3 (cols 1 & 3)
- movq mm0, locqwtmp2e
- psubd mm6, mm7 // tmp13 = tmp0 - tmp3 (cols 1 & 3)
- movq locqwtmp10o, mm5 ; store tmp10 cols 1&3
- movq mm7, mm3
- movq locqwtmp13o, mm6 ; store tmp13 cols 1&3
- paddd mm3, mm2 // tmp11 = tmp1 + tmp2 (cols 1 & 3)
- movq locqwtmp10e, mm4 ; store tmp10 cols 0&2
- pslld mm1, 13 // tmp1 = (z2 - z3) << 13 (cols 0 & 2)
- movq locqwtmp11o, mm3 ; store tmp11 cols 1,3
- psubd mm2, mm7 // tmp12 = tmp1 - tmp2 (cols 1 & 3)
- movq mm6, [esi+16*1] // get C1
- movq mm3, mm0
- movq locqwtmp12o, mm2 ; store tmp12 cols. 1,3
- paddd mm0, mm1 // tmp11 = tmp1 + tmp2 (cols 0 & 2)
- movq mm7, [edi+16*1] // get Q1
- movq locqwtmp11e, mm0 ; store tmp11 cols. 0,2
- psubd mm1, mm3 // tmp12 = tmp1 - tmp2 (cols 0 & 2)
- movq mm0, [esi+16*7] // get C7
- pmullw mm6, mm7 ; dequant. C1 = tmp3
- movq locqwtmp12e, mm1
- ;; completed the even part.
- ;; Now start the odd part
- movq mm1, [edi+16*7] ; get C7 // NOTE: this load is from the quantization table (edi), i.e. Q7, not C7
- movq mm2, [esi+16*5] ; get C5
- pmullw mm0, mm1 ; dequant. C7 = tmp0
- movq mm3, [edi+16*5] // get Q5
- movq mm4, [esi+16*3] // get C3
- pmullw mm2, mm3 ; dequant. C5 = tmp1
- movq mm5, [edi+16*3] // get Q3
- movq mm1, mm0
- movq locqwtmp3, mm6 // save tmp3 (dequantized C1) for the final multiply
- pmullw mm4, mm5 ; dequant. C3 = tmp2
- movq locqwtmp0, mm0 // save tmp0 (dequantized C7)
- paddw mm0, mm6 ; z1
- movq locqwtmp1, mm2 // save tmp1 (dequantized C5)
- movq mm3, mm2
- movq locqwtmp2, mm4 // save tmp2 (dequantized C3)
- paddw mm2, mm4 ; z2
- paddw mm1, mm4 ; z3
- movq mm4, const_1_175
- paddw mm3, mm6 ; z4
- movq mm5, mm1
- movq mm7, mm0
- psrlq mm7, 16 ; other two cols. of z1
- paddw mm5, mm3 ; z3 + z4
- movq mm6, mm5
- pmaddwd mm5, mm4 ; z5 cols 0 & 2
- pmaddwd mm0, const_0_899 ; z1 even part
- psrlq mm6, 16
- pmaddwd mm6, mm4 ; z5 cols 1 & 3
- movq mm4, mm2 ; z2 copy
- movq locqwz5e, mm5
- psrlq mm4, 16 ; get z2 cols 1 & 3
- pxor mm0, const_all_ones // start negating the z1 product (cols 0 & 2); the +1 follows below
- movq mm5, mm1
- movq locqwz5o, mm6
- psrlq mm5, 16
- movq mm6, const_2_562
- nop
- paddd mm0, const_0_1_0_1
- pmaddwd mm2, mm6 ; z2 cols 0 & 2
- movq locqwz1e, mm0
- pmaddwd mm4, mm6 ; z2 cols 1 & 3
- pmaddwd mm7, const_0_899 ; z1
- movq mm0, mm3
- movq mm6, const_1_961
- psrlq mm0, 16
- pxor mm2, const_all_ones
- pmaddwd mm1, mm6 ; z3 cols 0 & 2
- paddd mm2, const_0_1_0_1
- pmaddwd mm5, mm6 ; z3 cols 1 & 3
- movq mm6, const_0_3901
- nop
- pxor mm4, const_all_ones
- pmaddwd mm3, mm6 ; z4 cols 0 & 2
- paddd mm4, const_0_1_0_1
- pmaddwd mm0, mm6 ; z4 cols 1 & 3
- movq mm6, const_all_ones
- nop
- pxor mm1, mm6
- pxor mm7, mm6
- ;; twos complement of z1, z2, z3, z4
- paddd mm1, const_0_1_0_1
- pxor mm5, mm6
- paddd mm7, const_0_1_0_1
- pxor mm3, mm6
- paddd mm5, const_0_1_0_1
- nop
- movq locqwz1o, mm7
- pxor mm0, mm6
- paddd mm1, locqwz5e ; z3+z5 cols 0 & 2
- nop
- movq mm6, locqwz1e
- nop
- paddd mm5, locqwz5o ; z3+z5 cols 1 & 3
- paddd mm6, mm1
- paddd mm3, const_0_1_0_1
- paddd mm1, mm2
- paddd mm0, const_0_1_0_1
- paddd mm7, mm5
- paddd mm3, locqwz5e ; z4+z5 cols 0 & 2
- paddd mm5, mm4
- paddd mm0, locqwz5o ; z4+z5 cols 0 & 2 // NOTE: mm0 holds the z4 product for columns 1 & 3 and locqwz5o is z5 for columns 1 & 3, so this is z4+z5 cols 1 & 3
- paddd mm2, mm3
- paddd mm3, locqwz1e
- paddd mm4, mm0
- paddd mm0, locqwz1o
- movq locqwz23e, mm1
- nop
- movq locqwz14o, mm0
- nop
- movq mm0, locqwtmp0
- nop
- movq locqwz24e, mm2
- movq mm1, mm0
- movq mm2, const_0_2986
- psrlq mm1, 16
- movq locqwz14e, mm3
- pmaddwd mm0, mm2 ; tmp0 even
- movq mm3, locqwtmp1
- pmaddwd mm1, mm2 ; tmp0 odd
- movq locqwz24o, mm4
- movq mm2, mm3
- movq mm4, const_2_053
- psrlq mm2, 16
- movq locqwz23o, mm5
- pmaddwd mm3, mm4 ; tmp1 even
- movq mm5, locqwtmp2
- pmaddwd mm2, mm4 ; tmp1 odd
- movq locqwz13e, mm6
- movq mm4, mm5
- movq mm6, const_3_072
- psrlq mm4, 16
- movq locqwz13o, mm7
- pmaddwd mm5, mm6 ; tmp2 even
- ;;;;;;; now calculate tmp0..tmp3
- ;; then calculate the pre-descaled values
- ;; this includes the right shift with rounding
- movq mm7, locqwtmp3
- pmaddwd mm4, mm6 ; tmp2 odd
- paddd mm0, locqwz13e
- movq mm6, mm7
- paddd mm1, locqwz13o
- psrlq mm6, 16
- movq locqwtmp0e, mm0 ; tmp0 even
- nop
- movq mm0, const_1_501
- nop
- movq locqwtmp0o, mm1
- pmaddwd mm7, mm0
- paddd mm3, locqwz24e
- pmaddwd mm6, mm0
- movq mm0, locqwtmp10e
- nop
- paddd mm7, locqwz14e
- nop
- paddd mm6, locqwz14o
- psubd mm0, mm7
- movq mm1, locqwtmp10o
- nop
- movq locqwtmp1e, mm3
- psubd mm1, mm6
- movq mm3, const_round // mm3 keeps the rounding constant for the rest of this iteration
- nop
- paddd mm2, locqwz24o
- paddd mm0, mm3
- paddd mm7, locqwtmp10e
- psrad mm0, 11 // descale by 11 bits (rounding constant was added on the previous paddd)
- movq locqwtmp1o, mm2
- paddd mm1, mm3
- paddd mm6, locqwtmp10o
- psrad mm1, 11
- paddd mm5, locqwz23e
- movq mm2, mm0
- paddd mm4, locqwz23o
- punpcklwd mm0, mm1 // interleave the words of the column 0 result (mm0) and the column 1 result (mm1)
- paddd mm6, mm3
- punpckhwd mm2, mm1 // same for columns 2 and 3
- paddd mm7, mm3
- punpckldq mm0, mm2 // mm0 = low 16 bits of the four results, in column order 0..3
- ;; now do all the stores of the 1D-iDCT of the four columns
- mov edi, locdwwsptr ; get pointer to scratch pad array
- movq [edi+16*7], mm0 ; store wsptr[7]
- psrad mm6, 11
- movq mm2, locqwtmp11e
- psrad mm7, 11
- psubd mm2, mm5
- movq mm0, mm7
- movq mm1, locqwtmp11o
- punpcklwd mm7, mm6
- psubd mm1, mm4
- punpckhwd mm0, mm6
- paddd mm5, locqwtmp11e
- punpckldq mm7, mm0
- paddd mm4, locqwtmp11o
- paddd mm2, mm3
- paddd mm1, mm3
- paddd mm5, mm3
- paddd mm4, mm3
- psrad mm2, 11
- movq [edi+16*0], mm7 ; store wsptr[0]
- psrad mm1, 11
- movq mm0, mm2
- psrad mm5, 11
- movq mm6, locqwtmp12e
- punpcklwd mm2, mm1
- punpckhwd mm0, mm1
- movq mm1, mm5
- movq mm7, locqwtmp12o
- punpckldq mm2, mm0
- movq [edi+16*6], mm2 ; store wsptr[6]
- psrad mm4, 11
- movq mm2, mm6
- punpcklwd mm5, mm4
- paddd mm6, locqwtmp1e
- punpckhwd mm1, mm4
- psubd mm2, locqwtmp1e
- punpckldq mm5, mm1
- movq [edi+16*1], mm5 ; store wsptr[1]
- movq mm0, mm7
- paddd mm7, locqwtmp1o
- paddd mm6, mm3
- psubd mm0, locqwtmp1o
- paddd mm7, mm3
- paddd mm2, mm3
- psrad mm7, 11
- paddd mm0, mm3
- psrad mm6, 11
- movq mm1, mm6
- psrad mm2, 11
- movq mm4, locqwtmp13e
- punpcklwd mm6, mm7
- movq mm5, mm4
- punpckhwd mm1, mm7
- paddd mm4, locqwtmp0e
- punpckldq mm6, mm1
- psubd mm5, locqwtmp0e
- psrad mm0, 11
- movq [edi+16*2], mm6 ; store wsptr[2]
- movq mm6, mm2
- paddd mm4, mm3
- punpcklwd mm2, mm0
- paddd mm5, mm3
- punpckhwd mm6, mm0
- movq mm0, locqwtmp13o
- punpckldq mm2, mm6
- movq mm1, mm0
- psrad mm4, 11
- paddd mm0, locqwtmp0o
- psrad mm5, 11
- paddd mm0, mm3
- movq mm6, mm4
- psubd mm1, locqwtmp0o
- psrad mm0, 11
- paddd mm1, mm3
- punpcklwd mm4, mm0
- movq mm3, mm5
- punpckhwd mm6, mm0
- movq [edi+16*5], mm2 ; store wsptr[5]
- punpckldq mm4, mm6
- psrad mm1, 11
- movq [edi+16*3], mm4 ; store wsptr[3]
- punpcklwd mm5, mm1
- punpckhwd mm3, mm1
- punpckldq mm5, mm3
- add locdwinptr, 8 ; skip first four columns
- add locdwqptr, 8
- movq [edi+16*4], mm5 ; store wsptr[4]
- ;;;;;;; done with 1D-idct of four columns ;;;;;;;
- ;; now update pointers for next four columns
- add locdwwsptr, 8
- mov eax, locdwcounter
- dec eax
- mov locdwcounter, eax
- jnz idct_column
- ;;;;;;;end of 1D-idct on the columns ;;;;;;;
- mov esi, wsptr ; get start addr of temp array
- mov locdwcounter, 8 // row pass runs eight times, one row of wsptr per iteration
- mov locdwwsptr, esi
- mov locdwrowctr, 0 // byte offset into the output_buf row-pointer array
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- ;;;;;;; start of 1D-idct on the rows ;;;;;;;
- idct_row:
- mov esi, locdwwsptr ; get next row start addr of temp array
- mov edi, output_buf // base of the row-pointer array (indexed further below)
- movq mm0, [esi+0] ; get first 4 elements of row
- movq mm1, [esi+2*4] ; get next 4 elem. of row
- movq mm2, mm0
- movq mm3, mm0 ; copy of e3|e2|e1|e0
- paddw mm2, mm1 ; (e3+e7)|(e2+e6)|(e1+e5)|(e0+e4)
- movq mm4, mm2 ; copy of (e3+e7)|(e2+e6)|(e1+e5)|(e0+e4)
- punpckhdq mm3, mm1 ; e7|e6|e3|e2
- pmaddwd mm3, const_00_1_84_00_0_765 ; (tmp2 - z1)||(tmp3-z1)
- movq mm6, mm0 ; copy of e3|e2|e1|e0
- pmaddwd mm2, const_00_0_5411_00_00 ; z1||xxx
- psubw mm6, mm1 ; (e3-e7)|(e2-e6)|(e1-e5)|(e0-e4)
- punpckldq mm4, mm6 ; (e1-e5)|(e0-e4)|(e1+e5)|(e0+e4)
- movq mm6, mm0 ;
- movq mm5, mm3
- pslld mm4, 16 ; (e0-e4)|(e1+e5)||(e0+e4)|x0000
- pxor mm3, const_all_ones
- punpckhdq mm2, mm2 ; z1||z1
- paddd mm3, const_0_1_0_1
- psrad mm4, 3 ; (e0-e4)<<13||(e0+e4)<<13
- psrlq mm3, 32
- movq mm7, mm4 ; copy of tmp1||tmp0
- punpckldq mm5, mm3
- movq mm3, mm0 ; e3|e2|e1|e0
- paddd mm5, mm2 ; tmp2 || tmp3
- paddw mm3, mm1 ; (e7+e3)|(e2+e6)|(e1+e5)|(e0+e4)
- paddd mm4, mm5
- psubd mm7, mm5
- ;; end of even part calculation ;;
- ;; mm0 => e3|e2|e1|e0
- ;; mm1 => e7|e6|e5|e4
- ;; mm4 => tmp11||tmp10
- ;; mm7 => tmp12||tmp13
- movq mm5, mm3
- movq mm2, mm0
- pmaddwd mm0, const_3_072_00_1_501_00 ; tmp2|tmp3
- punpckldq mm5, mm5
- paddw mm5, mm3
- punpckldq mm2, mm2
- pmaddwd mm5, const_1_175_00_00_00 ; z5|0
- punpckhdq mm6, mm2
- pmaddwd mm3, const_1_96_00_0_3901_00 ; z3|z4
- paddw mm6, mm1
- pmaddwd mm6, const_0_899_00_2_562_00 ; z1|z2
- nop
- pmaddwd mm1, const_0_2986_00_2_053_00 ; tmp0|tmp1
- punpckhdq mm5, mm5
- movq mm2, const_0_1_0_1
- nop
- pxor mm3, const_all_ones
- nop
- pxor mm6, const_all_ones
- paddd mm3, mm2
- paddd mm6, mm2
- paddd mm3, mm5
- movq mm5, mm6
- paddd mm6, mm3
- movq mm2, mm5
- punpckldq mm5, mm5
- punpckhdq mm2, mm5
- paddd mm1, mm6
- paddd mm2, mm3
- movq mm5, mm1
- movq mm3, mm4
- paddd mm0, mm2
- movq mm2, mm7
- punpckldq mm5, mm5
- punpckhdq mm1, mm5
- psubd mm3, mm0
- movq mm5, const_round_two // rounding term for the 18-bit descale below
- paddd mm0, mm4
- movq mm6, const_mask // 10-bit mask applied to every result before the table lookup
- psubd mm2, mm1
- paddd mm0, mm5
- paddd mm1, mm7
- ;; descale the resulting coeff values
- paddd mm1, mm5
- psrad mm0, 18
- paddd mm3, mm5
- psrad mm1, 18
- paddd mm2, mm5
- psrad mm3, 18
- ;; mask the result with RANGE_MASK (least 10 bits)
- pand mm1, mm6 ; w2|w3
- psrad mm2, 18
- movd ebx, mm1 ; w3
- psrlq mm1, 32 ; 0|w2
- ;; using the results as index, get the corresponding
- ;; value from array range_limit and store the final result
- mov ecx, range_limit ; get start addr of range_limit array
- add edi, locdwrowctr // edi = address of this row's entry in output_buf
- movd edx, mm1 ; w2
- pand mm0, mm6 ; w1|w0
- mov ah, [ecx][ebx] ; w3
- mov edi, [edi] // load the output row pointer itself
- movd ebx, mm0 ; w0
- psrlq mm0, 32 ; 0|w1
- mov al, [ecx][edx] ; w2
- add locdwrowctr, 4 // advance to the next row pointer (4-byte pointers assumed)
- movd edx, mm0 ; w1
- pand mm3, mm6 ; w6|w7
- add edi, output_col ; this is the dest start addr for this row
- shl eax, 16 ; w3|w2|0|0
- mov al, [ecx][ebx] ; w0
- mov ah, [ecx][edx] ; w1
- movd mm4, eax ; w3|w2|w1|w0
- pand mm2, mm6 ; w5|w4
- movd ebx, mm3 ; w7
- psrlq mm3, 32 ; 0|w6
- movd edx, mm3 ; w6
- mov ah, [ecx][ebx] ; w7
- mov al, [ecx][edx] ; w6
- movd ebx, mm2 ; w4
- psrlq mm2, 32 ; 0|w5
- shl eax, 16 ; w7|w6|0|0
- movd edx, mm2 ; w5
- mov al, [ecx][ebx] ; w4
- mov ah, [ecx][edx] ; w5
- movd mm5, eax ; w7|w6|w5|w4
- punpckldq mm4, mm5 ; w7|w6|w5|w4|w3|w2|w1|w0
- add locdwwsptr, 16 // next row of the scratch array (8 words = 16 bytes)
- mov eax, locdwcounter
- movq [edi], mm4 // write the 8 output bytes of this row
- ;; update address pointer and loop counter
- dec eax
- mov locdwcounter, eax
- jnz idct_row
- ;;;;;;; end of 1D-idct on all the rows ;;;;;;;
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- emms // leave MMX state so that x87 floating-point code can run afterwards
- } //end of __asm
- }
- #endif /* DCT_ISLOW_SUPPORTED */