Code/Resource
Windows Develop
Linux-Unix program
Internet-Socket-Network
Web Server
Browser Client
Ftp Server
Ftp Client
Browser Plugins
Proxy Server
Email Server
Email Client
WEB Mail
Firewall-Security
Telnet Server
Telnet Client
ICQ-IM-Chat
Search Engine
Sniffer Package capture
Remote Control
xml-soap-webservice
P2P
WEB(ASP,PHP,...)
TCP/IP Stack
SNMP
Grid Computing
SilverLight
DNS
Cluster Service
Network Security
Communication-Mobile
Game Program
Editor
Multimedia program
Graph program
Compiler program
Compress-Decompress algrithms
Crypt_Decrypt algrithms
Mathimatics-Numerical algorithms
MultiLanguage
Disk/Storage
Java Develop
assembly language
Applications
Other systems
Database system
Embeded-SCM Develop
FlashMX/Flex
source in ebook
Delphi VCL
OS Develop
MiddleWare
MPI
MacOS develop
LabView
ELanguage
Software/Tools
E-Books
Artical/Document
jislex.cpp
Package: shell.rar [view]
Upload User: xhy777
Upload Date: 2007-02-14
Package Size: 24088k
Code Size: 17k
Category:
Windows Kernel
Development Platform:
Visual C++
- /*----------------------------------------------------------------------------
- %%File: jislex.c
- %%Unit: fechmap
- %%Contact: jpick
- Simple converter for decoding a subset of possible ISO-2022-7 encoded
- files (ISO-2022). Data is translated to and from Unicode. Converter
- operates according to user options.
- Module currently handles ISO-2022-JP (and JIS) and ISO-2022-KR.
- Converter is set up to handle ISO-2022-TW and ISO-2022-CN, but there
- are as yet no conversion tables for these.
- ----------------------------------------------------------------------------*/
- #include <stdio.h>
- #include <stddef.h>
- #include "private.h"
- #include "fechmap_.h"
- #include "lexint_.h"
- // State table for reading ISO-2022-7 encoded text
- //
- // Lexer recognizes the following designator sequences, used
- // to select a one or two byte character set:
- //
- // <esc> $ @ -- JIS C 6226-1978 (synonym of <esc> $ ( @)
- // <esc> $ A -- GB 2312-80 (synonym of <esc> $ ( A)
- // <esc> $ B -- JIS X 0208-1983 (synonym of <esc> $ ( B)
- //
- // <esc> $ ( @ -- JIS C 6226-1978
- // <esc> $ ( A -- GB 2312-80
- // <esc> $ ( B -- JIS X 0208-1983
- // <esc> $ ( C -- KS C 5601-1992
- // <esc> $ ( D -- JIS X 0212-1990
- // <esc> $ ( E -- ??? (ISO-IR-165:1992) ???
- // <esc> $ ( G -- CNS 11643-1992 Plane 1
- // <esc> $ ( H -- CNS 11643-1992 Plane 2
- // <esc> $ ( I -- CNS 11643-1992 Plane 3
- // <esc> $ ( J -- CNS 11643-1992 Plane 4
- // <esc> $ ( K -- CNS 11643-1992 Plane 5
- // <esc> $ ( L -- CNS 11643-1992 Plane 6
- // <esc> $ ( M -- CNS 11643-1992 Plane 7
- //
- // <esc> $ ) C -- KSC 5601-1987 (Implies ISO-2022-KR ??)
- //
- // <esc> & @ <esc> $ B -- JIS X 0208-1990
- //
- // <esc> ( B -- Ascii
- // <esc> ( H -- Deprecated variant of JIS-Roman
- // <esc> ( I -- Half-Width Katakana
- // <esc> ( J -- JIS-Roman
- // <esc> ( T -- GB 1988-89 Roman
- //
- // Lexer recognizes the following shift sequences, used to allow
- // interpretation of a given byte or bytes:
- //
- // <si> -- locking shift, interpret bytes as G0
- // <so> -- locking shift, interpret bytes as G1
- // <esc> n -- locking shift, interpret bytes as G2
- // <esc> o -- locking shift, interpret bytes as G3
- // <esc> N -- single shift, interpret bytes as G2
- // <esc> O -- single shift, interpret bytes as G3
- //
- // REVIEW (jpick): don't currently need the final four shift
- // sequences. If we support ISO-2022-CN, we'll need to use
- // G2 and G3 and potentially, then, the last four shifts.
- //
- /*----------------------------------------------------------------------------
- Character Classification Table
- ----------------------------------------------------------------------------*/
// Tokens
//
// Character classes produced by the classification table. The numeric
// values are used as the first index into the state-transition table,
// so they must stay dense (0 .. nTokens-1) and in this order.
//
// Every expansion is fully parenthesized so the cast stays bound to its
// operand wherever the token is used (e.g. `sizeof txt`).
//
#define txt     ((JTK) 0)
#define ext     ((JTK) 1)   // extended characters that are legal under certain circumstances
#define esc     ((JTK) 2)
#define si      ((JTK) 3)
#define so      ((JTK) 4)
#define dlr     ((JTK) 5)
#define at      ((JTK) 6)
#define amp     ((JTK) 7)
#define opr     ((JTK) 8)
#define cpr     ((JTK) 9)
#define tkA     ((JTK) 10)
#define tkB     ((JTK) 11)
#define tkC     ((JTK) 12)
#define tkD     ((JTK) 13)
#define tkE     ((JTK) 14)
#define tkG     ((JTK) 15)
#define tkH     ((JTK) 16)
#define tkI     ((JTK) 17)
#define tkJ     ((JTK) 18)
#define tkK     ((JTK) 19)
#define tkL     ((JTK) 20)
#define tkM     ((JTK) 21)
#define tkT     ((JTK) 22)
#define unk     ((JTK) 23)  // Unexpected character
#define eof     ((JTK) 24)  // end-of-file
#define err     ((JTK) 25)  // read error

// Number of distinct tokens (one past the largest token value).
#define nTokens 26
- // Lookup table for ISO-2022-7 encoded files
- //
- static JTK _rgjtkCharClass[256] =
- // 0 1 2 3 4 5 6 7 8 9 a b c d e f
- {
- // nul soh stx etx eot enq ack bel bs tab lf vt np cr so si 0
- txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, so, si,
- // dle dc1 dc2 dc3 dc4 nak syn etb can em eof esc fs gs rs us 1
- txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, esc, txt, txt, txt, txt,
- // sp ! " # $ % & ' ( ) * + , - . / 2
- txt, txt, txt, txt, dlr, txt, amp, txt, opr, cpr, txt, txt, txt, txt, txt, txt,
- // 0 1 2 3 4 5 6 7 8 9 : ; < = > ? 3
- txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt,
- // @ A B C D E F G H I J K L M N O 4
- at, tkA, tkB, tkC, tkD, tkE, txt, tkG, tkH, tkI, tkJ, tkK, tkL, tkM, txt, txt,
- // P Q R S T U V W X Y Z [ ] ^ _ 5
- txt, txt, txt, txt, tkT, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt,
- // ` a b c d e f g h i j k l m n o 6
- txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt,
- // p q r s t u v w x y z { | } ~ del 7
- txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt,
- // 8
- unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk,
- // 9
- unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk,
- // a
- unk, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext,
- // b
- ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext,
- // c
- ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext,
- // d
- ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext,
- // e
- unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk,
- // f
- unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk,
- // 0 1 2 3 4 5 6 7 8 9 a b c d e f
- };
- /*----------------------------------------------------------------------------
- State Table
- ----------------------------------------------------------------------------*/
// Final states have the high-bit set.  States that represent the reading
// of a valid character escape sequence also encode the character set
// "name" (moniker??) -- the state with the high bit masked off.
//
// All macro expansions below are fully parenthesized so the casts stay
// bound to their operands wherever the macros are used.
//
// Table State
//
typedef unsigned char TST;

// Final State Mask, Related
//
#define grfFinal                    ((TST) 0x80)
#define _NEscTypeFromState(nState)  ((int) ((nState) & 0x7f))

// ASCII Escape Sequence (Final State)
#define ASC     ((TST) (grfFinal | 0x00))   // Ascii

// Japanese Escape Sequences (Final States)
#define JS0     ((TST) (grfFinal | 0x01))   // JIS-Roman
#define JS1     ((TST) (grfFinal | 0x02))   // Half-Width Katakana
#define JS2     ((TST) (grfFinal | 0x03))   // JIS C 6226-1978
#define JS3     ((TST) (grfFinal | 0x04))   // JIS X 0208-1983
#define JS4     ((TST) (grfFinal | 0x05))   // JIS X 0208-1990
#define JS5     ((TST) (grfFinal | 0x06))   // JIS X 0212-1990

// Chinese (PRC) Escape Sequences (Final States)
#define CS0     ((TST) (grfFinal | 0x07))   // GB 1988-89 Roman
#define CS1     ((TST) (grfFinal | 0x08))   // GB 2312-80

// Chinese (Taiwan) Escape Sequences (Final States)
#define TS0     ((TST) (grfFinal | 0x09))   // CNS 11643-1992 Plane 1
#define TS1     ((TST) (grfFinal | 0x0a))   // CNS 11643-1992 Plane 2
#define TS2     ((TST) (grfFinal | 0x0b))   // CNS 11643-1992 Plane 3
#define TS3     ((TST) (grfFinal | 0x0c))   // CNS 11643-1992 Plane 4
#define TS4     ((TST) (grfFinal | 0x0d))   // CNS 11643-1992 Plane 5
#define TS5     ((TST) (grfFinal | 0x0e))   // CNS 11643-1992 Plane 6
#define TS6     ((TST) (grfFinal | 0x0f))   // CNS 11643-1992 Plane 7

// Korean Escape Sequences (Final State)
#define KS0     ((TST) (grfFinal | 0x10))   // KS C 5601-1992

// Document "Signal" for ISO-2022-KR (Doc needs special processing)
#define KSD     ((TST) (grfFinal | 0x11))   // ISO-2022-KR Document Signal

// Number of unique *character set* escape sequences (ASC .. KSD)
//
#define cCsEsc  18

// Special States (not escape sequence) (Final States)
//
#define TXT     ((TST) (grfFinal | (cCsEsc + 1)))   // Process Text
#define EXT     ((TST) (grfFinal | (cCsEsc + 2)))   // Process (Possibly Illegal) Extended Chars
#define FIN     ((TST) (grfFinal | (cCsEsc + 3)))   // Finish
#define EOI     ((TST) (grfFinal | (cCsEsc + 4)))   // Unexpected End-Of-Input
#define UNK     ((TST) (grfFinal | (cCsEsc + 5)))   // Unknown State (Unexpected Character)
#define ERR     ((TST) (grfFinal | (cCsEsc + 6)))   // Read Error

// Shift Sequences (do not specify character set) (Final States)
//
#define LSO     ((TST) (grfFinal | (cCsEsc + 7)))   // Locking shift out (g1 into GL)
#define LSI     ((TST) (grfFinal | (cCsEsc + 8)))   // Locking shift in (g0 into GL)

// For convenience, also define constants for the sets
// that the states represent (the final state with the high bit removed).
//
#define csNIL   (-1)                        // Invalid Designator
#define csASC   (_NEscTypeFromState(ASC))   // Ascii
#define csJS0   (_NEscTypeFromState(JS0))   // JIS-Roman
#define csJS1   (_NEscTypeFromState(JS1))   // Half-Width Katakana
#define csJS2   (_NEscTypeFromState(JS2))   // JIS C 6226-1978
#define csJS3   (_NEscTypeFromState(JS3))   // JIS X 0208-1983
#define csJS4   (_NEscTypeFromState(JS4))   // JIS X 0208-1990
#define csJS5   (_NEscTypeFromState(JS5))   // JIS X 0212-1990
#define csCS0   (_NEscTypeFromState(CS0))   // GB 1988-89 Roman
#define csCS1   (_NEscTypeFromState(CS1))   // GB 2312-80
#define csTS0   (_NEscTypeFromState(TS0))   // CNS 11643-1992 Plane 1
#define csTS1   (_NEscTypeFromState(TS1))   // CNS 11643-1992 Plane 2
#define csTS2   (_NEscTypeFromState(TS2))   // CNS 11643-1992 Plane 3
#define csTS3   (_NEscTypeFromState(TS3))   // CNS 11643-1992 Plane 4
#define csTS4   (_NEscTypeFromState(TS4))   // CNS 11643-1992 Plane 5
#define csTS5   (_NEscTypeFromState(TS5))   // CNS 11643-1992 Plane 6
#define csTS6   (_NEscTypeFromState(TS6))   // CNS 11643-1992 Plane 7
#define csKS0   (_NEscTypeFromState(KS0))   // KS C 5601-1992 (into G0)
#define csKSD   (_NEscTypeFromState(KSD))   // KS C 5601-1992 (into G1)

// Table States (Intermediate States)
#define ST0     ((TST) 0)
#define ST1     ((TST) 1)
#define ST2     ((TST) 2)
#define ST3     ((TST) 3)
#define ST4     ((TST) 4)
#define ST5     ((TST) 5)
#define ST6     ((TST) 6)
#define ST7     ((TST) 7)
#define ST8     ((TST) 8)
#define ST9     ((TST) 9)

// Number of "real" (table) states
//
#define nStates 10

// Non-zero when `state` is a final (stopping) state.
#define IsFinal(state)  ((state) & grfFinal)
- // State Have Seen Looking For
- // ----------------------------------------------------------
- // ST0 -- Start State -- <ESC> Text
- // ST1 <ESC> $ & (
- // ST2 <ESC> $ ( ) @ A B (**)
- // ST3 <ESC> $ ( @ A B C D E G H I J K L M
- // ST4 <ESC> $ ) C
- // ST5 <ESC> & @
- // ST6 <ESC> & @ <ESC>
- // ST7 <ESC> & @ <ESC> $
- // ST8 <ESC> & @ <ESC> $ B
- // ST9 <ESC> ( B H I J T
- //
- // (**) "<ESC> $ ID" is a synonym of "<ESC> $ ( ID" for ID=(@, A, B)
- //
- // Because of the large number of tokens, this table is
- // inverted (tokens x states).
- //
- static signed char _rgchNextState[nTokens][nStates] =
- {
- //
- // S S S S S S S S S S
- // T T T T T T T T T T
- // 0 1 2 3 4 5 6 7 8 9
- //--------------------------------------------------------------------
- //
- /* txt */ TXT, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK,
- /* ext */ EXT, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK,
- /* esc */ ST1, UNK, UNK, UNK, UNK, UNK, ST7, UNK, UNK, UNK,
- /* si */ LSI, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK,
- /* so */ LSO, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK,
- /* $ */ TXT, ST2, UNK, UNK, UNK, UNK, UNK, ST8, UNK, UNK,
- /* @ */ TXT, UNK, JS2, JS2, UNK, ST6, UNK, UNK, UNK, UNK,
- /* & */ TXT, ST5, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK,
- /* ( */ TXT, ST9, ST3, UNK, UNK, UNK, UNK, UNK, UNK, UNK,
- /* ) */ TXT, UNK, ST4, UNK, UNK, UNK, UNK, UNK, UNK, UNK,
- /* A */ TXT, UNK, CS1, CS1, UNK, UNK, UNK, UNK, UNK, UNK,
- /* B */ TXT, UNK, JS3, JS3, UNK, UNK, UNK, UNK, JS4, ASC,
- /* C */ TXT, UNK, UNK, KS0, KSD, UNK, UNK, UNK, UNK, UNK,
- /* D */ TXT, UNK, UNK, JS5, UNK, UNK, UNK, UNK, UNK, UNK,
- /* E */ TXT, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK,
- /* G */ TXT, UNK, UNK, TS0, UNK, UNK, UNK, UNK, UNK, UNK,
- /* H */ TXT, UNK, UNK, TS1, UNK, UNK, UNK, UNK, UNK, JS0,
- /* I */ TXT, UNK, UNK, TS2, UNK, UNK, UNK, UNK, UNK, JS1,
- /* J */ TXT, UNK, UNK, TS3, UNK, UNK, UNK, UNK, UNK, JS0,
- /* K */ TXT, UNK, UNK, TS4, UNK, UNK, UNK, UNK, UNK, UNK,
- /* L */ TXT, UNK, UNK, TS5, UNK, UNK, UNK, UNK, UNK, UNK,
- /* M */ TXT, UNK, UNK, TS6, UNK, UNK, UNK, UNK, UNK, UNK,
- /* T */ TXT, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK, CS0,
- /* unk */ UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK,
- /* eof */ FIN, EOI, EOI, EOI, EOI, EOI, EOI, EOI, EOI, EOI,
- /* err */ ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR,
- };
- // Also for ISO-2022 out. Build arrays of possible character
- // sets for each type of input character set. Character sets
- // should appear in order of hit probability (e.g., in 2022-Jp
- // JS3 is the most common set). Mark the end of array with -1.
- // (Only store these for non-ascii sets).
- //
- //
- // China (icetIso2022Cn)
- static int _rgceCn[] = { -1, };
- // Japan (icetIso2022Jp)
- static int _rgceJp[] = { csJS3, csJS1, csJS5, -1, };
- // Korea (icetIso2022Kr)
- static int _rgceKr[] = { -1, };
- // Taiwan (icetIso2022Tw)
- static int _rgceTw[] = { -1, };
- static int *_mpicetrgce[icetCount] =
- {
- 0, // icetEucCn
- 0, // icetEucJp
- 0, // icetEucKr
- 0, // icetEucTw
- _rgceCn, // icetIso2022Cn
- _rgceJp, // icetIso2022Jp
- _rgceKr, // icetIso2022Kr
- _rgceTw, // icetIso2022Tw
- 0, // icetBig5
- 0, // icetGbk
- 0, // icetShiftJis
- 0, // icetWansung
- 0, // icetUtf8
- };
- /* _ J T K G E T N E X T */
- /*----------------------------------------------------------------------------
- %%Function: _JtkGetNext
- %%Contact: jpick
- Get the next character and classify it. Return the token.
- ----------------------------------------------------------------------------*/
- static JTK __inline _JtkGetNext(IStream *pstmIn, PUCHAR puch)
- {
- ULONG rc;
- HRESULT hr;
- hr = pstmIn->Read(puch, 1, &rc);
- if (hr != S_OK )
- return err;
- else if (rc == 0)
- return eof;
- else
- return _rgjtkCharClass[*puch];
- }
- /* C C E R E A D E S C S E Q */
- /*----------------------------------------------------------------------------
- %%Function: CceReadEscSeq
- %%Contact: jpick
- Read pointer is positioned at an escape sequence, figure out
- which escape sequence it is.
- ----------------------------------------------------------------------------*/
- CCE CceReadEscSeq(IStream *pstmIn, ICET *lpicet)
- {
- UCHAR uch;
- TST tstCurr;
- JTK jtk;
- CCE cceRet;
- #ifdef DEBUG
- TST tstPrev;
- #endif
- // Sanity checks ...
- //
- #ifdef DEBUG
- if (!pstmIn || !lpicet)
- return cceInvalidParameter;
- #endif
- tstCurr = ST0;
- while (1)
- {
- // Find the next stopping state.
- //
- do
- {
- // Get the next character and clasify it.
- //
- jtk = _JtkGetNext(pstmIn, &uch);
- #ifdef DEBUG
- // Save the previous state for debugging purposes, only.
- //
- tstPrev = tstCurr;
- #endif
- // Transition -- note that order is different than
- // "normal" transition tables.
- //
- tstCurr = _rgchNextState[jtk][tstCurr];
- } while (!IsFinal(tstCurr));
- switch (tstCurr)
- {
- case JS0: // JIS-Roman
- case JS1: // Half-Width Katakana
- case JS2: // JIS C 6226-1978
- case JS3: // JIS X 0208-1983
- case JS4: // JIS X 0208-1990
- case JS5: // JIS X 0212-1990
- *lpicet = icetIso2022Jp;
- cceRet = cceSuccess;
- goto _LRet;
- case CS0: // GB 1988-89 Roman
- case CS1: // GB 2312-80
- *lpicet = icetIso2022Cn;
- cceRet = cceSuccess;
- goto _LRet;
- case TS0: // CNS 11643-1992 Plane 1
- case TS1: // CNS 11643-1992 Plane 2
- case TS2: // CNS 11643-1992 Plane 3
- case TS3: // CNS 11643-1992 Plane 4
- case TS4: // CNS 11643-1992 Plane 5
- case TS5: // CNS 11643-1992 Plane 6
- case TS6: // CNS 11643-1992 Plane 7
- *lpicet = icetIso2022Tw;
- cceRet = cceSuccess;
- goto _LRet;
- case KS0: // KS C 5601-1992
- case KSD: // ISO-2022-KR Document Signal
- *lpicet = icetIso2022Kr;
- cceRet = cceSuccess;
- goto _LRet;
- case ASC: // Ascii
- case LSO:
- case LSI:
- case TXT:
- case EXT:
- case FIN:
- // Insufficient information to choose a flavor ...
- cceRet = cceMayBeAscii;
- goto _LRet;
- case ERR:
- cceRet = cceRead;
- goto _LRet;
- default: // UNK, EOI
- cceRet = cceUnknownInput;
- goto _LRet;
- }
- }
- _LRet:
- return cceRet;
- }