encoding.cpp
Upload User: xhy777
Upload Date: 2007-02-14
Package Size: 24088k
Code Size: 10k
Category:

Windows Kernel

Development Platform:

Visual C++

  1. /*****************************************************************************
  2.     FILE: encoding.cpp
  3.     
  4.     DESCRIPTION:
  5.         Handle taking internet strings by detecting if they are UTF-8 encoded
  6.     or DBCS and finding out what code page was used.
  7. *****************************************************************************/
  8. #include "priv.h"
  9. #include "util.h"
  10. #include "ftpurl.h"
  11. #include "statusbr.h"
  12. #include <commctrl.h>
  13. #include <shdocvw.h>
  14. /*****************************************************************************
  15.     CLASS: CMultiLanguageCache
  16. *****************************************************************************/
  17. HRESULT CMultiLanguageCache::_Init(void)
  18. {
  19.     if (m_pml2)
  20.         return S_OK;
  21.     return CoCreateInstance(CLSID_CMultiLanguage, NULL, CLSCTX_INPROC_SERVER, IID_IMultiLanguage2, (void **) &m_pml2);
  22. }
  23. /*****************************************************************************
  24.     CLASS: CWireEncoding
  25. *****************************************************************************/
  26. CWireEncoding::CWireEncoding(void)
  27. {
  28.     // We can go on the stack, so we may not be zero inited.
  29.     m_nConfidence = 0;
  30.     m_uiCodePage = CP_ACP;     // 
  31.     m_dwMode = 0;
  32.     m_fUseUTF8 = FALSE;
  33. }
  34. CWireEncoding::~CWireEncoding(void)
  35. {
  36. }
  37. void CWireEncoding::_ImproveAccuracy(CMultiLanguageCache * pmlc, LPCWIRESTR pwStr, BOOL fUpdateCP, UINT * puiCodePath)
  38. {
  39.     DetectEncodingInfo dei = {0};
  40.     INT nStructs = 1;
  41.     INT cchSize = lstrlenA(pwStr);
  42.     IMultiLanguage2 * pml2 = pmlc->GetIMultiLanguage2();
  43.     // Assume we will use the normal code page.
  44.     *puiCodePath = m_uiCodePage;
  45.     if (S_OK == pml2->DetectInputCodepage(MLDETECTCP_8BIT, CP_AUTO, (LPWIRESTR)pwStr, &cchSize, &dei, (INT *)&nStructs))
  46.     {
  47.         // Is it UTF8 or just plain ansi(CP_20127)?
  48.         if (((CP_UTF_8 == dei.nCodePage) || (CP_20127 == dei.nCodePage)) &&
  49.             (dei.nConfidence > 70))
  50.         {
  51.             // Yes, so make sure the caller uses UTF8 to decode but don't update
  52.             // the codepage.
  53.             *puiCodePath = CP_UTF_8;
  54.         }
  55.         else
  56.         {
  57.             if (fUpdateCP && (dei.nConfidence > m_nConfidence))
  58.             {
  59.                 m_uiCodePage = dei.nCodePage;
  60.                 m_nConfidence = dei.nConfidence;
  61.             }
  62.         }
  63.     }
  64. }
  65. HRESULT CWireEncoding::WireBytesToUnicode(CMultiLanguageCache * pmlc, LPCWIRESTR pwStr, DWORD dwFlags, LPWSTR pwzDest, DWORD cchSize)
  66. {
  67.     HRESULT hr;
  68.     // Optimize for the fast common case.
  69.     if (Is7BitAnsi(pwStr))
  70.     {
  71.         pwzDest[0] = 0;
  72.         SHAnsiToUnicodeCP(CP_UTF_8, pwStr, pwzDest, cchSize);
  73.         hr = S_OK;
  74.     }
  75.     else
  76.     {
  77. #ifdef FEATURE_CP_AUTODETECT
  78.         if (this)
  79.         {
  80.             CMultiLanguageCache mlcTemp;
  81.             UINT cchSizeTemp = cchSize;
  82.             UINT uiCodePageToUse;
  83.             if (!pmlc)
  84.                 pmlc = &mlcTemp;
  85.             if (!pmlc || !pmlc->GetIMultiLanguage2())
  86.                 return E_FAIL;
  87.             IMultiLanguage2 * pml2 = pmlc->GetIMultiLanguage2();
  88.             _ImproveAccuracy(pmlc, pwStr, (WIREENC_IMPROVE_ACCURACY & dwFlags), &uiCodePageToUse);
  89.             if (CP_ACP == uiCodePageToUse)
  90.                 uiCodePageToUse = GetACP();
  91.             UINT cchSrcSize = lstrlenA(pwStr) + 1; // The need to do the terminator also.
  92.             hr = pml2->ConvertStringToUnicode(&m_dwMode, uiCodePageToUse, (LPWIRESTR)pwStr, &cchSrcSize, pwzDest, &cchSizeTemp);
  93.             if (!(EVAL(S_OK == hr)))
  94.                 SHAnsiToUnicode(pwStr, pwzDest, cchSize);
  95.         }
  96.         else
  97. #endif // FEATURE_CP_AUTODETECT
  98.         {
  99.             UINT uiCodePage = ((WIREENC_USE_UTF8 & dwFlags) ? CP_UTF_8 : CP_ACP);
  100.             SHAnsiToUnicodeCP(uiCodePage, pwStr, pwzDest, cchSize);
  101.         }
  102.     }
  103.     return hr;
  104. }
  105. HRESULT CWireEncoding::UnicodeToWireBytes(CMultiLanguageCache * pmlc, LPCWSTR pwzStr, DWORD dwFlags, LPWIRESTR pwDest, DWORD cchSize)
  106. {
  107.     HRESULT hr = S_OK;
  108. #ifdef FEATURE_CP_AUTODETECT
  109.     CMultiLanguageCache mlcTemp;
  110.     DWORD dwCodePage = CP_UTF_8;
  111.     DWORD dwModeTemp = 0;
  112.     DWORD * pdwMode = &dwModeTemp;
  113.     UINT cchSizeTemp = cchSize;
  114.     // In some cases, we don't know the site, so we use this.
  115.     // BUGBUG: Come back and force this to be set.
  116.     if (this)
  117.     {
  118.         dwCodePage = m_uiCodePage;
  119.         pdwMode = &m_dwMode;
  120.     }
  121.     if (!pmlc)
  122.         pmlc = &mlcTemp;
  123.     if (!pmlc)
  124.         return E_FAIL;
  125.     IMultiLanguage2 * pml2 = pmlc->GetIMultiLanguage2();
  126. //    if (WIREENC_USE_UTF8 & dwFlags)
  127. //        dwCodePage = CP_UTF_8;
  128.     UINT cchSrcSize = lstrlenW(pwzStr) + 1; // The need to do the terminator also.
  129.     if (CP_ACP == dwCodePage)
  130.         dwCodePage = GetACP();
  131.     hr = pml2->ConvertStringFromUnicode(pdwMode, dwCodePage, (LPWSTR) pwzStr, &cchSrcSize, pwDest, &cchSizeTemp);
  132.     if (!(EVAL(S_OK == hr)))
  133.         SHUnicodeToAnsi(pwzStr, pwDest, cchSize);
  134. #else // FEATURE_CP_AUTODETECT
  135.     UINT nCodePage = ((WIREENC_USE_UTF8 & dwFlags) ? CP_UTF_8 : CP_ACP);
  136.     SHUnicodeToAnsiCP(nCodePage, pwzStr, pwDest, cchSize);
  137. #endif // FEATURE_CP_AUTODETECT
  138.     return hr;
  139. }
  140. HRESULT CWireEncoding::ReSetCodePages(CMultiLanguageCache * pmlc, CFtpPidlList * pFtpPidlList)
  141. {
  142.     CMultiLanguageCache mlcTemp;
  143.     
  144.     if (!pmlc)
  145.         pmlc = &mlcTemp;
  146.     
  147.     if (!pmlc)
  148.         return E_FAIL;
  149.     // BUGBUG/TODO:
  150.     return S_OK;
  151. }
  152. HRESULT CWireEncoding::CreateFtpItemID(CMultiLanguageCache * pmlc, LPFTP_FIND_DATA pwfd, LPITEMIDLIST * ppidl)
  153. {
  154.     CMultiLanguageCache mlcTemp;
  155.     WCHAR wzDisplayName[MAX_PATH];
  156.     
  157.     if (!pmlc)
  158.         pmlc = &mlcTemp;
  159.     WireBytesToUnicode(pmlc, pwfd->cFileName, (m_fUseUTF8 ? WIREENC_USE_UTF8 : WIREENC_NONE), wzDisplayName, ARRAYSIZE(wzDisplayName));
  160.     return FtpItemID_CreateReal(pwfd, wzDisplayName, ppidl);
  161. }
  162. HRESULT CWireEncoding::ChangeFtpItemIDName(CMultiLanguageCache * pmlc, LPCITEMIDLIST pidlBefore, LPCWSTR pwzNewName, BOOL fUTF8, LPITEMIDLIST * ppidlAfter)
  163. {
  164.     CMultiLanguageCache mlcTemp;
  165.     WIRECHAR wWireName[MAX_PATH];
  166.     HRESULT hr;
  167.     if (!pmlc)
  168.         pmlc = &mlcTemp;
  169.     hr = UnicodeToWireBytes(pmlc, pwzNewName, (fUTF8 ? WIREENC_USE_UTF8 : WIREENC_NONE), wWireName, ARRAYSIZE(wWireName));
  170.     if (EVAL(SUCCEEDED(hr)))
  171.         hr = FtpItemID_CreateWithNewName(pidlBefore, pwzNewName, wWireName, ppidlAfter);
  172.     return hr;
  173. }
  174. BOOL SHIsUTF8Encoded(LPCWIRESTR pszIsUTF8)
  175. {
  176.     unsigned int len = lstrlenA(pszIsUTF8);
  177.     LPCWIRESTR endbuf = pszIsUTF8 + len;
  178.     unsigned char byte2mask = 0x00;
  179.     unsigned char c;
  180.     int trailing = 0;               // trailing (continuation) bytes to follow
  181.                              
  182.      while (pszIsUTF8 != endbuf)
  183.      {
  184.          c = *pszIsUTF8++;
  185.          if (trailing)
  186.          {
  187.              if ((c & 0xC0) == 0x80)    // Does trailing byte follow UTF-8 format?
  188.              {
  189.                  if (byte2mask)     // Need to check 2nd byte for proper range?
  190.                  {
  191.                      if (c & byte2mask)      // Are appropriate bits set?
  192.                          byte2mask=0x00;
  193.                      else
  194.                          return 0;
  195.                      trailing--;
  196.                  }
  197.              }
  198.              else
  199.                  return FALSE;
  200.          }
  201.          else
  202.          {
  203.              if ((c & 0x80) == 0x00)
  204.                  continue;         // valid 1 byte UTF-8
  205.              else
  206.              {
  207.                  if ((c & 0xE0) == 0xC0) // valid 2 byte UTF-8
  208.                  {
  209.                     if (c & 0x1E)      // Is UTF-8 byte in proper range?
  210.                     {
  211.                         trailing =1;
  212.                     }
  213.                     else
  214.                         return FALSE;
  215.                  }
  216.                  else
  217.                  {
  218.                      if ((c & 0xF0) == 0xE0)               // valid 3 byte UTF-8
  219.                      {
  220.                          if (!(c & 0x0F))                          // Is UTF-8 byte in proper range?
  221.                             byte2mask=0x20;                    // If not set mask to check next byte
  222.                          trailing = 2;
  223.                      }
  224.                      else
  225.                      {
  226.                          if ((c & 0xF8) == 0xF0)               // valid 4 byte UTF-8
  227.                          {
  228.                              if (!(c & 0x07))                          // Is UTF-8 byte in proper range?
  229.                                 byte2mask=0x30;                    // If not set mask to check next byte
  230.                              trailing = 3;
  231.                          }
  232.                          else
  233.                          {
  234.                              if ((c & 0xFC) == 0xF8)               // valid 5 byte UTF-8
  235.                              {
  236.                                  if (!(c & 0x03))                          // Is UTF-8 byte in proper range?
  237.                                     byte2mask=0x38;                    // If not set mask to check next byte
  238.                                  trailing = 4;
  239.                              }
  240.                              else
  241.                              {
  242.                                  if ((c & 0xFE) == 0xFC)               // valid 6 byte UTF-8
  243.                                  {
  244.                                      if (!(c & 0x01))                          // Is UTF-8 byte in proper range?
  245.                                         byte2mask=0x3C;                    // If not set mask to check next byte
  246.                                      trailing = 5;
  247.                                  }
  248.                                  else
  249.                                      return FALSE;
  250.                              }
  251.                          }
  252.                      }
  253.                  }
  254.              }
  255.          }
  256.      }
  257.      return (trailing == 0);
  258.  }