2012年3月24日 星期六

在 Windows Phone 上讀取 BIG5 網頁

SNAGHTML2b39c0e

Windows Phone SDK 是不支援 BIG5 編碼的,它只支援三種編碼。

BigEndianUnicode、Unicode、UTF8

image 

 

因此若要讀取 BIG5 的網頁,就必須自行將 BIG5 轉換為 Unicode,實作的主要重點在於:

  1. 取得 BIG5 –> Unicode 轉換表。(BIG5.TXT)
  2. 將轉換表改用 Dictionary 型態儲存。
  3. 讀取網頁時,用 HttpWebRequest 取得回應的 stream 來讀取原始位元組,不要用 WebClient 直接下載字串!

使用 WebClient(例如 DownloadStringAsync)來讀取網頁,得到的並非是 raw data,而是已被 default encoding(UTF-8)解碼過的字串。更進一步解釋,就是把 BIG5 的位元組當成 UTF-8,套用 UTF8 –> Unicode 的轉換來處理,這會導致資料整個變成不可用的亂碼。

 

實作

將 BIG5.TXT 加入專案

image 

 

在 ContentPanel 加入一個 textBlock

image

 

MainPage.xaml.cs

using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Windows;
using System.Windows.Controls;
using System.Windows.Documents;
using System.Windows.Input;
using System.Windows.Media;
using System.Windows.Media.Animation;
using System.Windows.Shapes;
using Microsoft.Phone.Controls;
 
using System.IO;
using System.Globalization;
using System.Diagnostics;
using System.Text;
 
namespace PhoneApp2
{
    /// <summary>
    /// Sample page that downloads a BIG5-encoded web page and shows it in
    /// <c>textBlock1</c>. The platform offers no BIG5 decoder, so the bytes are
    /// decoded by hand with a BIG5 -> Unicode table loaded from BIG5.TXT.
    /// </summary>
    public partial class MainPage : PhoneApplicationPage
    {
        // Used to marshal the decoded text from the network callback back to the UI thread.
        delegate void DownDelegate(string content);

        // Range of bytes that start a two-byte BIG5 sequence (including vendor extensions).
        private const int Big5LeadByteMin = 0x81;
        private const int Big5LeadByteMax = 0xFE;

        // Smallest byte value that can be the second byte of a BIG5 sequence.
        private const int Big5TrailByteMin = 0x40;

        // BIG5 code whose mapping is shown in place of characters that cannot be decoded.
        private const int Big5FallbackCode = 0xA148;

        // Chunk size used when draining the response stream.
        private const int ReadBufferSize = 4096;

        // Big5 to Unicode mapping table (BIG5 code -> UTF-16 code unit).
        private static readonly Dictionary<int, int> mBIG5_Unicode_MAP = new Dictionary<int, int>();

        /// <summary>
        /// Constructor: loads the mapping table, then starts the asynchronous download.
        /// </summary>
        public MainPage()
        {
            InitializeComponent();
            createBig5ToUnicodeDictionary();
            readBig5WebPage();
        }

        /// <summary>
        /// Shows <paramref name="content"/> in the page's text block. Must run on the UI thread.
        /// </summary>
        /// <param name="content">Text to display.</param>
        private void setContent(string content)
        {
            textBlock1.Text = content;
        }

        /// <summary>
        /// Fills <see cref="mBIG5_Unicode_MAP"/> from the BIG5.TXT content resource.
        /// Each data line is expected to look like <c>0xA140&lt;TAB&gt;0x3000&lt;TAB&gt;# comment</c>.
        /// </summary>
        /// <exception cref="InvalidOperationException">BIG5.TXT is not part of the application package.</exception>
        private void createBig5ToUnicodeDictionary()
        {
            // The table is static, so it survives page re-creation; loading it
            // again would only repeat the work.
            if (mBIG5_Unicode_MAP.Count > 0)
            {
                return;
            }

            var resource = Application.GetResourceStream(new Uri("BIG5.TXT", UriKind.Relative));
            if (resource == null)
            {
                throw new InvalidOperationException("BIG5.TXT was not found in the application package.");
            }

            using (StreamReader sr = new StreamReader(resource.Stream))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    // Skip blank lines and comment lines.
                    if (line.Length == 0 || line[0] == '#')
                    {
                        continue;
                    }

                    string[] lTokens = line.Split(new char[] { '\t' });
                    if (lTokens.Length < 2)
                    {
                        continue;
                    }

                    string big5Token = lTokens[0].Trim();
                    string unicodeToken = lTokens[1].Trim();

                    // Ignore rows without two "0x...." columns (e.g. undefined mappings
                    // or stray end-of-file markers) instead of crashing on Substring.
                    if (!hasHexPrefix(big5Token) || !hasHexPrefix(unicodeToken))
                    {
                        continue;
                    }

                    // Indexer assignment tolerates a repeated key; Add would throw.
                    mBIG5_Unicode_MAP[hexToInt(big5Token.Substring(2))] = hexToInt(unicodeToken.Substring(2));
                }
            }
        }

        /// <summary>
        /// Returns true when <paramref name="token"/> is "0x" followed by at least one character.
        /// </summary>
        private static bool hasHexPrefix(string token)
        {
            return token.Length > 2 && token.StartsWith("0x", StringComparison.OrdinalIgnoreCase);
        }

        /// <summary>
        /// Shows a "loading" message and starts the asynchronous HTTP request for the BIG5 page.
        /// </summary>
        private void readBig5WebPage()
        {
            textBlock1.Text = "讀取中...";
            string url = "http://www.businessweekly.com.tw/feednews.php";
            WebRequest request = WebRequest.Create(url);
            request.BeginGetResponse(ResponseCallback, request);
        }

        /// <summary>
        /// Completion callback for the HTTP request. Runs on a worker thread, decodes
        /// the raw response bytes and posts the resulting text to the UI thread.
        /// </summary>
        /// <param name="result">Async result whose state is the originating request.</param>
        private void ResponseCallback(IAsyncResult result)
        {
            WebRequest request = (WebRequest)result.AsyncState;
            string text;
            try
            {
                WebResponse response = request.EndGetResponse(result);
                try
                {
                    using (Stream s = response.GetResponseStream())
                    {
                        text = big5ToUnicode(s).ToString();
                    }
                }
                finally
                {
                    response.Close();
                }
            }
            catch (WebException ex)
            {
                // An unhandled exception on this worker thread would terminate the
                // application, so report the failure in the UI instead.
                text = "讀取失敗:" + ex.Message;
            }

            Dispatcher.BeginInvoke(new DownDelegate(setContent), text);
        }

        /// <summary>
        /// Decodes a BIG5 byte stream into text using <see cref="mBIG5_Unicode_MAP"/>.
        /// </summary>
        /// <param name="s">Stream positioned at the start of the BIG5 data; read to its end.</param>
        /// <returns>
        /// The decoded text. Single bytes outside the lead-byte range are copied as-is;
        /// two-byte codes missing from the table, and incomplete sequences, are replaced
        /// by the fallback character.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
        private StringBuilder big5ToUnicode(Stream s)
        {
            if (s == null)
            {
                throw new ArgumentNullException("s");
            }

            // Resolve the replacement character once; fall back to '?' if even that
            // entry is missing so decoding can never throw on a lookup.
            int fallbackCode;
            char fallback = mBIG5_Unicode_MAP.TryGetValue(Big5FallbackCode, out fallbackCode)
                ? (char)fallbackCode
                : '?';

            StringBuilder lSB = new StringBuilder();
            byte[] buffer = new byte[ReadBufferSize];
            int leadByte = -1; // -1 means "not inside a two-byte sequence"
            int read;

            // leadByte lives outside the read loop, so a sequence split across two
            // chunks is still decoded correctly.
            while ((read = s.Read(buffer, 0, buffer.Length)) > 0)
            {
                for (int i = 0; i < read; i++)
                {
                    int input = buffer[i];

                    if (leadByte < 0)
                    {
                        if (input >= Big5LeadByteMin && input <= Big5LeadByteMax)
                        {
                            leadByte = input;
                        }
                        else
                        {
                            lSB.Append((char)input);
                        }
                        continue;
                    }

                    if (input < Big5TrailByteMin)
                    {
                        // Not a valid trail byte: the lead byte was stray. Keep the
                        // current byte (markup, digits, line breaks) rather than
                        // swallowing it together with the broken sequence.
                        lSB.Append(fallback);
                        lSB.Append((char)input);
                        leadByte = -1;
                        continue;
                    }

                    int big5Code = (leadByte << 8) | input;
                    leadByte = -1;

                    int unicodeCode;
                    if (mBIG5_Unicode_MAP.TryGetValue(big5Code, out unicodeCode))
                    {
                        lSB.Append((char)unicodeCode);
                    }
                    else
                    {
                        lSB.Append(fallback);
                    }
                }
            }

            // The stream ended in the middle of a two-byte sequence.
            if (leadByte >= 0)
            {
                lSB.Append(fallback);
            }

            return lSB;
        }

        /// <summary>
        /// Parses a hexadecimal string without a "0x" prefix.
        /// </summary>
        /// <param name="hexString">Hex digits, e.g. "A140".</param>
        /// <returns>The parsed value.</returns>
        /// <exception cref="FormatException"><paramref name="hexString"/> is not valid hexadecimal.</exception>
        private int hexToInt(string hexString)
        {
            return int.Parse(hexString, NumberStyles.HexNumber, CultureInfo.InvariantCulture);
        }
    }
}

 

範例程式碼下載

 

參考連結