// Deduplicating tens of millions of lines of TXT data: find the duplicate
// entries and remove them. I tried a variety of approaches, and the method
// below is the fastest I have found so far. The code writes duplicate and
// unique lines to separate files; the key to the speed is the HashSet.
// Requires: using System.Collections.Generic; and using System.IO;

string[] files = new string[2];
files[0] = ROOT_DIR + "unique_data.txt";     // lines seen for the first time
files[1] = ROOT_DIR + "duplicate_data.txt";  // lines seen before

// The using blocks dispose the reader and writers automatically, replacing
// the explicit Close()/Dispose() pairs of the original version.
using (TextReader reader = File.OpenText(m_dataFilePath))
using (TextWriter writer1 = File.CreateText(files[0]))
using (TextWriter writer2 = File.CreateText(files[1]))
{
    string currentLine;
    int idx = 0;
    // MyEqualityComparer is a custom string comparer, defined elsewhere.
    HashSet<string> previousLines = new HashSet<string>(new MyEqualityComparer());

    while ((currentLine = reader.ReadLine()) != null)
    {
        // Report progress every 10,000 lines.
        if ((++idx % 10000) == 0)
            UpdateInfo("Comparing line " + idx + "…");

        currentLine = currentLine.TrimEnd();

        // Add returns true only for a line not seen before, so a single
        // amortized O(1) call both tests for and records membership.
        if (previousLines.Add(currentLine))
        {
            writer1.WriteLine(currentLine);   // first occurrence: keep it
        }
        else if (m_allSave)
        {
            writer2.WriteLine(currentLine);   // duplicate: optionally save it
        }
    }
}

// Processing 10 million lines takes only a moment. Give it a try!
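
// The code above references MyEqualityComparer without showing its definition.
// Below is a minimal sketch of what it might look like, assuming the intent is
// plain ordinal string comparison; the class name is the only hint, so this is
// an assumption for illustration, not the author's actual code.

using System;
using System.Collections.Generic;

class MyEqualityComparer : IEqualityComparer<string>
{
    // Hypothetical stand-in: ordinal equality. Equals and GetHashCode must
    // agree with each other, or HashSet lookups will silently misbehave.
    public bool Equals(string x, string y)
    {
        return string.Equals(x, y, StringComparison.Ordinal);
    }

    public int GetHashCode(string obj)
    {
        return obj == null ? 0 : obj.GetHashCode();
    }
}

// If ordinal comparison is all that is needed, new HashSet<string>(StringComparer.Ordinal)
// does the same job without a custom class. Either way, the design point stands:
// HashSet<string>.Add tests membership and inserts in one amortized O(1) call,
// which is what lets a single pass handle tens of millions of lines quickly.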