此乃软件工程课的个人项目,小弟入门水平请轻拍。。
多谢anran大神指点@wanganran
以及参考了一个C#的多线程入门的文章
各位看官想要最终代码可直接跳转到最后。。
10.08:初步考虑用多线程(双线程,一个IO,一个compute),先快速写出单线程最最最简单的版本
1 string rootdir = args[0]; 2 int N = Convert.ToInt32(args[1]); 3 string filePattern = args[2]; 4 5 string[] files = Directory.GetFiles(rootdir, filePattern); 6 7 int[] tablet = new int[128]; 8 for (int i = 'a'; i <= 'z'; i++) 9 {10 tablet[i] = 1;11 }12 for (int i = 'A'; i <= 'Z'; i++)13 {14 tablet[i] = 1;15 }16 StringBuilder sb = new StringBuilder(100);17 Dictionaryresult = new Dictionary (50000);18 19 string strKey = "";20 string readLine = "";21 foreach (string file in files)22 {23 FileStream fs = new FileStream(file,FileMode.Open);24 StreamReader sr = new StreamReader(fs);25 readLine = sr.ReadToEnd();26 sr.Close();27 fs.Close();28 29 int state = 0;30 31 for (int i = 0; i < readLine.Length; i++) 32 {33 switch (state)34 {35 case 0:36 if ((state = ((readLine[i] & 0x7f) != readLine[i]) ? 0 : tablet[readLine[i]]) != 0)37 {38 sb.Clear();39 sb.Append(readLine[i]);40 }41 break;42 default:43 if ((state = ((readLine[i] & 0x7f) != readLine[i]) ? 0 : tablet[readLine[i]]) == 0)44 {45 if (sb.Length >= 1)46 {47 strKey = sb.ToString();48 strKey = strKey.ToLower();49 int value;50 if (result.TryGetValue(strKey,out value))51 {52 result[strKey]++;53 }54 else 55 {56 result.Add(strKey,1);57 }58 }59 }60 else61 sb.Append(readLine[i]);62 break;63 }64 }65 }66 var outputResult = from KVP in result67 orderby KVP.Value descending68 select new StringBuilder(KVP.Key).Append(" ").Append(KVP.Value);69 int count = 0;70 foreach (var str in outputResult)71 {72 Console.WriteLine(str);73 count++;74 if (count > N - 1) break;75 }
考虑了简单的优化包括:
1, StringBuilder.Append
2, Dictionary 索引复杂度O(1)
3, Dictionary.trygetvalue代替Dictionary.containsKey();
4,估算单词个数提前开辟内存
性能分析:
考虑优化String.ToLower(),两种思路:
1, 自己写tolower函数
2, 先不tolower,最后再merge
自己写ToLower()函数(因为前面已经根据字母表对tolower的输入进行了限制,故此处可省略很多判断步骤):
public static StringBuilder ToLower(StringBuilder str) { for (int i = 0; i < str.Length; i++) { if (str[i] <= 'Z') { str[i] =(char) ((int)str[i]+32); } } return str; }
只需原来1/6的时间
10.09. 第一项任务是把tolower merge到最后一步完成。
即统计词频时区分大小写简历dictionary,最后再创建新的dictionary对每一个key tolower,再merge。
2小时后:遇到了一个问题
Dictionary<StringBuilder,int> 中的stringbuilder是地址形式存在的,每次Add必须新创建而不能单纯改变其值,否则dictionary.tryGetValue总是true
但是新创建sb类就会浪费时间,先通过string创建StringBuilder再sb.toString()反而多了一次消耗。
总的回顾发现区分大小写保存dictionary再merge的思想,增加了创建第一个dictionary时每次的索引时间,且每个单词多增加了一次string到StringBuilder的内存开辟时间,增加了一次遍历每个数组并索引的操作(建立新的dictionary),而收益是从每个单词执行一次tolower到每种单词(区分大小写)tolower一次。
遂决定暂时放弃这个思路。
下一步:决定设时间点统计一下io操作和计算操作所用时间比(此为用cmd跑程序,要比在ide中跑时间快,后面有些图是在ide中跑的程序)
测试数据集为原数据集复制粘贴若干次后,大小约100M的若干txt文件。
可以看到IO时间只占总时间的约1/8~1/10
所以目前程序开双线程意义不大,可以考虑先考虑继续优化单线程或开多线程处理。
2小时后:
感觉单线程的瓶颈在于索引,这点取决于dictionary数据结构,暂时先不考虑优化。
尝试动手写多线程。
决定先把单线程代码整理一下。
1 static StringBuilder sb = new StringBuilder(100); 2 static Dictionaryresult = new Dictionary (50000); 3 static int[] tablet = new int[128]; 4 static void Main(string[] args) 5 { 6 DateTime dt = DateTime.Now; 7 string rootdir = args[0]; 8 int N = Convert.ToInt32(args[1]); 9 string filePattern = args[2]; 10 11 string[] files = Directory.GetFiles(rootdir, filePattern); 12 13 for (int i = 'a'; i <= 'z'; i++) 14 { 15 tablet[i] = 1; 16 } 17 for (int i = 'A'; i <= 'Z'; i++) 18 { 19 tablet[i] = 1; 20 } 21 22 string readLine = ""; 23 24 DateTime COMt = new DateTime(); 25 DateTime IOt = new DateTime(); 26 DateTime OTHt = new DateTime(); 27 long COMtime = 0; 28 long IOtime = 0; 29 long OTHtime = 0; 30 31 foreach (string file in files) 32 { 33 IOt = DateTime.Now; 34 35 readLine = ReadFile(file); 36 37 IOtime += DateTime.Now.Second * 1000 + DateTime.Now.Millisecond - IOt.Second * 1000 - IOt.Millisecond; 38 39 COMt = DateTime.Now; 40 41 Compute(readLine); 42 43 COMtime += DateTime.Now.Second * 1000 + DateTime.Now.Millisecond - COMt.Second * 1000 - COMt.Millisecond; 44 } 45 46 OTHt = DateTime.Now; 47 48 var outputResult = from KVP in result 49 orderby KVP.Value descending 50 select new StringBuilder(KVP.Key).Append(" ").Append(KVP.Value); 51 int count = 0; 52 foreach (var str in outputResult) 53 { 54 Console.WriteLine(str); 55 count++; 56 if (count > N - 1) break; 57 } 58 OTHtime = DateTime.Now.Second * 1000 + DateTime.Now.Millisecond - OTHt.Second * 1000 - OTHt.Millisecond; 59 60 DateTime ot = DateTime.Now; 61 Console.WriteLine("IO time:" + IOtime + "ms. Compute time:" + COMtime + "ms. Other time: " + OTHtime + "ms."); 62 Console.WriteLine("Time: " + ((ot.Minute * 60 + ot.Second) * 1000 + ot.Millisecond - (dt.Minute * 60 + dt.Second) * 1000 - dt.Millisecond) + "ms"); 63 Console.ReadKey(); 64 } 65 66 public static string ReadFile(string file) 67 { 68 string readLine; 69 FileStream fs = new FileStream(file, FileMode.Open); 70 StreamReader sr = new StreamReader(fs); 71 readLine = sr.ReadToEnd(); 72 sr.Close(); 73 fs.Close(); 74 return readLine; 75 } 76 77 public static void Compute(string readLine) 78 { 79 string strKey = ""; 80 int value; 81 int state = 0; 82 83 for (int i = 0; i < readLine.Length; i++) 84 { 85 switch (state) 86 { 87 case 0: 88 if ((state = ( readLine[i] > ’z’ ) ? 0 : tablet[readLine[i]]) != 0) 89 { 90 sb.Clear(); 91 sb.Append(readLine[i]); 92 } 93 break; 94 default: 95 if ((state = ( readLine[i] > ’z’ ) ? 0 : tablet[readLine[i]]) == 0) 96 { 97 if (sb.Length >= 1) 98 { 99 //strKey = sb.ToString();100 //strKey = strKey.ToLower();101 strKey = ToLower(sb).ToString();102 if (result.TryGetValue(strKey, out value))103 {104 result[strKey]++;105 }106 else107 {108 result.Add(strKey, 1);109 }110 }111 }112 else113 sb.Append(readLine[i]);114 break;115 }116 }117 }118 119 public static StringBuilder ToLower(StringBuilder str)120 {121 for (int i = 0; i < str.Length; i++)122 {123 if (str[i] <= 'Z')124 {125 str[i] = (char)((int)str[i] + 32);126 }127 }128 return str;129 }130
多线程,参考wang anran大神使用BlockingCollection<T>
先写两个线程(IO与Compute),共享一个static dictionary<string,int>以及一个static BlockingCollection<string>,预测可以比单线程提高1/10左右用时(因为创建线程也耗时间)
晚9点:
双线程的已经写完,果然如预测提升了约1/10左右的时间:
但是发现了一个bug,单个字母的统计结果与另一位同学的程序的结果不符。
现在准备来检查一下原因。
终于找到了,原因在于对于字母的分割的歧义上。
按照qiufeng老师给出的规定:Treat all non-alphabet characters as split characters.
所有不在字母表中的字符(包括数字)均视为分隔符。例如:a3b应视为a和b两个单词进行统计。
明天决定进行多个线程协同统计的编写。难点在于如何判别多个线程都已处理完毕。
10.10 准备写一个线程IO,两个线程compute
先把双线程的代码贴出来备份(奇怪的是今天的双线程跑不出昨天的速度了。。忘记改了哪里。。)
1 class Program 2 { 3 static StringBuilder sb = new StringBuilder(100); 4 static Dictionaryresult = new Dictionary (50000); 5 static int[] tablet = new int[128]; 6 static BlockingCollection queue=new BlockingCollection (100); 7 static void Main(string[] args) 8 { 9 DateTime dt = DateTime.Now; 10 string rootdir = args[0]; 11 int N = Convert.ToInt32(args[1]); 12 string filePattern = args[2]; 13 string[] files = Directory.GetFiles(rootdir, filePattern); 14 15 for (int i = 'a'; i <= 'z'; i++) 16 { 17 tablet[i] = 1; 18 } 19 for (int i = 'A'; i <= 'Z'; i++) 20 { 21 tablet[i] = 1; 22 } 23 24 Thread FileIOth = new Thread(delegate() 25 { 26 Read(files); 27 }); 28 FileIOth.Start(); 29 30 31 Thread WorkerTh = new Thread(delegate() 32 { 33 Process(); 34 var outputResult = from KVP in result 35 orderby KVP.Value descending 36 select new StringBuilder(KVP.Key).Append(" ").Append(KVP.Value); 37 int count = 0; 38 foreach (var str in outputResult) 39 { 40 Console.WriteLine(str); 41 count++; 42 if (count > N - 1) break; 43 } 44 DateTime ot = DateTime.Now; 45 Console.WriteLine("Time: " + ((ot.Minute * 60 + ot.Second) * 1000 + ot.Millisecond - (dt.Minute * 60 + dt.Second) * 1000 - dt.Millisecond) + "ms"); 46 Console.ReadKey(); 47 }); 48 WorkerTh.Start(); 49 50 51 } 52 53 public static void Read(string[] files) 54 { 55 foreach (string file in files) 56 { 57 queue.TryAdd(ReadFile(file), -1); 58 } 59 queue.TryAdd("END", -1); 60 } 61 62 public static string ReadFile(string file) 63 { 64 string readLine; 65 FileStream fs = new FileStream(file, FileMode.Open); 66 StreamReader sr = new StreamReader(fs); 67 readLine = sr.ReadToEnd(); 68 sr.Close(); 69 fs.Close(); 70 return readLine; 71 } 72 73 public static void Process() 74 { 75 string readLine; 76 while (true) 77 { 78 queue.TryTake(out readLine, -1); 79 if (readLine == "END") 80 { 81 queue.TryAdd("END", -1); 82 break; 83 } 84 Compute(readLine); 85 } 86 } 87 88 public static void Compute(string readLine) 89 { 90 string strKey = ""; 91 int value; 92 int state = 0; 93 94 for (int i = 0; i < readLine.Length; i++) 95 { 96 switch (state) 97 { 98 case 0: 99 if ((state = (readLine[i] > 'z') ? 0 : tablet[readLine[i]]) != 0)100 {101 sb.Clear();102 sb.Append(readLine[i]);103 }104 break;105 default:106 if ((state = (readLine[i] > 'z') ? 0 : tablet[readLine[i]]) == 0)107 {108 if (sb.Length >= 1)109 {110 strKey = ToLower(sb).ToString();111 if (tablet[strKey[0]] != 1) return;112 if (result.TryGetValue(strKey, out value))113 {114 result[strKey]++;115 }116 else117 {118 result.Add(strKey, 1);119 }120 }121 }122 else123 sb.Append(readLine[i]);124 break;125 }126 }127 }128 129 public static StringBuilder ToLower(StringBuilder str)130 {131 for (int i = 0; i < str.Length; i++)132 {133 if (str[i] <= 'Z')134 {135 str[i] = (char)((int)str[i] + 32);136 }137 }138 return str;139 }140 }
多线程:两个compute一个io
多线程反而更慢。。。寻找原因。
请教了anran大神,大神表示lock非常占时间。。建议更换线程安全的dictionary。
果然。。速度提升了20%。。(先去吃饭,回来再写)
测试了4线程5线程…8线程
发现1个主线程1个io线程3个compute线程的速度最快
1 class Program 2 { 3 static ConcurrentDictionaryresult = new ConcurrentDictionary (1, 50000); 4 static int[] tablet = new int[128]; 5 static BlockingCollection queue; 6 static Thread WorkerTh0; 7 static Thread WorkerTh1; 8 static Thread WorkerTh2; 9 static void Main(string[] args) 10 { 11 DateTime dt = DateTime.Now; 12 string rootdir = args[0]; 13 int N = Convert.ToInt32(args[1]); 14 string filePattern = args[2]; 15 16 string[] files = Directory.GetFiles(rootdir, filePattern); 17 18 for (int i = 'a'; i <= 'z'; i++) 19 { 20 tablet[i] = 1; 21 } 22 for (int i = 'A'; i <= 'Z'; i++) 23 { 24 tablet[i] = 1; 25 } 26 27 queue = new BlockingCollection (100); 28 Thread FileIOth = new Thread(delegate() { Read(files); }); 29 FileIOth.Start(); 30 31 WorkerTh0 = new Thread(delegate() 32 { 33 Process(); 34 }); 35 WorkerTh0.Start(); 36 WorkerTh1 = new Thread(delegate() 37 { 38 Process(); 39 }); 40 WorkerTh1.Start(); 41 42 WorkerTh2 = new Thread(delegate() 43 { 44 Process(); 45 var outputResult = from KVP in result 46 orderby KVP.Value descending 47 select new StringBuilder(KVP.Key).Append(" ").Append(KVP.Value); 48 int count = 0; 49 foreach (var str in outputResult) 50 { 51 Console.WriteLine(str); 52 count++; 53 if (count > N - 1) break; 54 } 55 DateTime ot = DateTime.Now; 56 Console.WriteLine("Time: " + ((ot.Minute * 60 + ot.Second) * 1000 + ot.Millisecond - (dt.Minute * 60 + dt.Second) * 1000 - dt.Millisecond) + "ms"); 57 //Console.ReadKey(); 58 }); 59 WorkerTh2.Start(); 60 61 62 } 63 64 public static void Read(string[] files) 65 { 66 foreach (string file in files) 67 { 68 queue.TryAdd(ReadFile(file), -1); 69 } 70 queue.TryAdd("END", -1); 71 } 72 73 public static string ReadFile(string file) 74 { 75 string readLine; 76 FileStream fs = new FileStream(file, FileMode.Open); 77 StreamReader sr = new StreamReader(fs); 78 readLine = sr.ReadToEnd(); 79 sr.Close(); 80 fs.Close(); 81 return readLine; 82 } 83 84 public static void Process() 85 { 86 string readLine; 87 while (true) 88 { 89 queue.TryTake(out readLine, -1); 90 if (readLine == "END") 91 { 92 queue.TryAdd("END", -1); 93 break; 94 } 95 Compute(readLine); 96 } 97 } 98 99 public static void Compute(string readLine)100 {101 StringBuilder sb = new StringBuilder(100);102 string strKey = "";103 int value;104 int state = 0;105 106 for (int i = 0; i < readLine.Length; i++)107 {108 switch (state)109 {110 case 0:111 if ((state = (readLine[i] > 'z') ? 0 : tablet[readLine[i]]) != 0)112 {113 sb.Clear();114 sb.Append(readLine[i]);115 }116 break;117 default:118 if ((state = (readLine[i] > 'z') ? 0 : tablet[readLine[i]]) == 0)119 {120 if (sb.Length >= 1)121 {122 strKey = ToLower(sb).ToString();123 result.AddOrUpdate(strKey, 1, (k, v) => v + 1);124 }125 }126 else127 sb.Append(readLine[i]);128 break;129 }130 }131 }132 133 public static StringBuilder ToLower(StringBuilder str)134 {135 for (int i = 0; i < str.Length; i++)136 {137 if (str[i] <= 'Z')138 {139 str[i] = (char)((int)str[i] + 32);140 }141 }142 return str;143 }144 }
决定等真实数据来了再测试一下效果。