目標:將不同格式的JSON文件存入MySQL數據庫
涉及的點有:
1. java處理JSON對象,直接見源碼。
2. java.sql.SQLException: Incorrect string value: ‘\xF0\x9F\x99\x8F\xE5\x8D…’ for column ‘text’ at row 1報錯問題,報錯原因:因為我沒有對插入文本做任何處理,文本內有不同字節的utf8字符,我的處理方式就是過濾后再插入,因為特殊的字符其實也沒什么用。
1 public static String StringFilter(String str) throws PatternSyntaxException { 2 // 清除掉所有特殊字符,只允許漢字字母數字和某些常見符號出現 3 String regEx = "[^0-9a-zA-Z\u4e00-\u9fa5~!@#$%^&*()+=|{}':;',//[//].<>/?~!@#¥%……&*()——+|{}【】‘;:”“’。,、?]+"; 4 Pattern p = Pattern.compile(regEx); 5 Matcher m = p.matcher(str); 6 return m.replaceAll(" ").trim().replaceAll("s+", " "); 7 } 8 9 public static void insertMicroblogs(String uid, String mid, String time,String geo,String source, 10 String repost,String comment,String attitude,String text,int flag,Statement statement) throws Exception { 11 if (uid == null) return; 12 String sql = "insert into data2016.microblog values(\"" + mid+ "\",\"" + uid + "\",\"" + time 13 + "\",\"" + geo+ "\",\"" + source + "\",\"" +repost + "\",\"" + comment + "\",\"" + attitude 14 + "\",\"" + text+ "\"," + flag + ")"; 15 String selectsql = "select count(*) from data2016.microblog where uid = \""+ uid+ "\" and mid=\"" + mid + "\""; 16 17 try { 18 ResultSet rset = statement.executeQuery(selectsql); 19 int rowCount = 0; //記錄查詢結果記錄數 20 if(rset.next()) { 21 rowCount=rset.getInt(1); 22 } 23 if(rowCount == 0){ 24 statement.execute(sql); 25 } 26 27 } catch (Exception e) { 28 pErrorUser.println("microblog:"+uid); 29 // System.out.println(sql); 30 e.printStackTrace(); 31 return; 32 } 33 } 34 public static void readUserMicroblogs() throws Exception{ 35 File file = new File("data/source/first/microblogs/microblogs12.txt"); 36 BufferedInputStream fis = new BufferedInputStream(new FileInputStream(file)); 37 BufferedReader reader = new BufferedReader(new InputStreamReader(fis,"utf-8"),5*1024*1024);// 用5M的緩沖讀取文本文件 38 39 String line = ""; 40 int count = 0; 41 while((line = reader.readLine()) != null){ 42 count++; 43 JSONObject blogObject = new JSONObject(line); 44 String uid = blogObject.getString("uid"); 45 String microblogs = blogObject.getString("microblogs"); 46 JSONArray microblogsArray = new JSONArray(microblogs); 47 int size = microblogsArray.length(); 48 // System.out.println("Size:" + size); 49 for (int i = 0; i < size; i++) { 50 JSONObject jsonObj = microblogsArray.getJSONObject(i); 51 String mid=jsonObj.get("mid").toString(); 52 String created_at=jsonObj.get("created_at").toString(); 53 String geo=StringFilter(jsonObj.get("geo").toString().replaceAll("\"", " ")); 54 String source=jsonObj.get("source").toString().replaceAll("\"", " "); 55 String reposts_count=jsonObj.get("reposts_count").toString(); 56 String comments_count=jsonObj.get("comments_count").toString(); 57 String attitudes_count=jsonObj.get("attitudes_count").toString(); 58 String text; 59 int flag; 60 if(jsonObj.has("retweeted_status")){ 61 flag=1; 62 String retweet = jsonObj.get("retweeted_status").toString(); 63 JSONObject weibo = new JSONObject(retweet); 64 text=StringFilter(weibo.get("text").toString().replaceAll("\"", " ")); 65 }else{ 66 flag=0; 67 text=StringFilter(jsonObj.get("text").toString().replaceAll("\"", " ")); 68 } 69 insertMicroblogs(uid,mid,created_at,geo,source,reposts_count,comments_count,attitudes_count,text,flag,statement); 70 71 } 72 73 if(count%50==0){ 74 System.out.print(count +"..."); 75 } 76 if(count%1000==0){ 77 System.out.println(); 78 } 79 } 80 }
如上代碼是我對新浪微博數據文檔的存入工作,因為文檔較大,所以加入了緩存讀取。遇到的其他問題基本都是特殊字符問題,其中插入文本中有雙引號,原本處理方法是:
String source=jsonObj.get("source").toString().replaceAll("\"", "\\\" ");
但是不知道為嘛,轉義字符沒有成功加入,所以就直接將雙引號替換為空格處理了。這一塊內容原本不打算記,原以為是很順利的事還是倒騰了一天,所以小記一下咯。
