The plan: fetch the page with Java, use Jsoup to pull out the data I want, save it to a database (I used the Hibernate framework), and finally display the data on my own site.
Douban itself apparently offers an API for developers, but I didn't want to register an account (I don't even have a Douban account yet), so I decided to analyze the page source myself and grab the data that way.
While poking at Douban's page source with the Network tab of Chrome's F12 developer tools, I discovered a request endpoint whose response is a JSON array; each element contains the movie's rating, title, cover image URL, directors, and so on.
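Judging from the keys the parsing code below looks for (directors, rate, title, url, cover), each element of that array should look roughly like this. This is my own reconstruction with placeholder values, not a captured response; note that the slashes in the URLs come back escaped, which is why the code later strips backslashes:

{"directors":["..."],"rate":"9.0","title":"...","url":"https:\/\/movie.douban.com\/subject\/...\/","cover":"https:\/\/img1.doubanio.com\/...jpg"}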


Through this endpoint you can fetch Douban's movies sorted by rating from high to low. Each request returns only 20 records, but by changing the start value in the URL (the offset from which the next 20 records are taken) you can page through more data, as the sketch below shows.
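A minimal pagination sketch; the assumption (mine, not verified in the post) is that the endpoint keeps serving 20-item pages the same way at higher offsets:

// Page through the endpoint by raising start in steps of 20.
public class PagingSketch {
    public static void main(String[] args) {
        String base = "https://movie.douban.com/j/new_search_subjects"
                + "?sort=S&range=0,10&tags=%E7%94%B5%E5%BD%B1&start=";
        for (int start = 0; start < 100; start += 20) {
            String pageUrl = base + start; // start = 0, 20, 40, ...
            System.out.println(pageUrl);   // fetch each page the way getDetail1() does, then parse
        }
    }
}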
Java already has ready-made packages for parsing JSON, but this time I didn't use one. I tried to parse the JSON myself, using regular expressions and String methods to extract the data I wanted, wrap it in a class, and save it to the database.
The code is a bit messy. It went wrong a few times along the way; some methods are commented out and some are never called, but I feel the mistakes are worth keeping (this is my own practice project).
The following code fetches the JSON data; main() lives here too:
package zhangtianxiao;

import java.io.*;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import Json.JsonParase;

public class DownLoadHTML {
    public String s_html = "";
    public String url = "";

    public DownLoadHTML(String url) {
        this.url = url;
        try {
            URL urlc = new URL(url);
            InputStream is = urlc.openStream();
            BufferedReader br = new BufferedReader(new InputStreamReader(is));
            String s1 = null;
            while ((s1 = br.readLine()) != null) {
                s_html += s1;
                //System.out.println(s1);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public void outPut() {
        // Document doc = null;
        // try {
        //     doc = Jsoup.connect("https://movie.douban.com/").get();
        // } catch (IOException e1) {
        //     e1.printStackTrace();
        // }
        // Elements elements = doc.select("a");
        // for (Element e : elements) {
        //     System.out.println(e.toString());
        // }
        File f1 = new File("E:/java/資源/1.html");
        try {
            Document doc = Jsoup.parse(f1, "utf-8", "");
            Elements elements = doc.select("a.item");
            for (Element e : elements) {
                System.out.println(e.toString() + "\n\n");
                System.out.println(e.attr("href"));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) {
        DownLoadHTML d = new DownLoadHTML("https://movie.douban.com/tag/#/?sort=S&range=0,10&tags=");
        //d.outPut();
        d.getDetail1();
    }

    // Test method for fetching a single movie page
    public void getDetail() {
        String url = "https://movie.douban.com/subject/24751811/";
        try {
            Document doc = Jsoup.connect(url).get();
            //System.out.println(doc.toString());
            Element e = doc.selectFirst("span.short").child(0);
            System.out.println(e.text());
            Element e1 = doc.selectFirst("a.lnk-sharing");
            System.out.println(e1.attr("data-name"));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    // The method above couldn't make sense of the data the server sent back,
    // so I searched online and wrote this one
    public void getDetail1() {
        HttpURLConnection conn = null;
        try {
            URL realUrl = new URL("https://movie.douban.com/j/new_search_subjects?sort=S&range=0,10&tags=%E7%94%B5%E5%BD%B1&start=0");
            conn = (HttpURLConnection) realUrl.openConnection();
            conn.setRequestMethod("GET");
            conn.setUseCaches(false);
            conn.setReadTimeout(8000);
            conn.setConnectTimeout(8000);
            conn.setInstanceFollowRedirects(false);
        } catch (Exception e) {
            e.printStackTrace();
        }

        try {
            int responseCode = conn.getResponseCode();
            System.out.println(responseCode);
        } catch (IOException e) {
            e.printStackTrace();
        }

        BufferedReader in = null;
        String result = "";
        try {
            in = new BufferedReader(new InputStreamReader(conn.getInputStream(), "utf-8"));
            String line;
            while ((line = in.readLine()) != null) {
                System.out.println(line);
                result += line;
            }
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }

        JsonParase.parase(result);
    }
}
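A guess about why getDetail() failed: requests from Java go out with a default Java User-Agent, and some sites serve such clients different (or no) content. I can't confirm that's what happened here, but sending a browser-like User-Agent is the usual first thing to try. A hypothetical tweak, not part of the code above (the HttpURLConnection equivalent would be conn.setRequestProperty("User-Agent", ...)):

import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

// Hypothetical: fetch a page while pretending to be a browser.
public class UserAgentSketch {
    public static Document fetch(String url) throws IOException {
        return Jsoup.connect(url)
                .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64)") // browser-like UA
                .get();
    }
}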
Then, worried the code would get too long and confuse me, I pulled the JSON parsing out into a separate class:
package Json;

import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.hibernate.Session;
import org.hibernate.Transaction;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class JsonParase {
    public static void parase(String json) {
        String regex = "(\\{\"directors\":)(.+)(\\})"; // first attempt, unused
        String regex1 = "(?<=\\{)(\"dire.+?)(?=\\})";
        Pattern p = Pattern.compile(regex1);
        Matcher m = p.matcher(json);
        while (m.find()) {
            String e1 = null;
            System.out.println(e1 = m.group());
            String regex2 = "(?<=\")(.+?)(?=\")";

            // Matcher m1 = Pattern.compile(regex2).matcher(e1);
            // while (m1.find()) {
            //     System.out.println(m1.group());
            // }

            String[] ms = e1.split(",");
            Movie movie = new Movie();
            //System.out.println(ms.length);
            for (String m1 : ms) {
                //System.out.println(m1);
                Matcher matcher1 = Pattern.compile(regex2).matcher(m1.replace(":", ""));
                while (matcher1.find()) {
                    //System.out.println(matcher1.group());
                    if (matcher1.group().equals("title")) {
                        matcher1.find();
                        movie.setTitle(matcher1.group().replace("\"", ""));
                        System.out.println(movie.getTitle());
                    }
                    if (matcher1.group().equals("rate")) {
                        matcher1.find();
                        movie.setRate(matcher1.group().replace("\"", ""));
                        System.out.println(movie.getRate());
                    }
                    if (matcher1.group().equals("url")) {
                        matcher1.find();
                        movie.setUrl(matcher1.group().replace("\"", "").replaceAll("https", "https:"));
                        System.out.println(movie.getUrl());
                    }
                    if (matcher1.group().equals("cover")) {
                        matcher1.find();
                        movie.setCoverurl(matcher1.group().replace("\"", "").replaceAll("https", "https:"));
                        System.out.println(movie.getCoverurl());
                    }
                }

                // The split-on-":" version below didn't account for values like
                // http://*********** (the colon inside the URL breaks the split)
                // String[] ms1 = m1.split(":");
                // if (ms1.length == 2) {
                //     Movie movie = new Movie();
                //     //System.out.println(ms1[0]);
                //     //System.out.println(ms1[0].replaceAll("\"", ""));
                //     if (ms1[0].replaceAll("\"", "") == "title") {
                //         movie.setTitle(ms1[1].replaceAll("\"", ""));
                //     }
                //     if (ms1[0].replaceAll("\"", "") == "rate") {
                //         movie.setRate(ms1[1].replaceAll("\"", ""));
                //     }
                //     if (ms1[0].replaceAll("\"", "") == "title") {
                //         movie.setTitle(ms1[1].replaceAll("\"", ""));
                //     }
                // }
                // //System.out.println();
            }
            getComment(movie);
            storeMovie(movie);
        }
    }

    public static void getComment(Movie m) {
        //System.out.println(m.getUrl());
        String url = m.getUrl().replace("\\", "");
        try {
            Document doc = Jsoup.connect(url).get();
            //System.out.println(doc);
            if (doc.selectFirst("span.short") == null) {
                if (doc.selectFirst("div#link-report") == null) {
                    //System.out.println("---------------------------");
                    return;
                }
                Element e2 = doc.selectFirst("div#link-report");
                //System.out.println("************\n" + e2.child(0).text());
                m.setComment(e2.child(0).text());
                return;
            }
            Element e = doc.selectFirst("span.short").child(0);
            //System.out.println(e.text());
            m.setComment(e.text());
            Element e1 = doc.selectFirst("a.lnk-sharing");
            //System.out.println(e1.attr("data-name"));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static void storeMovie(Movie m) {
        Session session = HibernateTools.getSession();
        Transaction tx = session.beginTransaction();
        session.save(m);
        tx.commit();
        session.close();
        //HibernateTools.closeSessionFactory();
    }
}
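The Movie and HibernateTools classes referenced above aren't shown in this post. For reference, a plausible reconstruction of the Movie entity, guessed purely from the getters and setters the parser calls (the real class, and its Hibernate mapping, may differ):

package Json;

// Hypothetical reconstruction: the real Movie class isn't shown in the post.
public class Movie {
    private int id;          // assumed primary key for the Hibernate mapping
    private String title;
    private String rate;
    private String url;      // detail-page URL
    private String coverurl; // cover image URL
    private String comment;  // a short review scraped from the detail page

    public int getId() { return id; }
    public void setId(int id) { this.id = id; }
    public String getTitle() { return title; }
    public void setTitle(String title) { this.title = title; }
    public String getRate() { return rate; }
    public void setRate(String rate) { this.rate = rate; }
    public String getUrl() { return url; }
    public void setUrl(String url) { this.url = url; }
    public String getCoverurl() { return coverurl; }
    public void setCoverurl(String coverurl) { this.coverurl = coverurl; }
    public String getComment() { return comment; }
    public void setComment(String comment) { this.comment = comment; }
}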
The code uses Jsoup (this library feels quite pleasant; you can manipulate the elements of an HTML page much like you would in JavaScript) and Hibernate for database access.
The code above fetched 20 records and saved them to the database; I then read the data back and displayed it on a web page I wrote myself.
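Reading the rows back is a one-line HQL query. A minimal sketch, assuming Hibernate 5 and the hypothetical Movie entity sketched above:

import java.util.List;
import org.hibernate.Session;

// Read-back sketch: list everything that was saved.
public class ReadBackSketch {
    public static void main(String[] args) {
        Session session = HibernateTools.getSession(); // the post's own helper class
        List<Movie> movies = session.createQuery("from Movie", Movie.class).list();
        session.close();
        for (Movie m : movies) {
            System.out.println(m.getTitle() + " " + m.getRate());
        }
    }
}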


Parsing JSON by hand turned out to be quite a hassle, and the result isn't great; it's nowhere near as clean as the libraries other people have already written. When I have time I should read their source code and learn from it.
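For comparison, here is a sketch of the same extraction done with Gson (my choice of library; the post doesn't name one). It assumes the array of movie objects sits under a top-level "data" key, something the regex approach never had to care about:

import java.util.List;
import com.google.gson.Gson;

// Gson sketch: map the JSON straight onto plain classes instead of regex-scanning it.
public class GsonSketch {
    static class Item {
        List<String> directors;
        String rate;
        String title;
        String url;
        String cover;
    }
    static class Response {
        List<Item> data; // assumption: the array lives under a "data" key
    }

    public static void main(String[] args) {
        // Placeholder payload in the assumed shape, not a real Douban response.
        String json = "{\"data\":[{\"directors\":[\"someone\"],\"rate\":\"9.0\","
                + "\"title\":\"example\",\"url\":\"https://movie.douban.com/subject/0/\","
                + "\"cover\":\"https://example.com/0.jpg\"}]}";
        Response r = new Gson().fromJson(json, Response.class);
        for (Item i : r.data) {
            System.out.println(i.title + " " + i.rate + " " + i.url);
        }
    }
}

No regex, no manual quote-stripping, and escaped slashes in the URLs are handled automatically.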
