1. 크롤링을 할 RSS URL 검색(참고 : 한겨레 RSS서비스)
2. RSS URL을 이용한 신문기사 크롤링하기
- It.java
package controller;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
/**
 * Same-origin proxy for RSS feeds.
 *
 * <p>Reads the feed named by the {@code rss} request parameter (or a default
 * feed when the parameter is missing or blank) and writes it back to the client
 * line by line as {@code text/xml}, so that browser scripts can load a feed
 * hosted on another origin.
 */
@WebServlet("/it")
public class It extends HttpServlet{
    /** Feed fetched when the request carries no usable {@code rss} parameter. */
    private static final String DEFAULT_RSS_URL = "https://www.hani.co.kr/rss/international";

    /**
     * Fetches the requested feed and relays it to the response.
     *
     * @param req  request; the optional {@code rss} parameter holds the feed URL
     * @param resp response that receives the feed as {@code text/xml; charset=utf-8},
     *             or a 400 error when the URL is malformed or not http/https
     * @throws ServletException declared by the servlet contract
     * @throws IOException if the feed cannot be read or the response cannot be written
     */
    @Override
    protected void service(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException {
        String rss = req.getParameter("rss");
        if (rss == null || rss.trim().isEmpty()) {
            rss = DEFAULT_RSS_URL;
        }

        URL url;
        try {
            url = new URL(rss.trim());
        } catch (MalformedURLException e) {
            // A bad parameter is a client error, not a server failure.
            resp.sendError(HttpServletResponse.SC_BAD_REQUEST, "Invalid rss URL");
            return;
        }

        // The URL comes from the client: refuse file:, jar:, ftp: etc. so this
        // endpoint cannot be used to read local resources.
        // NOTE(review): http/https requests to internal hosts are still possible;
        // consider an allow-list of feed hosts if this is exposed publicly.
        String protocol = url.getProtocol();
        if (!"http".equalsIgnoreCase(protocol) && !"https".equalsIgnoreCase(protocol)) {
            resp.sendError(HttpServletResponse.SC_BAD_REQUEST, "Only http and https rss URLs are supported");
            return;
        }

        // Decode explicitly as UTF-8 to match the charset declared on the response
        // instead of relying on the platform default. (Assumes the feed itself is
        // UTF-8 encoded - TODO confirm for feeds other than the default.)
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(url.openStream(), StandardCharsets.UTF_8))) {
            resp.setContentType("text/xml; charset=utf-8");
            PrintWriter out = resp.getWriter();
            String line;
            while ((line = br.readLine()) != null) {
                out.println(line);
            }
        }
    }
}
- xmlParsing.jsp
<%@ page language="java" contentType="text/html; charset=UTF-8"
pageEncoding="UTF-8"%>
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Insert title here</title>
<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.6.0/jquery.min.js" integrity="sha512-894YE6QWD5I59HgZOGReFYm4dnWc1Qt5NtvYSaNcOP+u1T9qYdvdihz0PPSiiqn/+/3e7Jo4EaG7TubfWGUrMQ==" crossorigin="anonymous" referrerpolicy="no-referrer"></script>
<script>
$(function() {
$.get("it", {rss:'https://www.hani.co.kr/rss/international'}, function(data) {
console.log(data);
var dataXml = $.parseXML(data);
console.log(dataXml);
var str = "<table border='1'>"
$(dataXml).find("item").each(function() {
var title = $(this).find("title").text();
var link = $(this).find("link").text();
var description = $(this).find("description").text();
var pubDate = $(this).find("pubDate").text();
str += "<tr>"
str += "<td><a href='" + link + "'>" + title + "</a></td>"
str += "<td>" + pubDate + "</td>"
str += "</tr>"
console.log(title, link, description, pubDate);
})
str += "</table>"
$("body").append(str);
}, "text")
})
</script>
</head>
<body>
</body>
</html>