Java实现html页面抓取转PDF
2021
04.13

Java实现html页面抓取转PDF文档

1.引言:

  在实际项目中,我们有时需要把某个特定页面的内容导出成报告等文档。本文介绍如何借助wkhtmltopdf命令行工具实现这一功能,示例环境为Java 8、Spring Boot和Thymeleaf。

2.总纲:

大概总体步骤分为三步。

1)  通过浏览器发送一个http请求到后台程序,并设置转PDF的位置及所创建的PDF的名称。

2) 通过Java代码调用wkhtmltopdf命令,抓取指定URL的页面并将其转为PDF文档。

3) 将最终生成的PDF文档返回给浏览器,供用户下载。

3.技术实现步骤

1) 首先需要安装WkhtmltoPDF插件。我们可以去官网下载自己对应的版本

官网下载地址:https://wkhtmltopdf.org/downloads.html

如果是给linux下载安装的话则下载linux的版本

2) 随后设置WkhtmltoPDF的环境变量。路径需要设置到安装完的bin文件夹

 

3) 设置到系统变量中的Path中

 

之后我们就可以开发程序了

4) 最后进行程序开发,本文使用IDEA作为开发工具。

1)这里首先引入对应的依赖。

<!-- Spring Boot starter for the Thymeleaf template engine. -->
<dependency>

<groupId>org.springframework.boot</groupId>

<artifactId>spring-boot-starter-thymeleaf</artifactId>

</dependency>

<!-- Spring Boot logging starter; the service class logs through SLF4J. -->
<dependency>

<groupId>org.springframework.boot</groupId>

<artifactId>spring-boot-starter-logging</artifactId>

</dependency>
<!-- Spring Boot developer tools; declared optional so it is not passed on to dependent modules. -->
<dependency>

<groupId>org.springframework.boot</groupId>

<artifactId>spring-boot-devtools</artifactId>

<optional>true</optional>

</dependency>

<!-- Bootstrap 3.3.7 front-end assets packaged as a WebJar. -->
<dependency>

<groupId>org.webjars</groupId>

<artifactId>bootstrap</artifactId>

<version>3.3.7</version>

</dependency>


3.1 Controller层
package org.thinkingingis.controller;
import java.io.ByteArrayInputStream;
import java.io.IOException;

import java.io.InputStream;

import java.io.OutputStream;

import javax.servlet.http.HttpServletResponse;

import org.apache.tomcat.util.http.fileupload.IOUtils;

import org.springframework.boot.web.client.RestTemplateBuilder;

import org.springframework.stereotype.Controller;

import org.springframework.web.bind.annotation.RequestMapping;

import org.springframework.web.bind.annotation.RequestMethod;

import org.springframework.web.client.RestTemplate;

import org.thinkingingis.model.PdfFileRequest;

import org.thinkingingis.service.PdfFileCreator;



/**
 * MVC controller that exposes the PDF export endpoint under {@code /print}.
 *
 * <p>The actual conversion is delegated to {@link PdfFileCreator}.
 */
@Controller
@RequestMapping("/print")
public class PrintPdfController {

    // Built from the injected builder; not used by the request handler below.
    private final RestTemplate restTemplate;

    private final PdfFileCreator pdfFileCreator;

    /**
     * Creates the controller.
     *
     * @param restTemplateBuilder builder used to create this controller's {@link RestTemplate}
     * @param pdfFileCreator      service that converts a page to PDF and writes it to the response
     */
    public PrintPdfController(RestTemplateBuilder restTemplateBuilder, PdfFileCreator pdfFileCreator) {
        this.restTemplate = restTemplateBuilder.build();
        this.pdfFileCreator = pdfFileCreator;
    }

    /**
     * Handles {@code GET /print/pdf}: converts the fixed source page to a PDF named
     * {@code index.pdf} and writes it to the response.
     *
     * @param response servlet response that receives the generated PDF
     */
    @RequestMapping(value = "/pdf", method = RequestMethod.GET)
    public void createPdfFromUrl(HttpServletResponse response) {
        PdfFileRequest fileRequest = new PdfFileRequest();
        fileRequest.setFileName("index.pdf");
        fileRequest.setSourceHtmlUrl("http://www.baidu.com");
        pdfFileCreator.writePdfToResponse(fileRequest, response);
    }

    /**
     * Writes an in-memory PDF to the response as a file download.
     *
     * @param pdfFile  content of the PDF file
     * @param fileName file name offered to the client in the Content-Disposition header
     * @param response servlet response that receives the file
     * @throws RuntimeException if writing to the response fails
     */
    private void writePdfFileToResponse(byte[] pdfFile, String fileName, HttpServletResponse response) {
        // Headers must be set before the body is written: once the response has been
        // committed (for example by flush()), header changes are ignored.
        response.setContentType("application/pdf");
        response.setHeader("Content-Disposition", "attachment; filename=\"" + fileName + "\"");
        response.setContentLength(pdfFile.length);

        try (InputStream in = new ByteArrayInputStream(pdfFile)) {
            OutputStream out = response.getOutputStream();
            IOUtils.copy(in, out);
            out.flush();
        } catch (IOException ex) {
            throw new RuntimeException("Error occurred when creating PDF file", ex);
        }
    }
}



3.2 Service层



package org.thinkingingis.service;

import java.io.*;

import java.nio.charset.StandardCharsets;

import java.util.Arrays;

import java.util.List;

import java.util.concurrent.TimeUnit;

import javax.servlet.http.HttpServletResponse;

import org.apache.tomcat.util.http.fileupload.IOUtils;

import org.slf4j.Logger;

import org.slf4j.LoggerFactory;

import org.springframework.stereotype.Service;

import org.thinkingingis.model.PdfFileRequest;

/**
 * Creates a PDF document from a web page by running the external wkhtmltopdf
 * command-line tool, then streams the generated file to the HTTP client.
 */
@Service
public class PdfFileCreator {

    private static final Logger LOGGER = LoggerFactory.getLogger(PdfFileCreator.class);

    /** Path of the wkhtmltopdf executable, located in the "bin" folder of the installation. */
    private static final String WKHTMLTOPDF_EXECUTABLE = "F:\\PDFUtils\\wkhtmltopdf\\bin\\wkhtmltopdf";

    /** Directory (including the trailing separator) that receives the generated PDF files. */
    private static final String OUTPUT_DIRECTORY = "F:\\公司程序\\测试工具项目\\Html转pdf\\spring-boot-htmltopdf\\";

    /**
     * Converts the page at the requested URL to a landscape PDF and writes it to the
     * response as a file download.
     *
     * @param fileRequest carries the target PDF file name and the source page URL
     * @param response    servlet response that receives the generated PDF
     * @throws IllegalArgumentException if the file name or the source URL is null or empty
     * @throws RuntimeException         if the external process fails, is interrupted, or the
     *                                  generated file cannot be sent to the client
     */
    public void writePdfToResponse(PdfFileRequest fileRequest, HttpServletResponse response) {
        String pdfFileName = fileRequest.getFileName();
        requireNotNull(pdfFileName, "The file name of the created PDF must be set");
        requireNotEmpty(pdfFileName, "File name of the created PDF cannot be empty");

        String sourceHtmlUrl = fileRequest.getSourceHtmlUrl();
        requireNotNull(sourceHtmlUrl, "Source HTML url must be set");
        requireNotEmpty(sourceHtmlUrl, "Source HTML url cannot be empty");

        String pdfFilePath = OUTPUT_DIRECTORY + pdfFileName;
        List<String> pdfCommand = Arrays.asList(
                WKHTMLTOPDF_EXECUTABLE,
                "--orientation",
                "Landscape",
                sourceHtmlUrl,
                pdfFilePath
        );

        ProcessBuilder pb = new ProcessBuilder(pdfCommand);
        // Merge stderr into stdout so a single reader drains everything the tool prints;
        // an undrained pipe can fill up and block the child process.
        pb.redirectErrorStream(true);

        try {
            Process pdfProcess = pb.start();
            String processOutput = readProcessOutput(pdfProcess);
            int status = waitForProcess(pdfProcess);
            requireSuccessfulExitStatus(status, processOutput);

            // The PDF was generated successfully: send it to the client as a download.
            File pdfFile = new File(pdfFilePath);
            if (!pdfFile.isFile()) {
                LOGGER.error("PDF process reported success but no file exists at: {}", pdfFilePath);
                throw new RuntimeException("PDF generation failed");
            }
            setResponseHeaders(response, pdfFile);
            try (InputStream in = new BufferedInputStream(new FileInputStream(pdfFile))) {
                writeCreatedPdfFileToResponse(in, response);
            }
        } catch (IOException ex) {
            LOGGER.error("Could not create PDF because an exception was thrown: ", ex);
            throw new RuntimeException("PDF generation failed", ex);
        }
    }

    /** Rejects a null value with an {@link IllegalArgumentException} carrying the given message. */
    private void requireNotNull(String value, String message) {
        if (value == null) {
            throw new IllegalArgumentException(message);
        }
    }

    /** Rejects an empty string with an {@link IllegalArgumentException} carrying the given message. */
    private void requireNotEmpty(String value, String message) {
        if (value.isEmpty()) {
            throw new IllegalArgumentException(message);
        }
    }

    /**
     * Reads everything the process prints until its output stream reaches end-of-file.
     * Blocking on readLine() avoids busy-waiting while the process is running.
     */
    private String readProcessOutput(Process process) throws IOException {
        StringBuilder output = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(process.getInputStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                output.append(line).append(System.lineSeparator());
            }
        }
        return output.toString();
    }

    /** Waits for the process to exit and returns its exit status. */
    private int waitForProcess(Process process) {
        try {
            return process.waitFor();
        } catch (InterruptedException ex) {
            // Restore the interrupt flag for callers and stop the now-unwanted child process.
            Thread.currentThread().interrupt();
            process.destroy();
            throw new RuntimeException("PDF generation failed", ex);
        }
    }

    /** Logs the tool output and fails when the process exited with a non-zero status. */
    private void requireSuccessfulExitStatus(int status, String processOutput) {
        if (status != 0) {
            LOGGER.error("The exit value of PDF process is: {}", status);
            LOGGER.error("PDF process ended with error message: {}", processOutput);
            throw new RuntimeException("PDF generation failed");
        }
    }

    /** Sets the download headers; must be called before any body bytes are written. */
    private void setResponseHeaders(HttpServletResponse response, File pdfFile) {
        response.setContentType("application/pdf");
        // Re-encode the UTF-8 bytes as ISO-8859-1 so that non-ASCII (e.g. Chinese) file
        // names are not garbled in the downloaded file's name.
        String headerFileName = new String(
                pdfFile.getName().getBytes(StandardCharsets.UTF_8), StandardCharsets.ISO_8859_1);
        response.setHeader("Content-Disposition", "attachment; filename=\"" + headerFileName + "\"");
        response.setHeader("Content-Length", String.valueOf(pdfFile.length()));
    }

    /** Copies the whole stream to the response body, then flushes and closes the output. */
    private void writeCreatedPdfFileToResponse(InputStream in, HttpServletResponse response) throws IOException {
        OutputStream out = response.getOutputStream();
        IOUtils.copy(in, out);
        out.flush();
        out.close();
    }
}


 


至此,运行项目并访问http://localhost:8080/print/pdf后,浏览器就会下载由百度页面生成的PDF文档。也可以根据自己的项目修改其中部分逻辑。

 

 

 

 

热门文章
1
Java实现html页面抓取转PDF
1207浏览 2021-04-13 17:09
2
对接父子级数据的添加或修改逻辑
1174浏览 2021-04-21 09:47
3
Mysql常用sql语句
1173浏览 2021-04-20 10:46
4
通过微信接口获取code
1165浏览 2021-04-14 09:26
5
vue 页面按钮点击复制内容
1164浏览 2021-04-21 09:42
6
Linux安装中文字体
1124浏览 2021-04-13 17:14
7
vue中数组去重的实现
1086浏览 2021-04-14 09:20
8
合并单元格的方法
1081浏览 2021-04-14 09:23
9
Nginx如何配置重定向
1078浏览 2021-04-14 09:09
10
互联网文明颠覆你的认知
895浏览 2020-12-02 15:47