此处将实验数据处理的过程整理如下:
1.需求
DBLP现在的数据量太大了(截止到2018-07-12 22:38,2G+),根本用不了那么多,而且在XML中不方便处理。
现在的需求是:
根据需求将一定量的数据导入到数据库(此处以mysql为例)
2.处理
经分析不难想到,问题的解决分为两步:
1.按照需求将文件截取一部分(有人也许会说,这都是废话,谁都知道,单独列为一步, 是因为一般的编辑器,还真打不开2G+的xml
)
2.xml导入数据库
2.1截取部分文件
截取文件,我推荐的软件是UltraEdit,至于如何破解,此处不做过多赘述

开始

结束
然后截取你需要的数据拷贝到新的文件里即可,需要注意的是:
新文件
的末尾别忘了拷贝结束标签
</dblp>

新文件的末尾
2.2数据导入数据库
技术选型: java的DM4J
+ mysql
(1)数据库建表
/*
Navicat Premium Data Transfer
Source Server Type : MySQL
Source Server Version : 50527
Source Host : localhost:3306
Source Schema : mypaper
Target Server Type : MySQL
Target Server Version : 50527
File Encoding : 65001
*/SET NAMES utf8mb4;SET FOREIGN_KEY_CHECKS = 0;-- ------------------------------ Table structure for paper-- ----------------------------DROP TABLE IF EXISTS `paper`;CREATE TABLE `paper` ( `id` int(32) UNSIGNED ZEROFILL NOT NULL AUTO_INCREMENT, `author` varchar(1024) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, `title` varchar(1024) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, `mdate` varchar(512) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, `key` varchar(512) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, `pages` varchar(512) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, `year` varchar(64) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, `volume` varchar(512) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, `journal` varchar(512) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, `number` varchar(64) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, `url` varchar(512) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, `ee` varchar(512) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 243354 CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Compact;SET FOREIGN_KEY_CHECKS = 1;
(2)java处理
实体类:
package org.stone6762.domain;/**
* @ClassName:MyPaper
* @author Stone6762
* @Description:
*/public class MyPaper { private String author; private String title; private String mdate; private String key; private String pages; private String year; private String volume; private String journal; private String number; private String url; private String ee; public String getAuthor() { return author;
} public void setAuthor(String author) { this.author = author;
} public String getTitle() { return title;
} public void setTitle(String title) { this.title = title;
} public String getMdate() { return mdate;
} public void setMdate(String mdate) { this.mdate = mdate;
} public String getKey() { return key;
} public void setKey(String key) { this.key = key;
} public String getPages() { return pages;
} public void setPages(String pages) { this.pages = pages;
} public String getYear() { return year;
} public void setYear(String year) { this.year = year;
} public String getVolume() { return volume;
} public void setVolume(String volume) { this.volume = volume;
} public String getJournal() { return journal;
} public void setJournal(String journal) { this.journal = journal;
} public String getNumber() { return number;
} public void setNumber(String number) { this.number = number;
} public String getUrl() { return url;
} public void setUrl(String url) { this.url = url;
} public String getEe() { return ee;
} public void setEe(String ee) { this.ee = ee;
} @Override
public String toString() { return "MyPaper [author=" + author + ", title=" + title + ", mdate=" + mdate + ", key=" + key + ", pages="
+ pages + ", year=" + year + ", volume=" + volume + ", journal=" + journal + ", number=" + number
+ ", url=" + url + ", ee=" + ee + "]";
}
}
xml解析工具
package org.stone6762.utils;import java.io.File;import java.util.ArrayList;import java.util.Iterator;import java.util.List;import org.dom4j.Attribute;import org.dom4j.Document;import org.dom4j.DocumentException;import org.dom4j.Element;import org.dom4j.io.SAXReader;import org.stone6762.domain.MyPaper;/**
* @Title:XML2Bean.java
* @author Stone6762
* @Description:
*/public class XML2Bean { public static List<MyPaper> getPapersFromXML(File file) {
List<MyPaper> paperList = null;
MyPaper paper = null;
SAXReader reader = new SAXReader(); try {
Document document = reader.read(file);
Element paperstore = document.getRootElement();
Iterator storeit = paperstore.elementIterator();
paperList = new ArrayList<MyPaper>(); while (storeit.hasNext()) {
paper = new MyPaper();
Element paperElement = (Element) storeit.next(); // 遍历paperElement的属性
List<Attribute> attributes = paperElement.attributes(); for (Attribute attribute : attributes) { if (attribute.getName().equals("mdate")) {
String mdate = attribute.getValue();
paper.setMdate(mdate);
} if (attribute.getName().equals("key")) {
String key = attribute.getValue();
paper.setKey(key);
}
}
Iterator paperit = paperElement.elementIterator(); while (paperit.hasNext()) {
Element child = (Element) paperit.next();
String nodeName = child.getName(); if (nodeName.equals("author")) {
String author = child.getStringValue();
paper.setAuthor(author);
} else if (nodeName.equals("title")) {
String title = child.getStringValue();
paper.setTitle(title);
} else if (nodeName.equals("pages")) {
String pages = child.getStringValue();
paper.setPages(pages);
} else if (nodeName.equals("year")) {
String year = child.getStringValue();
paper.setYear(year);
} else if (nodeName.equals("volume")) {
String volume = child.getStringValue();
paper.setVolume(volume);
} else if (nodeName.equals("journal")) {
String journal = child.getStringValue();
paper.setJournal(journal);
} else if (nodeName.equals("number")) {
String number = child.getStringValue();
paper.setNumber(number);
} else if (nodeName.equals("url")) {
String url = child.getStringValue();
paper.setUrl(url);
} else if (nodeName.equals("ee")) {
String ee = child.getStringValue();
paper.setEe(ee);
}
}
paperList.add(paper);
paper = null;
}
} catch (DocumentException e) {
e.printStackTrace();
} return paperList;
}
}
导入数据库
package org.stone6762.main;import java.io.File;import java.sql.Connection;import java.sql.DriverManager;import java.sql.PreparedStatement;import java.util.List;import org.stone6762.domain.MyPaper;import org.stone6762.utils.XML2Bean;/**
* @ClassName:Main
* @author Stone6762
* @Description:
*/public class Main { private static String USERNAME = "root"; private static String PASSWORD = "root"; private static String DRIVER = "com.mysql.jdbc.Driver"; private static String URL = "jdbc:mysql://localhost:3306/mypaper"; private static Connection con = null; private static PreparedStatement pstmt = null; public static void main(String[] args) throws Exception {
Class.forName(DRIVER);
System.out.println("注册驱动成功!");
con = DriverManager.getConnection(URL, USERNAME, PASSWORD);
System.out.println("获取连接成功!");
String sql = "INSERT INTO paper (paper.author, "
+ "paper.ee,"
+ "paper.journal,"
+ "paper.key,"
+ "paper.mdate,"
+ "paper.number,"
+ "paper.pages,"
+ "paper.title,"
+ "paper.url,"
+ "paper.volume,"
+ "paper.year )VALUES (?,?,?,?,?,?,?,?,?,?,? )";
pstmt = con.prepareStatement(sql);
List<MyPaper> papers = XML2Bean.getPapersFromXML(new File("ref/DBCP_1_1.xml")); for (int i = 0; i < papers.size(); i++) {
MyPaper myPaper = papers.get(i);
pstmt.setObject(1, myPaper.getAuthor());
pstmt.setObject(2, myPaper.getEe());
pstmt.setObject(3, myPaper.getJournal());
pstmt.setObject(4, myPaper.getKey());
pstmt.setObject(5, myPaper.getMdate());
pstmt.setObject(6, myPaper.getNumber());
pstmt.setObject(7, myPaper.getPages());
pstmt.setObject(8, myPaper.getTitle());
pstmt.setObject(9, myPaper.getUrl());
pstmt.setObject(10, myPaper.getVolume());
pstmt.setObject(11, myPaper.getYear());
pstmt.executeUpdate();
System.out.println(i+"插入成功!");
}
}
}
3.结果展示

部分数据

单条数据
作者:新手村的0级玩家
链接:https://www.jianshu.com/p/e3b1afcc333a