I have done this kind of project a lot of times before.
Things you need to do:
1.) Check out the project "Extract Text from PDF in C#". It uses iTextSharp.
- It is best to download the sample project and see how it works; it shows how to extract data from a PDF. Look at the PDFParser class: its ExtractTextFromPDFBytes(byte[] input) function shows how the text is extracted from the uncompressed PDF file. Don't forget to reference the iTextSharp DLL.
PDFParser class
1 using System;
2 using System.IO;
3 using iTextSharp.text.pdf;
4
5 namespace PdfToText
6 {
7 ///
8 /// Parses a PDF file and extracts the text from it.
9 ///
10 public class PDFParser
11 {
12 /// BT = Beginning of a text object operator
13 /// ET = End of a text object operator
14 /// Td move to the start of next line
15 /// 5 Ts = superscript
16 /// -5 Ts = subscript
17
18 #region Fields
19
20 #region _numberOfCharsToKeep
21 ///
22 /// The number of characters to keep, when extracting text.
23 ///
24 private static int _numberOfCharsToKeep = 15;
25 #endregion
26
27 #endregion
28
29 #region ExtractText
30 ///
31 /// Extracts a text from a PDF file.
32 ///
33 /// the full path to the pdf file.
34 /// the output file name.
35 /// the extracted text
36 public bool ExtractText(string inFileName, string outFileName)
37 {
38 StreamWriter outFile = null;
39 try
40 {
41 // Create a reader for the given PDF file
42 PdfReader reader = new PdfReader(inFileName);
43 //outFile = File.CreateText(outFileName);
44 outFile = new StreamWriter(outFileName, false, System.Text.Encoding.UTF8);
45
46 Console.Write("Processing: ");
47
48 int totalLen = 68;
49 float charUnit = ((float)totalLen) / (float)reader.NumberOfPages;
50 int totalWritten= 0;
51 float curUnit = 0;
52
53 for (int page = 1; page = 1.0f)
59 {
60 for (int i = 0; i = 1.0f)
70 {
71 for (int i = 0; i
104 /// This method processes an uncompressed Adobe (text) object
105 /// and extracts text.
106 ///
107 /// uncompressed
108 ///
109 private string ExtractTextFromPDFBytes(byte[] input)
110 {
111 if (input == null || input.Length == 0) return "";
112
113 try
114 {
115 string resultString = "";
116
117 // Flag showing if we are we currently inside a text object
118 bool inTextObject = false;
119
120 // Flag showing if the next character is literal
121 // e.g. '\' to get a '' character or '(' to get '('
122 bool nextLiteral = false;
123
124 // () Bracket nesting level. Text appears inside ()
125 int bracketDepth = 0;
126
127 // Keep previous chars to get extract numbers etc.:
128 char[] previousCharacters = new char[_numberOfCharsToKeep];
129 for (int j = 0; j = ' ') && (c = 128) && (c
/// <summary>
/// Tests whether one of the given two-character tokens (e.g. BT) has just been read.
/// A token counts as "just read" when its two characters sit in the third-to-last and
/// second-to-last slots of the recent-character buffer, and both the slot before them
/// and the slot after them hold a space, carriage return or line feed.
/// </summary>
/// <param name="tokens">The candidate tokens to look for.</param>
/// <param name="recent">The buffer of the most recently read characters.</param>
/// <returns>true if any token matches; otherwise false.</returns>
private bool CheckToken(string[] tokens, char[] recent)
{
    int end = _numberOfCharsToKeep;

    for (int t = 0; t < tokens.Length; t++)
    {
        string candidate = tokens[t];

        // The two token characters must match first, in this order.
        if (recent[end - 3] != candidate[0])
        {
            continue;
        }
        if (recent[end - 2] != candidate[1])
        {
            continue;
        }

        // The character following the token must be whitespace (space, CR or LF).
        char after = recent[end - 1];
        if (after != ' ' && after != '\r' && after != '\n')
        {
            continue;
        }

        // So must the character preceding the token.
        char before = recent[end - 4];
        if (before != ' ' && before != '\r' && before != '\n')
        {
            continue;
        }

        return true;
    }

    return false;
}
259 #endregion
260 }
261 }
2.) Parse the extracted text and create an XML file.
One of my earlier concerns was PDFs that contain broken links or URLs inside the pages. If you are also concerned about this, regular expressions can easily solve the problem, but I suggest you deal with it later on.
Here is some sample code showing how to create an XML file. Understand how it works so that later on you will know how to adapt it to your own code.
try {
    // Resolve the output file name to a physical path on the web server.
    string xmlFile = Server.MapPath("DVDlist.xml");

    // Open the XML file for writing. A null encoding makes the writer emit UTF-8.
    // The using block closes the writer (and releases the file) even if a write throws.
    using (System.Xml.XmlTextWriter writer = new System.Xml.XmlTextWriter(xmlFile, null))
    {
        // Start a new document (writes the XML declaration).
        writer.WriteStartDocument();

        // Write a comment.
        writer.WriteComment("Commentss: XmlWriter Test Program");

        // Indent the elements written from here on.
        writer.Formatting = System.Xml.Formatting.Indented;

        writer.WriteStartElement("DVDlist");
        writer.WriteStartElement("DVD");
        writer.WriteAttributeString("ID", "1");

        // Write some simple elements.
        writer.WriteElementString("Title", "Tere Naam");
        writer.WriteStartElement("Starring");
        writer.WriteElementString("Actor", "Salman Khan");

        writer.WriteEndElement(); // closes Starring
        writer.WriteEndElement(); // closes DVD
        writer.WriteEndElement(); // closes DVDlist
    }
}
catch (Exception e1) {
    // Report the failure by writing the exception to the page response.
    Page.Response.Write(e1);
}
Hope it helps :)
与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…