来源: C#读取PDF文档文字内容 – tzdk – 博客园
C#读取PDF文档文字内容
通过 iTextSharp 读取 PDF 文件内容。先下载 iTextSharp(下载地址见原文链接),下载后解压 itextsharp-dll-core.zip,并在项目中引用其中的程序集。
下面的方法只能读取英文和数字,文档中包含的汉字无法正常读取:
/// <summary>
/// Concatenates the content bytes of every page of a PDF file, decoded as UTF-8.
/// </summary>
/// <param name="filepath">Path of the PDF file to open.</param>
/// <returns>
/// The decoded content of all pages in page order, or <c>null</c> if any
/// exception occurred (the exception is appended to <c>log.log</c>).
/// </returns>
/// <remarks>
/// NOTE(review): <c>GetPageContent</c> presumably returns the page's raw content
/// stream rather than extracted text, which would explain why non-ASCII text
/// (e.g. Chinese) does not come out readable - confirm against the iTextSharp docs.
/// </remarks>
private string ReadPdfContent(string filepath)
{
    PdfReader pdfReader = null;
    try
    {
        pdfReader = new PdfReader(filepath);
        int numberOfPages = pdfReader.NumberOfPages;

        // Accumulate in a builder instead of repeated string concatenation.
        System.Text.StringBuilder text = new System.Text.StringBuilder();

        // The loop is 1-based and inclusive of NumberOfPages.
        for (int i = 1; i <= numberOfPages; ++i)
        {
            byte[] bufferOfPageContent = pdfReader.GetPageContent(i);
            text.Append(System.Text.Encoding.UTF8.GetString(bufferOfPageContent));
        }

        return text.ToString();
    }
    catch (Exception ex)
    {
        // Best-effort behavior is kept: log the failure and signal it with null.
        // The original referenced an undefined "e.FullPath"; the file being
        // processed is the "filepath" argument.
        using (StreamWriter log = File.AppendText(System.AppDomain.CurrentDomain.SetupInformation.ApplicationBase + "\\log.log"))
        {
            log.WriteLine("出错文件:" + filepath + "原因:" + ex.ToString());
        }
        return null;
    }
    finally
    {
        // Close the reader on both the success and the failure path.
        if (pdfReader != null)
        {
            pdfReader.Close();
        }
    }
}
下面的方法可以同时读取中文和英文:
/// <summary>
/// Extracts the text of every page of a PDF file using iTextSharp's
/// <c>PdfTextExtractor</c> with a <c>SimpleTextExtractionStrategy</c>.
/// </summary>
/// <param name="filepath">Path of the PDF file to open.</param>
/// <returns>
/// The extracted text of all pages concatenated in page order, or <c>null</c>
/// if any exception occurred (the exception is appended to <c>mylog.log</c>).
/// </returns>
private string OnCreated(string filepath)
{
    PdfReader pdfReader = null;
    try
    {
        pdfReader = new PdfReader(filepath);
        int numberOfPages = pdfReader.NumberOfPages;

        // Accumulate in a builder instead of repeated string concatenation.
        System.Text.StringBuilder text = new System.Text.StringBuilder();

        // The loop is 1-based and inclusive of NumberOfPages.
        for (int i = 1; i <= numberOfPages; ++i)
        {
            // A new strategy instance is created for each page, as in the original.
            iTextSharp.text.pdf.parser.ITextExtractionStrategy strategy =
                new iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy();
            text.Append(iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(pdfReader, i, strategy));
        }

        return text.ToString();
    }
    catch (Exception ex)
    {
        // Best-effort behavior is kept: log the failure and signal it with null.
        // The original referenced an undefined "e.FullPath"; the file being
        // processed is the "filepath" argument.
        using (StreamWriter wlog = File.AppendText(System.AppDomain.CurrentDomain.SetupInformation.ApplicationBase + "\\mylog.log"))
        {
            wlog.WriteLine("出错文件:" + filepath + "原因:" + ex.ToString());
        }
        return null;
    }
    finally
    {
        // Close the reader on both the success and the failure path.
        if (pdfReader != null)
        {
            pdfReader.Close();
        }
    }
}
Mikel
