I've received a couple of private mails from users on how to extract text
from PDF files using the Etymon lib. I thought I'd just post it for the
archives in case anyone's interested.
If you still need help just holler! The references to cat are to a Log4j
Category instance. You can remove them without side effects if you don't
use Log4j.
/**
 * Returns the text found in the content stream(s) of one page.
 *
 * The page's contents entry is resolved first. If it resolves to a
 * PjArray, every element is resolved to a PjStream and the text of each
 * stream is appended in array order; otherwise the resolved object is
 * treated as a single PjStream.
 *
 * @param pdf    the document to read from
 * @param pageNo the page number handed to pdf.getPage()
 * @return the extracted text, or null if an exception was caught (the
 *         exception is logged through cat and not rethrown)
 */
private String getContent(Pdf pdf, int pageNo)
{
    try
    {
        PjPage page = (PjPage) pdf.getObject(pdf.getPage(pageNo));
        PjObject contents = (PjObject) pdf.resolve(page.getContents());

        // Single content stream: nothing to concatenate.
        if (!(contents instanceof PjArray))
        {
            return getStringFromPjStream((PjStream) contents);
        }

        // Array of content streams: concatenate their text in order.
        Vector parts = ((PjArray) contents).getVector();
        StringBuffer text = new StringBuffer();
        int count = parts.size();
        int i = 0;
        while (i < count)
        {
            PjStream part = (PjStream) pdf.resolve((PjObject) parts.get(i));
            text.append(getStringFromPjStream(part));
            i++;
        }
        return text.toString();
    }
    catch (InvalidPdfObjectException pdfe)
    {
        cat.error("Invalid PDF Object:" + pdfe, pdfe);
    }
    catch (Exception e)
    {
        // Also covers a failed cast when the contents are not a stream.
        cat.error("Exception in getContent() " + e, e);
    }
    // Reached only after one of the handlers above has logged a failure.
    return null;
}
/**
 * Collects the text enclosed in parentheses inside a content stream.
 *
 * The stream is flate-decompressed, converted with toString(), and the
 * characters between each "(" and the next ")" are appended to the
 * result in order of appearance.
 *
 * Scanning stops as soon as no further "(" or no matching ")" can be
 * found, so unbalanced parentheses can no longer cause an endless loop
 * or a StringIndexOutOfBoundsException.
 *
 * NOTE(review): escaped or nested parentheses inside a PDF literal string
 * are not interpreted here; the first ")" after a "(" always ends the
 * run.
 *
 * @param stream the content stream to scan
 * @return the concatenated text; empty if nothing was found or if an
 *         InvalidPdfObjectException was caught and logged through cat
 */
private String getStringFromPjStream(PjStream stream)
{
    StringBuffer strbf = new StringBuffer();
    try
    {
        String longString = stream.flateDecompress().toString();
        int end = 0;
        while (true)
        {
            int start = longString.indexOf('(', end);
            if (start == -1)
            {
                // No further opening parenthesis: done.
                break;
            }
            end = longString.indexOf(')', start + 1);
            if (end == -1)
            {
                // Opening parenthesis without a closing one: ignore it.
                break;
            }
            strbf.append(longString.substring(start + 1, end));
        }
    }
    catch (InvalidPdfObjectException pdfe)
    {
        cat.error("InvalidObjectException:" + pdfe.getMessage(), pdfe);
    }
    return strbf.toString();
}
Good luck!
Regards,
Kelvin
Regards,
Kelvin Tan
Relevanz Pte Ltd
http://www.relevanz.com
180B Bencoolen St.
The Bencoolen, #04-01
S(189648)
Tel: 238 6229
Fax: 337 4417
--
To unsubscribe, e-mail: <mailto:lucene-user-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-user-help@jakarta.apache.org>
I've received a couple of private mails from users on how to extract text
from PDF files using the Etymon lib. I thought I'd just post it for the
archives in case anyone's interested.
If you still need help just holler! The references to cat are to a Log4j
Category instance. You can remove them without side effects if you don't
use Log4j.
/**
 * Returns the text found in the content stream(s) of one page.
 *
 * The page's contents entry is resolved first. If it resolves to a
 * PjArray, every element is resolved to a PjStream and the text of each
 * stream is appended in array order; otherwise the resolved object is
 * treated as a single PjStream.
 *
 * @param pdf    the document to read from
 * @param pageNo the page number handed to pdf.getPage()
 * @return the extracted text, or null if an exception was caught (the
 *         exception is logged through cat and not rethrown)
 */
private String getContent(Pdf pdf, int pageNo)
{
    try
    {
        PjPage page = (PjPage) pdf.getObject(pdf.getPage(pageNo));
        PjObject contents = (PjObject) pdf.resolve(page.getContents());

        // Single content stream: nothing to concatenate.
        if (!(contents instanceof PjArray))
        {
            return getStringFromPjStream((PjStream) contents);
        }

        // Array of content streams: concatenate their text in order.
        Vector parts = ((PjArray) contents).getVector();
        StringBuffer text = new StringBuffer();
        int count = parts.size();
        int i = 0;
        while (i < count)
        {
            PjStream part = (PjStream) pdf.resolve((PjObject) parts.get(i));
            text.append(getStringFromPjStream(part));
            i++;
        }
        return text.toString();
    }
    catch (InvalidPdfObjectException pdfe)
    {
        cat.error("Invalid PDF Object:" + pdfe, pdfe);
    }
    catch (Exception e)
    {
        // Also covers a failed cast when the contents are not a stream.
        cat.error("Exception in getContent() " + e, e);
    }
    // Reached only after one of the handlers above has logged a failure.
    return null;
}
/**
 * Collects the text enclosed in parentheses inside a content stream.
 *
 * The stream is flate-decompressed, converted with toString(), and the
 * characters between each "(" and the next ")" are appended to the
 * result in order of appearance.
 *
 * Scanning stops as soon as no further "(" or no matching ")" can be
 * found, so unbalanced parentheses can no longer cause an endless loop
 * or a StringIndexOutOfBoundsException.
 *
 * NOTE(review): escaped or nested parentheses inside a PDF literal string
 * are not interpreted here; the first ")" after a "(" always ends the
 * run.
 *
 * @param stream the content stream to scan
 * @return the concatenated text; empty if nothing was found or if an
 *         InvalidPdfObjectException was caught and logged through cat
 */
private String getStringFromPjStream(PjStream stream)
{
    StringBuffer strbf = new StringBuffer();
    try
    {
        String longString = stream.flateDecompress().toString();
        int end = 0;
        while (true)
        {
            int start = longString.indexOf('(', end);
            if (start == -1)
            {
                // No further opening parenthesis: done.
                break;
            }
            end = longString.indexOf(')', start + 1);
            if (end == -1)
            {
                // Opening parenthesis without a closing one: ignore it.
                break;
            }
            strbf.append(longString.substring(start + 1, end));
        }
    }
    catch (InvalidPdfObjectException pdfe)
    {
        cat.error("InvalidObjectException:" + pdfe.getMessage(), pdfe);
    }
    return strbf.toString();
}
Good luck!
Regards,
Kelvin
Regards,
Kelvin Tan
Relevanz Pte Ltd
http://www.relevanz.com
180B Bencoolen St.
The Bencoolen, #04-01
S(189648)
Tel: 238 6229
Fax: 337 4417
--
To unsubscribe, e-mail: <mailto:lucene-user-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-user-help@jakarta.apache.org>