Knowledge Base - Extract images from a PDF document

You can use the following example to extract all the images (not annotations) contained in a given PDF document.

// Extracts every image of the PDF at `path` and writes each one as a PNG file named
// <source name without extension>_<page index>_<image char index>.png under imageExtractPath.
// `path`, `file` and `imageExtractPath` must be defined by the surrounding code, which also
// has to handle the IOException that ImageIO.write may throw.
Document inputFile = new Document();
try {
    if (inputFile.Open(path, "") == 0) { // this example treats 0 as "opened successfully"
        String format = "png";

        // Strip the extension only when there is one; a name without a dot is used as-is.
        String sourceName = file.getName();
        int dot = sourceName.lastIndexOf('.');
        String baseName = dot > 0 ? sourceName.substring(0, dot) : sourceName;

        // Walk every page of the document instead of a fixed number of pages.
        int pageCount = inputFile.GetPageCount();
        for (int pageIndex = 0; pageIndex < pageCount; pageIndex++) {
            Page page = inputFile.GetPage(pageIndex);
            if (page == null) {
                continue;
            }
            try {
                page.ObjsStart();
                List<Integer> imageIndices = getImagesIndex(page);
                if (imageIndices == null) {
                    continue; // no images on this page; the finally block still closes it
                }
                for (int imageIndex : imageIndices) {
                    PDFImage extractedImage = getImage(page, imageIndex);
                    // getImage may hand back a wrapper without pixels; ImageIO.write rejects null images
                    if (extractedImage == null || extractedImage.getImage() == null) {
                        continue;
                    }
                    // The image index is part of the name so that several images
                    // on the same page do not overwrite each other.
                    File target = new File(imageExtractPath + baseName
                            + "_" + pageIndex + "_" + imageIndex + "." + format);
                    ImageIO.write(extractedImage.getImage(), format, target);
                }
            } finally {
                page.Close(); // always release the page, even on early exit or failure
            }
        }
    }
} finally {
    inputFile.Close();
}
/**
 * Scans the character objects of the given page and collects the positions
 * whose unicode value is reported as -1, which this example treats as images.
 *
 * @param page the page to scan; ObjsStart must have been invoked on it first.
 * @return the indices of the image entries in ascending order, or {@code null}
 *         when the page contains none.
 */
public static List<Integer> getImagesIndex(Page page) {
    final int total = page.ObjsGetCharCount();
    List<Integer> found = null;
    int position = 0;
    while (position < total) {
        boolean isImage = page.ObjsGetCharUnicode(position) == -1;
        if (isImage) {
            // the list is created lazily so that "no images" is reported as null
            if (found == null) {
                found = new ArrayList<Integer>();
            }
            found.add(position);
        }
        position++;
    }
    return found;
}

 

/**
 * Wraps the image stored at the given character index of a page in a {@link PDFImage}.
 *
 * @param page the pdf page object; ObjsStart must have been invoked on it first.
 * @param imageIndex the index of the image char in the given page.
 * @return a PDFImage carrying the index, the image rectangle and the extracted
 *         BufferedImage; the wrapped image is {@code null} when the page yields none.
 */
public static PDFImage getImage(Page page, int imageIndex) {
    BufferedImage raw = page.ObjsGetImage(imageIndex);
    // the image is extracted upside-down, so it is flipped before being stored
    BufferedImage upright = (raw == null) ? null : flipImageVertically(raw);

    // image coordinates, filled in by the page
    float[] bounds = new float[4];
    page.ObjsGetCharRect(imageIndex, bounds);

    PDFImage result = new PDFImage();
    result.setImageIndex(imageIndex);
    result.setImage(upright);
    result.setImageRect(bounds);
    return result;
}

 

/**
 * Returns a copy of the given image mirrored along its horizontal axis (top row becomes bottom row).
 *
 * @param input the image to flip; must not be null.
 * @return a new image with the same dimensions. It has the same type as the input, except that
 *         {@code TYPE_CUSTOM} inputs are rendered into {@code TYPE_INT_ARGB} (when the input has
 *         an alpha channel) or {@code TYPE_INT_RGB}, because a BufferedImage cannot be
 *         constructed from the {@code TYPE_CUSTOM} constant.
 */
public static BufferedImage flipImageVertically(BufferedImage input) {
    int width = input.getWidth();
    int height = input.getHeight();

    // new BufferedImage(w, h, TYPE_CUSTOM) throws IllegalArgumentException, so pick a standard type
    int type = input.getType();
    if (type == BufferedImage.TYPE_CUSTOM) {
        type = input.getColorModel().hasAlpha()
                ? BufferedImage.TYPE_INT_ARGB
                : BufferedImage.TYPE_INT_RGB;
    }
    BufferedImage flipped = new BufferedImage(width, height, type);

    // move the origin to the bottom edge, then mirror the y axis
    AffineTransform tran = AffineTransform.getTranslateInstance(0, height);
    tran.concatenate(AffineTransform.getScaleInstance(1d, -1d));

    Graphics2D g = flipped.createGraphics();
    try {
        g.drawImage(input, tran, null);
    } finally {
        g.dispose(); // release the graphics context even if drawing fails
    }
    return flipped;
}

 

/**
 * Simple holder for an image extracted from a PDF page: the index of the image
 * entry on the page, the image rectangle and the image itself.
 */
public class PDFImage {

    /** Index of the image char within its page. */
    private int index;

    /** Image coordinates as stored by the caller; not copied. */
    private float[] rect;

    /** The extracted image; stays null until one is set. */
    private BufferedImage image;

    /** @return the index of the image char within its page. */
    public int getImageIndex() {
        return this.index;
    }

    /** @param mImageIndex the index of the image char within its page. */
    public void setImageIndex(int mImageIndex) {
        this.index = mImageIndex;
    }

    /** @return the stored rectangle array (the same instance that was set), or null if none was set. */
    public float[] getImageRect() {
        return this.rect;
    }

    /** @param mImageRect the image coordinates; the array reference is stored as-is. */
    public void setImageRect(float[] mImageRect) {
        this.rect = mImageRect;
    }

    /** @return the extracted image, or null if none was set. */
    public BufferedImage getImage() {
        return this.image;
    }

    /** @param mImage the extracted image; may be null. */
    public void setImage(BufferedImage mImage) {
        this.image = mImage;
    }
}
Applies To

RadaeePDF Master SDK

Details

Created : 2020-10-01 09:52:19, Last Modified : 2021-05-27 13:35:09