[RESOLVED] pdf file search over string or text

Dear all,

I want to seach pdf files using asp.net C#.

if any pdf file content is "albert"  list that all files


or


search file on the basis of its content

thanks!

Hi umeshjha.net,

You can use the following code to open the pdf file, and find whether the pdf file contain the "albert" words.

using iTextSharp.text.pdf;
using iTextSharp.text;
  
private void openPDF()
{
            string str = "";
            string newFile = "c:\\New Document.pdf";
            Document doc = new Document();
  
            PdfReader reader = new PdfReader("c:\\New Document.pdf");
            for (int i = 1; i <= reader.NumberOfPages; i++)
            {
                byte[] bt = reader.GetPageContent(i);
  
                str += ExtractTextFromPDFBytes(bt);
  
            }
}
  
  
 private string ExtractTextFromPDFBytes(byte[] input)
        {
            if (input == null || input.Length == 0) return "";
  
            try
            {
                string resultString = "";
  
                // Flag showing if we are we currently inside a text object
                bool inTextObject = false;
  
                // Flag showing if the next character is literal
                // e.g. '\\' to get a '\' character or '\(' to get '('
                bool nextLiteral = false;
  
                // () Bracket nesting level. Text appears inside ()
                int bracketDepth = 0;
  
                // Keep previous chars to get extract numbers etc.:
                char[] previousCharacters = new char[_numberOfCharsToKeep];
                for (int j = 0; j < _numberOfCharsToKeep; j++) previousCharacters[j] = ' ';
  
  
                for (int i = 0; i < input.Length; i++)
                {
                    char c = (char)input[i];
  
                    if (inTextObject)
                    {
                        // Position the text
                        if (bracketDepth == 0)
                        {
                            if (CheckToken(new string[] { "TD", "Td" }, previousCharacters))
                            {
                                resultString += "\n\r";
                            }
                            else
                            {
                                if (CheckToken(new string[] { "'", "T*", "\"" }, previousCharacters))
                                {
                                    resultString += "\n";
                                }
                                else
                                {
                                    if (CheckToken(new string[] { "Tj" }, previousCharacters))
                                    {
                                        resultString += " ";
                                    }
                                }
                            }
                        }
  
                        // End of a text object, also go to a new line.
                        if (bracketDepth == 0 &&
                            CheckToken(new string[] { "ET" }, previousCharacters))
                        {
  
                            inTextObject = false;
                            resultString += " ";
                        }
                        else
                        {
                            // Start outputting text
                            if ((c == '(') && (bracketDepth == 0) && (!nextLiteral))
                            {
                                bracketDepth = 1;
                            }
                            else
                            {
                                // Stop outputting text
                                if ((c == ')') && (bracketDepth == 1) && (!nextLiteral))
                                {
                                    bracketDepth = 0;
                                }
                                else
                                {
                                    // Just a normal text character:
                                    if (bracketDepth == 1)
                                    {
                                        // Only print out next character no matter what.
                                        // Do not interpret.
                                        if (c == '\\' && !nextLiteral)
                                        {
                                            nextLiteral = true;
                                        }
                                        else
                                        {
                                            if (((c >= ' ') && (c <= '~')) ||
                                                ((c >= 128) && (c < 255)))
                                            {
                                                resultString += c.ToString();
                                            }
  
                                            nextLiteral = false;
                                        }
                                    }
                                }
                            }
                        }
                    }
  
                    // Store the recent characters for
                    // when we have to go back for a checking
                    for (int j = 0; j < _numberOfCharsToKeep - 1; j++)
                    {
                        previousCharacters[j] = previousCharacters[j + 1];
                    }
                    previousCharacters[_numberOfCharsToKeep - 1] = c;
  
                    // Start of a text object
                    if (!inTextObject && CheckToken(new string[] { "BT" }, previousCharacters))
                    {
                        inTextObject = true;
                    }
                }
                return resultString;
            }
            catch
            {
                return "";
            }
        }
  
 private bool CheckToken(string[] tokens, char[] recent)
   {
     foreach (string token in tokens)
       {
          if ((recent[_numberOfCharsToKeep - 3] == token[0]) &&
           (recent[_numberOfCharsToKeep - 2] == token[1]) &&
           ((recent[_numberOfCharsToKeep - 1] == ' ') ||
           (recent[_numberOfCharsToKeep - 1] == 0x0d) ||
           (recent[_numberOfCharsToKeep - 1] == 0x0a)) &&
           ((recent[_numberOfCharsToKeep - 4] == ' ') ||
           (recent[_numberOfCharsToKeep - 4] == 0x0d) ||
           (recent[_numberOfCharsToKeep - 4] == 0x0a))
                 )
           {
                    return true;
            }
            }
            return false;
        }


Please check the following link:

http://www.codeproject.com/KB/showcase/TallComponents.aspx

thoughts on "[RESOLVED] pdf file search over string or text"

Leave a Reply

Your email address will not be published. Required fields are marked *

You may use these HTML tags and attributes: <a href="" title=""> <abbr title=""> <acronym title=""> <b> <blockquote cite=""> <cite> <code> <del datetime=""> <em> <i> <q cite=""> <strike> <strong>