Home arrow static arrow Java Programming [Archive] - can Java be used to parse Microsoft Word(.doc) files?
Warning: Creating default object from empty value in /www/htdocs/w008deb8/wiki/components/com_staticxt/staticxt.php on line 51
Java Programming [Archive] - can Java be used to parse Microsoft Word(.doc) files?
5 Duke Stars available
This topic has 2 replies on 1 page.

Posts:26
Registered: 6/14/04
can Java be used to parse Microsoft Word(.doc) files?  
Jun 30, 2004 10:49 PM



 
Hi guys ,
I want to know whether Java can be used to parse Microsoft Word (.doc) files, for example to search for a string or to check for grammatical errors.
Thanks in advance.
Avichal
 

Posts:2,909
Registered: 13.8.2003
Re: can Java be used to parse Microsoft Word(.doc) files?  
Jun 30, 2004 11:04 PM (reply 1 of 2)



 
In theory yes, in practice it would be a lot of work (besides Word does a pretty good job of finding grammatical errors already).
 

Posts:55
Registered: 6/12/04
Re: can Java be used to parse Microsoft Word(.doc) files?  
Jun 30, 2004 11:23 PM (reply 2 of 2)



 
Hey man, anything and everything can be done these days.

About your question: a .doc file is like any other normal text file, with some extra features, extra character support and other stuff.

If you ignore those parts and treat it as a normal text file, then it's a much simpler job.

Here is some code that searches for the keywords in all the doc, txt, pdf and html files

in the given folder and its subfolders. It is written as a servlet, but you can change it into a normal program.

It first checks each file to see whether it is a doc, pdf, html or txt file; if so, it reads the file,

stores the contents in a vector, parses the vector for the search string and displays the result.

Along with the results, the code below also displays the time taken and the number of times the search string was found in each document.

 import java.io.*;import java.util.*;import java.net.*;import javax.servlet.*;import javax.servlet.http.*; public class search_local extends HttpServlet{	public void service( HttpServletRequest _req, HttpServletResponse _res ) throws ServletException, IOException	{		long startTime = System.currentTimeMillis();						File RootDir	= new File( _req.getRealPath( "/docs/" ) );		if ( RootDir.isDirectory() == false )		{			System.out.println( "Invalid directory" );			_res.setStatus( HttpServletResponse.SC_NO_CONTENT );			return;		} 		Vector kList = new Vector( 3 );		StringTokenizer st = new StringTokenizer( _req.getParameter( "search_text" ), "+" );		while ( st.hasMoreTokens() )			kList.addElement( st.nextToken().trim() ); 		//- Run through list		Vector toBeDone	= new Vector( 10 );		Vector found	= new Vector( 10 ); 		String dir[] = RootDir.list( new htmlFilter() );		cDirInfo tX = new cDirInfo( RootDir, dir ); 		toBeDone.addElement( tX );		while (  toBeDone.isEmpty() == false )		{			tX = (cDirInfo)toBeDone.firstElement();						try			{				int x = 0;				for ( ;; )				{					File newFile = new File( tX.rootDir, tX.dirList[x] );					if ( newFile.isDirectory() )					{						File t = new File( tX.rootDir, tX.dirList[x] );						String a[] = newFile.list( new htmlFilter() );  						toBeDone.addElement( new cDirInfo( t, a ) );					}					else					{						int freq = searchFile( kList, newFile );						if ( freq != 0 )							found.addElement( new cPage( freq, newFile ) );											} 					x++;				}			}			catch( ArrayIndexOutOfBoundsException E ){}			toBeDone.removeElementAt(0);			dir	= null;		}		long totalTime = System.currentTimeMillis()	- startTime;		formatResults( found, kList, totalTime, _req.getRealPath( "/docs" ), _res );	} 	//---- 	private void formatResults( Vector _fList, Vector _kList, long time, String _root, HttpServletResponse _res ) throws IOException	{	   	_res.setContentType("text/html");		PrintWriter Out = new PrintWriter( _res.getOutputStream() ); 		Out.println( 
"<HTML><HEAD><TITLE>Search results</TITLE></HEAD>" );		Out.println( "<BODY><H3>Search Results</H3><BR>" );				Out.println( "Keywords:<B> " );		Enumeration E = _kList.elements();		while ( E.hasMoreElements() )			Out.println( (String)E.nextElement() + " : " ); 		Out.println( "</B><BR><BR><CENTER><HR WIDTH=100%></CENTER><BR>" );		E = _fList.elements();		cPage sPage;		String link;		while ( E.hasMoreElements() )		{			sPage = (cPage)E.nextElement();			link  = sPage.cFile.toString();			link  = "http://localhost/BugFix/docs/" + link.substring( link.indexOf( _root )+_root.length(), link.length() );			Out.println( "<FONT SIZE=+1><A HREF=" + link + ">" + sPage.cFile.getName() + "</A></FONT>" );			Out.println( "<FONT SIZE=-2>(" + sPage.freq + ")</FONT><BR>" );		} 		if ( _fList.size() == 0 )			Out.println( "<I><B>No sites found!</I></B><BR>"); 		Out.println( "<BR><CENTER><HR WIDTH=100%></CENTER>" );		Out.println( "<BR><FONT SIZE=-1>Time to complete: " + ((double)time/1000) + " seconds</FONT>" );		Out.println( "</BODY></HTML>" );		Out.flush();	} 	//---- 	private int searchFile( Vector _klist, File _filename )	{		//- Links the file		int	frequency=0;		try		{			DataInputStream In	= new DataInputStream( new FileInputStream( _filename ) );			String LineIn, token;			boolean bValid = true;			Enumeration E;			cLineParse lp; 			while ( (LineIn = In.readLine()) != null )			{				lp = new cLineParse( LineIn.toUpperCase() ); 				while ( (token=lp.nextToken()) != "" )				{					if ( token.indexOf( "<"<A" ) != -1 ||						 token.indexOf( "<HE" ) != -1 ||						 token.indexOf( "<APP" ) != -1 ||						 token.indexOf( "<SER" ) != -1 ||						 token.indexOf( "<TEX" ) != -1  ))						bValid  = false;					else if (	token.indexOf( "<" ) != -1 && (								token.indexOf( "</A" ) != -1 ||								token.indexOf( "</HE" ) != -1 ||								token.indexOf( "</APP" ) != -1 ||								token.indexOf( "</SER" ) != -1 ||								token.indexOf( "</TEX" ) != -1  ))						bValid  = true;					else if ( bValid )					{						E = 
_klist.elements();						String key;						while ( E.hasMoreElements() )						{							key	= ((String)E.nextElement()).toUpperCase();							if ( token.indexOf( key ) != -1 )								frequency++;						}					}				} 			} 			In.close();		}		catch( IOException E ){} 		return frequency;	}} class cPage extends Object{	public int	freq;	public File cFile; 	public cPage( int _freq, File _cFile )	{		freq = _freq;		cFile = _cFile;	}}  //- End of file //----- Supporting classes class htmlFilter implements FilenameFilter{	public boolean accept(File dir, String name)	{		File tF	= new File( dir, name ); 		if ( tF.isDirectory() )			return trueint indx = name.lastIndexOf( "." );		if ( indx == -1 )			return false; 		String Ext = name.substring( indx+1, name.length() ).toLowerCase(); 		if ( Ext.equals( "html" ) ||			 Ext.equals( "pdf" ) ||			 Ext.equals( "txt" ) ||			 Ext.equals( "doc" ) )			 return true;				return false;	}} //--- class cDirInfo{	public File	rootDir;	public String[] dirList; 	public cDirInfo( File _r, String[] _d )	{		rootDir	= _r;		dirList = _d;	}} class cLineParse{	String L; 	public cLineParse( String _s )	{		L = _s;	} 	public String nextToken()	{		String ns="";		boolean bStart = falsefor ( int x=0; x < L.length(); x++ )		{			if ( L.charAt(x) == '<' && ns.length() != 0 )			{				L = L.substring( x, L.length() );				return ns;			}			else if ( L.charAt(x) == '<' )			{				ns	= ns + L.charAt( x );				bStart = true;			}			else if ( L.charAt(x) == '>' ||					  L.charAt(x) == '\r' ||				 ( L.charAt(x) == ' ' && bStart == false ) )			{				ns	= ns + L.charAt( x );				L = L.substring( x+1, L.length() );				return ns;			}			else				ns	= ns + L.charAt( x );		}				L = "";		return ns;	}}   
 
This topic has 2 replies on 1 page.