all repos — gemini-redirect @ fcd5db0bbb4bb9fe310c1507e91664de87765514

blog/ribw/build-your-own-pc/index.html (view raw)

  1<!DOCTYPE html>
  2<html>
  3<head>
  4<meta charset="utf-8" />
  5<meta name="viewport" content="width=device-width, initial-scale=1" />
  6<title>Build your own PC</title>
  7<link rel="stylesheet" href="../css/style.css">
  8</head>
  9<body>
 10<main>
 11<p><em>…where PC obviously stands for Personal Crawler</em>.</p>
 12<div class="date-created-modified">Created 2020-02-25<br>
 13Modified 2020-03-18</div>
 14<hr />
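 15<p>This post contains the source code for a very simple crawler written in Java. You can compile and run it on any file or directory: it counts how often each word appears in the text files it finds (recognised by their extension), caches the result in <code>index.bin</code>, and then lets you look up individual words interactively.</p>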
 16<h2 class="title" id="source_code"><a class="anchor" href="#source_code">¶</a>Source code</h2>
 17<p>Paste the following code in a new file called <code>Crawl.java</code>:</p>
 18<pre><code>import java.io.*;
 19import java.util.*;
 20import java.util.regex.Matcher;
 21import java.util.regex.Pattern;
 22
 23class Crawl {
 24	// Regex used to tokenize the words from a line of text
 25	private final static Pattern WORDS = Pattern.compile(&quot;\\w+&quot;);
 26
 27	// The file where we will cache our results
 28	private final static File INDEX_FILE = new File(&quot;index.bin&quot;);
 29
 30	// Helper method to determine if a file is a text file or not
 31	private static boolean isTextFile(File file) {
 32		String name = file.getName().toLowerCase();
 33		return name.endsWith(&quot;.txt&quot;)
 34				|| name.endsWith(&quot;.java&quot;)
 35				|| name.endsWith(&quot;.c&quot;)
 36				|| name.endsWith(&quot;.cpp&quot;)
 37				|| name.endsWith(&quot;.h&quot;)
 38				|| name.endsWith(&quot;.hpp&quot;)
 39				|| name.endsWith(&quot;.html&quot;)
 40				|| name.endsWith(&quot;.css&quot;)
 41				|| name.endsWith(&quot;.js&quot;);
 42	}
 43
 44	// Normalizes a string by converting it to lowercase and removing accents
 45	private static String normalize(String string) {
 46		return string.toLowerCase()
 47				.replace(&quot;á&quot;, &quot;a&quot;)
 48				.replace(&quot;é&quot;, &quot;e&quot;)
 49				.replace(&quot;í&quot;, &quot;i&quot;)
 50				.replace(&quot;ó&quot;, &quot;o&quot;)
 51				.replace(&quot;ú&quot;, &quot;u&quot;);
 52	}
 53
 54	// Recursively fills the map with the count of words found on all the text files
 55	static void fillWordMap(Map&lt;String, Integer&gt; map, File root) throws IOException {
 56		// Our file queue begins with the root
 57		Queue&lt;File&gt; fileQueue = new ArrayDeque&lt;&gt;();
 58		fileQueue.add(root);
 59
 60		// For as long as the queue is not empty...
 61		File file;
 62		while ((file = fileQueue.poll()) != null) {
 63			if (!file.exists() || !file.canRead()) {
 64				// ...ignore files for which we don't have permission...
 65				System.err.println(&quot;warning: cannot read file: &quot; + file);
 66			} else if (file.isDirectory()) {
 67				// ...else if it's a directory, extend our queue with its files...
 68				File[] files = file.listFiles();
 69				if (files == null) {
 70					System.err.println(&quot;warning: cannot list dir: &quot; + file);
 71				} else {
 72					fileQueue.addAll(Arrays.asList(files));
 73				}
 74			} else if (isTextFile(file)) {
 75				// ...otherwise, count the words in the file.
 76				countWordsInFile(map, file);
 77			}
 78		}
 79	}
 80
 81	// Counts the words in a single file and adds the count to the map.
 82	public static void countWordsInFile(Map&lt;String, Integer&gt; map, File file) throws IOException {
 83		BufferedReader reader = new BufferedReader(new FileReader(file));
 84
 85		String line;
 86		while ((line = reader.readLine()) != null) {
 87			Matcher matcher = WORDS.matcher(line);
 88			while (matcher.find()) {
 89				String token = normalize(matcher.group());
 90				Integer count = map.get(token);
 91				if (count == null) {
 92					map.put(token, 1);
 93				} else {
 94					map.put(token, count + 1);
 95				}
 96			}
 97		}
 98
 99		reader.close();
100	}
101
102	// Prints the map of word count to the desired output stream.
103	public static void printWordMap(Map&lt;String, Integer&gt; map, PrintStream writer) {
104		List&lt;String&gt; keys = new ArrayList&lt;&gt;(map.keySet());
105		Collections.sort(keys);
106		for (String key : keys) {
107			writer.println(key + &quot;\t&quot; + map.get(key));
108		}
109	}
110
111	@SuppressWarnings(&quot;unchecked&quot;)
112	public static void main(String[] args) throws IOException, ClassNotFoundException {
113		// Validate arguments
114		if (args.length == 1 &amp;&amp; args[0].equals(&quot;--help&quot;)) {
115			System.err.println(&quot;usage: java Crawl [input]&quot;);
116			return;
117		}
118
119		File root = new File(args.length &gt; 0 ? args[0] : &quot;.&quot;);
120
121		// Loading or generating the map where we aggregate the data  {word: count}
122		Map&lt;String, Integer&gt; map;
123		if (INDEX_FILE.isFile()) {
124			System.err.println(&quot;Found existing index file: &quot; + INDEX_FILE);
125			try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(INDEX_FILE))) {
126				map = (Map&lt;String, Integer&gt;) ois.readObject();
127			}
128		} else {
129			System.err.println(&quot;Index file not found: &quot; + INDEX_FILE + &quot;; indexing...&quot;);
130			map = new TreeMap&lt;&gt;();
131			fillWordMap(map, root);
132			// Cache the results to avoid doing the work a next time
133			try (ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(INDEX_FILE))) {
134				out.writeObject(map);
135			}
136		}
137
138		// Ask the user in a loop to query for words
139		Scanner scanner = new Scanner(System.in);
140		while (true) {
141			System.out.print(&quot;Escriba palabra a consultar (o Enter para salir): &quot;);
142			System.out.flush();
143			String line = scanner.nextLine().trim();
144			if (line.isEmpty()) {
145				break;
146			}
147
148			line = normalize(line);
149			Integer count = map.get(line);
150			if (count == null) {
151				System.out.println(String.format(&quot;La palabra \&quot;%s\&quot; no está presente&quot;, line));
152			} else if (count == 1) {
153				System.out.println(String.format(&quot;La palabra \&quot;%s\&quot; está presente 1 vez&quot;, line));
154			} else {
155				System.out.println(String.format(&quot;La palabra \&quot;%s\&quot; está presente %d veces&quot;, line, count));
156			}
157		}
158	}
159}
160</code></pre>
161<p>It can be compiled and executed as follows:</p>
162<pre><code>javac Crawl.java
163java Crawl
164</code></pre>
165<p>Instead of copy-pasting the code, you may also download it as a <code>.zip</code>:</p>
166<p><em>(contents removed)</em></p>
167<h2 id="addendum"><a class="anchor" href="#addendum">¶</a>Addendum</h2>
168<p>The following simple function can be used if one desires to print the contents of a file:</p>
169<pre><code>public static void printFile(File file) {
	// Prints a line break, the file name, and then every line of the file to standard output.
	// Files that isTextFile rejects are skipped without any message.
170	if (isTextFile(file)) {
171		System.out.println('\n' + file.getName());
		// try-with-resources closes the reader on every exit path.
		// FileReader(File) decodes the bytes with the JVM default charset.
172		try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
173			String line;
174			while ((line = reader.readLine()) != null) {
175				System.out.println(line);
176			}
177		} catch (FileNotFoundException ignored) {
			// Despite the variable name the failure is reported: a warning goes to standard error.
			// NOTE(review): FileReader also throws this when the path is a directory or cannot
			// be opened for another reason, so the wording of the warning may not always fit.
178			System.err.println(&quot;warning: file disappeared while reading: &quot; + file);
179		} catch (IOException e) {
			// Any other I/O failure only dumps the stack trace; the rest of the file is not printed.
180			e.printStackTrace();
181		}
182	}
183}
184</code></pre>
185</main>
186</body>
187</html>
188