blog/ribw/build-your-own-pc/index.html (view raw)
1<!DOCTYPE html>
2<html>
3<head>
4<meta charset="utf-8" />
5<meta name="viewport" content="width=device-width, initial-scale=1" />
6<title>Build your own PC</title>
7<link rel="stylesheet" href="../css/style.css">
8</head>
9<body>
10<main>
11<p><em>…where PC obviously stands for Personal Crawler</em>.</p>
12<div class="date-created-modified">Created 2020-02-25<br>
13Modified 2020-03-18</div>
14<hr />
15<p>This post contains the source code for a very simple crawler written in Java. You can compile and run it on any file or directory, and it will calculate the frequency of all the words it finds.</p>
16<h2 class="title" id="source_code"><a class="anchor" href="#source_code">¶</a>Source code</h2>
17<p>Paste the following code in a new file called <code>Crawl.java</code>:</p>
18<pre><code>import java.io.*;
19import java.util.*;
20import java.util.regex.Matcher;
21import java.util.regex.Pattern;
22
/**
 * A very small "personal crawler": walks a file or directory tree, counts how often each
 * word occurs in the text files it finds, caches the result in {@code index.bin} and then
 * answers interactive word-frequency queries.
 */
class Crawl {
    /**
     * Regex used to tokenize the words from a line of text.
     * UNICODE_CHARACTER_CLASS makes \w match non-ASCII letters too; without it a word is
     * split at every accented letter and normalize() never gets to fold the accent.
     */
    private final static Pattern WORDS = Pattern.compile("\\w+", Pattern.UNICODE_CHARACTER_CLASS);

    // The file where we will cache our results
    private final static File INDEX_FILE = new File("index.bin");

    // Lowercase extensions (leading dot included) of the files we treat as text
    private final static String[] TEXT_EXTENSIONS = {
        ".txt", ".java", ".c", ".cpp", ".h", ".hpp", ".html", ".css", ".js"
    };

    /**
     * Helper method to determine if a file is a text file or not.
     * The decision is based only on the (case-insensitive) file name extension.
     */
    private static boolean isTextFile(File file) {
        // Locale.ROOT keeps the lowercasing independent of the user's locale
        String name = file.getName().toLowerCase(Locale.ROOT);
        for (String extension : TEXT_EXTENSIONS) {
            if (name.endsWith(extension)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Normalizes a string by converting it to lowercase and removing the acute accent
     * from the five vowels. Any other character is left untouched.
     */
    private static String normalize(String string) {
        return string.toLowerCase(Locale.ROOT)
                .replace("á", "a")
                .replace("é", "e")
                .replace("í", "i")
                .replace("ó", "o")
                .replace("ú", "u");
    }

    /**
     * Fills the map with the count of words found on all the text files reachable from
     * {@code root}, which may be a single file or a directory.
     *
     * @param map  accumulator of {word: count}; existing entries are incremented
     * @param root file or directory where the traversal starts
     * @throws IOException if a text file cannot be read
     */
    static void fillWordMap(Map<String, Integer> map, File root) throws IOException {
        // Our file queue begins with the root
        Queue<File> fileQueue = new ArrayDeque<>();
        fileQueue.add(root);

        // For as long as the queue is not empty...
        File file;
        while ((file = fileQueue.poll()) != null) {
            if (!file.exists() || !file.canRead()) {
                // ...ignore files for which we don't have permission...
                System.err.println("warning: cannot read file: " + file);
            } else if (file.isDirectory()) {
                // ...else if it's a directory, extend our queue with its files...
                File[] files = file.listFiles();
                if (files == null) {
                    // listFiles() returns null (not an empty array) on I/O errors
                    System.err.println("warning: cannot list dir: " + file);
                } else {
                    fileQueue.addAll(Arrays.asList(files));
                }
            } else if (isTextFile(file)) {
                // ...otherwise, count the words in the file.
                countWordsInFile(map, file);
            }
        }
    }

    /**
     * Counts the words in a single file and adds the count to the map.
     * The file is decoded as UTF-8 and every token is normalized before being counted.
     *
     * @param map  accumulator of {word: count}; existing entries are incremented
     * @param file the file to tokenize
     * @throws IOException if the file cannot be opened or read
     */
    public static void countWordsInFile(Map<String, Integer> map, File file) throws IOException {
        // Explicit charset: FileReader would use the platform default, making the index
        // depend on the machine. try-with-resources closes the reader even if reading fails.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                new FileInputStream(file), java.nio.charset.StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                Matcher matcher = WORDS.matcher(line);
                while (matcher.find()) {
                    String token = normalize(matcher.group());
                    Integer count = map.get(token);
                    map.put(token, count == null ? 1 : count + 1);
                }
            }
        }
    }

    /**
     * Prints the map of word count to the desired output stream, one
     * {@code word<TAB>count} line per word, sorted by word.
     *
     * @param map    the {word: count} map to print
     * @param writer where the lines are written
     */
    public static void printWordMap(Map<String, Integer> map, PrintStream writer) {
        List<String> keys = new ArrayList<>(map.keySet());
        Collections.sort(keys);
        for (String key : keys) {
            writer.println(key + "\t" + map.get(key));
        }
    }

    /**
     * Loads the {word: count} map from the index file if it exists; otherwise builds it
     * by crawling {@code root} and caches it in the index file.
     * Note that an existing index file wins: {@code root} is not consulted in that case.
     */
    @SuppressWarnings("unchecked") // the index file is only ever written by this class
    private static Map<String, Integer> loadOrBuildIndex(File root)
            throws IOException, ClassNotFoundException {
        if (INDEX_FILE.isFile()) {
            System.err.println("Found existing index file: " + INDEX_FILE);
            try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(INDEX_FILE))) {
                return (Map<String, Integer>) ois.readObject();
            }
        }

        System.err.println("Index file not found: " + INDEX_FILE + "; indexing...");
        Map<String, Integer> map = new TreeMap<>();
        fillWordMap(map, root);
        // Cache the results to avoid doing the work again next time
        try (ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(INDEX_FILE))) {
            out.writeObject(map);
        }
        return map;
    }

    /**
     * Asks the user in a loop for words and prints how many times each one is present.
     * Stops on an empty line or when the input is exhausted.
     */
    private static void runQueryLoop(Map<String, Integer> map) {
        Scanner scanner = new Scanner(System.in);
        while (true) {
            System.out.print("Escriba palabra a consultar (o Enter para salir): ");
            System.out.flush();
            // End of input (Ctrl-D, piped stdin) must end the loop, not throw
            if (!scanner.hasNextLine()) {
                break;
            }
            String line = scanner.nextLine().trim();
            if (line.isEmpty()) {
                break;
            }

            line = normalize(line);
            Integer count = map.get(line);
            if (count == null) {
                System.out.println(String.format("La palabra \"%s\" no está presente", line));
            } else if (count == 1) {
                System.out.println(String.format("La palabra \"%s\" está presente 1 vez", line));
            } else {
                System.out.println(String.format("La palabra \"%s\" está presente %d veces", line, count));
            }
        }
    }

    /**
     * Entry point. Usage: {@code java Crawl [input]}, where {@code input} is the file or
     * directory to index (the current directory when omitted).
     *
     * @throws IOException            if indexing or reading/writing the index file fails
     * @throws ClassNotFoundException if the index file cannot be deserialized
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException {
        // Validate arguments
        if (args.length == 1 && args[0].equals("--help")) {
            System.err.println("usage: java Crawl [input]");
            return;
        }

        File root = new File(args.length > 0 ? args[0] : ".");

        // Loading or generating the map where we aggregate the data {word: count}
        Map<String, Integer> map = loadOrBuildIndex(root);

        // Ask the user in a loop to query for words
        runQueryLoop(map);
    }
}
160</code></pre>
161<p>It can be compiled and executed as follows:</p>
162<pre><code>javac Crawl.java
163java Crawl
164</code></pre>
165<p>Instead of copy-pasting the code, you may also download it as a <code>.zip</code>:</p>
166<p><em>(contents removed)</em></p>
167<h2 id="addendum"><a class="anchor" href="#addendum">¶</a>Addendum</h2>
168<p>The following simple function can be used if one desires to print the contents of a file:</p>
<pre><code>// Dumps a text file to standard output, preceded by an empty line and the file's name.
// Files that isTextFile() rejects are skipped without any output.
public static void printFile(File file) {
    if (!isTextFile(file)) {
        return;
    }

    // The header goes out before the file is opened, so it appears even if opening fails
    System.out.println("\n" + file.getName());
    try (BufferedReader in = new BufferedReader(new FileReader(file))) {
        for (String text = in.readLine(); text != null; text = in.readLine()) {
            System.out.println(text);
        }
    } catch (FileNotFoundException missing) {
        System.err.println("warning: file disappeared while reading: " + file);
    } catch (IOException failure) {
        failure.printStackTrace();
    }
}
184</code></pre>
185</main>
186</body>
187</html>
188