content/blog/ribw/build-your-own-pc/index.md (view raw)
1+++
2title = "Build your own PC"
3date = 2020-02-25T02:00:12+00:00
4updated = 2020-03-18T09:38:46+00:00
5+++
6
7_…where PC obviously stands for Personal Crawler_.
8
9----------
10
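This post contains the source code for a very simple crawler written in Java. You can compile and run it on any file or directory: it counts how often each word occurs in the text files it finds (recognized by their extension), caches the result in `index.bin`, and then lets you query the frequency of individual words.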
12
13## Source code
14
15Paste the following code in a new file called `Crawl.java`:
16
17```
18import java.io.*;
19import java.util.*;
20import java.util.regex.Matcher;
21import java.util.regex.Pattern;
22
// A tiny "personal crawler": walks a file or directory tree, counts how often
// each word occurs in the text files it finds, caches the result on disk and
// then answers interactive word-frequency queries on standard input.
//
// This source is kept ASCII-only on purpose (non-ASCII characters are written
// as Unicode escapes) so that it compiles identically whatever source encoding
// javac assumes.
class Crawl {
    // Regex used to tokenize the words from a line of text.
    // UNICODE_CHARACTER_CLASS makes \w match non-ASCII letters as well; without
    // it an accented letter ends the token, so a word such as "cancion" written
    // with an accented "o" would be split into "canci" and "n".
    private final static Pattern WORDS =
            Pattern.compile("\\w+", Pattern.UNICODE_CHARACTER_CLASS);

    // The file where we will cache our results (relative to the working directory)
    private final static File INDEX_FILE = new File("index.bin");

    // File name suffixes that mark a file as text worth indexing
    private final static String[] TEXT_EXTENSIONS = {
        ".txt", ".java", ".c", ".cpp", ".h", ".hpp", ".html", ".css", ".js"
    };

    // Helper method to determine if a file is a text file or not.
    // The decision is based only on the (case-insensitive) file name suffix.
    private static boolean isTextFile(File file) {
        // Locale.ROOT keeps the case mapping independent of the user's locale
        String name = file.getName().toLowerCase(Locale.ROOT);
        for (String extension : TEXT_EXTENSIONS) {
            if (name.endsWith(extension)) {
                return true;
            }
        }
        return false;
    }

    // Normalizes a string by converting it to lowercase and replacing the five
    // acute-accented vowels (U+00E1, U+00E9, U+00ED, U+00F3, U+00FA) with their
    // plain counterparts. Other diacritics are left untouched.
    private static String normalize(String string) {
        return string.toLowerCase(Locale.ROOT)
                .replace('\u00e1', 'a')
                .replace('\u00e9', 'e')
                .replace('\u00ed', 'i')
                .replace('\u00f3', 'o')
                .replace('\u00fa', 'u');
    }

    // Fills the map with the count of words found on all the text files that
    // are reachable from root (root itself may be a single file or a directory).
    //
    // The tree is walked iteratively with a queue instead of recursion.
    // Unreadable or missing entries are reported on stderr and skipped; files
    // that are not text files (see isTextFile) are skipped silently.
    //
    // Throws IOException if reading one of the text files fails.
    static void fillWordMap(Map<String, Integer> map, File root) throws IOException {
        // Our file queue begins with the root
        Queue<File> pending = new ArrayDeque<>();
        pending.add(root);

        // For as long as the queue is not empty...
        File file;
        while ((file = pending.poll()) != null) {
            if (!file.exists() || !file.canRead()) {
                // ...ignore files for which we don't have permission...
                System.err.println("warning: cannot read file: " + file);
            } else if (file.isDirectory()) {
                // ...else if it's a directory, extend our queue with its files...
                File[] children = file.listFiles();
                if (children == null) {
                    // listFiles() returns null when the directory cannot be listed
                    System.err.println("warning: cannot list dir: " + file);
                } else {
                    pending.addAll(Arrays.asList(children));
                }
            } else if (isTextFile(file)) {
                // ...otherwise, count the words in the file.
                countWordsInFile(map, file);
            }
        }
    }

    // Counts the words in a single file and adds the count to the map.
    //
    // Every token matched by WORDS is normalized (see normalize) and its
    // counter in the map is incremented, starting at 1 for unseen words.
    // The file is decoded as UTF-8 regardless of the platform default charset,
    // so the same input produces the same index on every machine.
    //
    // Throws IOException if the file cannot be opened or read.
    public static void countWordsInFile(Map<String, Integer> map, File file) throws IOException {
        // try-with-resources closes the reader even when readLine() throws
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                new FileInputStream(file), java.nio.charset.StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                Matcher matcher = WORDS.matcher(line);
                while (matcher.find()) {
                    String token = normalize(matcher.group());
                    Integer count = map.get(token);
                    map.put(token, count == null ? 1 : count + 1);
                }
            }
        }
    }

    // Prints the map of word count to the desired output stream, one
    // "word<TAB>count" line per entry, with the words in natural sort order.
    public static void printWordMap(Map<String, Integer> map, PrintStream writer) {
        List<String> keys = new ArrayList<>(map.keySet());
        Collections.sort(keys);
        for (String key : keys) {
            writer.println(key + "\t" + map.get(key));
        }
    }

    // Loads the cached {word: count} index from INDEX_FILE when it exists;
    // otherwise crawls root, stores the fresh index in INDEX_FILE and returns it.
    //
    // NOTE: an existing index is reused as-is, whatever root is; delete
    // index.bin to force re-indexing.
    // NOTE(review): this uses Java native deserialization, which must only be
    // applied to trusted data -- make sure index.bin cannot be supplied by
    // someone else.
    @SuppressWarnings("unchecked") // the index is only ever written by this class
    private static Map<String, Integer> loadOrBuildIndex(File root)
            throws IOException, ClassNotFoundException {
        if (INDEX_FILE.isFile()) {
            System.err.println("Found existing index file: " + INDEX_FILE);
            try (ObjectInputStream in = new ObjectInputStream(new FileInputStream(INDEX_FILE))) {
                return (Map<String, Integer>) in.readObject();
            }
        }

        System.err.println("Index file not found: " + INDEX_FILE + "; indexing...");
        Map<String, Integer> map = new TreeMap<>();
        fillWordMap(map, root);
        // Cache the results to avoid doing the work a next time
        try (ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(INDEX_FILE))) {
            out.writeObject(map);
        }
        return map;
    }

    // Asks the user in a loop for words and prints how often each one occurs.
    // The loop ends on an empty line or when standard input reaches end-of-file.
    private static void runQueryLoop(Map<String, Integer> map) {
        Scanner scanner = new Scanner(System.in);
        while (true) {
            System.out.print("Escriba palabra a consultar (o Enter para salir): ");
            System.out.flush();
            // Without this check nextLine() throws NoSuchElementException on
            // EOF (Ctrl-D, or piped input that runs out)
            if (!scanner.hasNextLine()) {
                break;
            }
            String line = scanner.nextLine().trim();
            if (line.isEmpty()) {
                break;
            }

            // Queries go through the same normalization as the indexed words.
            // "est\u00e1" below is "esta" with an acute accent on the "a".
            line = normalize(line);
            Integer count = map.get(line);
            if (count == null) {
                System.out.printf("La palabra \"%s\" no est\u00e1 presente%n", line);
            } else if (count == 1) {
                System.out.printf("La palabra \"%s\" est\u00e1 presente 1 vez%n", line);
            } else {
                System.out.printf("La palabra \"%s\" est\u00e1 presente %d veces%n", line, count);
            }
        }
    }

    // Entry point. Usage: java Crawl [input]
    // input is the file or directory to index and defaults to the current
    // directory; "--help" as the only argument prints the usage and exits.
    public static void main(String[] args) throws IOException, ClassNotFoundException {
        // Validate arguments
        if (args.length == 1 && args[0].equals("--help")) {
            System.err.println("usage: java Crawl [input]");
            return;
        }

        File root = new File(args.length > 0 ? args[0] : ".");

        // Loading or generating the map where we aggregate the data {word: count}
        Map<String, Integer> map = loadOrBuildIndex(root);

        // Ask the user in a loop to query for words
        runQueryLoop(map);
    }
}
160```
161
162It can be compiled and executed as follows:
163
164```
165javac Crawl.java
166java Crawl
167```
168
169Instead of copy-pasting the code, you may also download it as a `.zip`:
170
171*(contents removed)*
172
173## Addendum
174
175The following simple function can be used if one desires to print the contents of a file:
176
177```
178public static void printFile(File file) {
179 if (isTextFile(file)) {
180 System.out.println('\n' + file.getName());
181 try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
182 String line;
183 while ((line = reader.readLine()) != null) {
184 System.out.println(line);
185 }
186 } catch (FileNotFoundException ignored) {
187 System.err.println("warning: file disappeared while reading: " + file);
188 } catch (IOException e) {
189 e.printStackTrace();
190 }
191 }
192}
193```