blog/ribw/build-your-own-pc/index.html (view raw)
1<!DOCTYPE html><html lang=en><head><meta charset=utf-8><meta name=description content="Official Lonami's website"><meta name=viewport content="width=device-width, initial-scale=1.0, user-scalable=yes"><title> Build your own PC | Lonami's Blog </title><link rel=stylesheet href=/style.css><body><article><nav class=sections><ul class=left><li><a href=/>lonami's site</a><li><a href=/blog class=selected>blog</a><li><a href=/golb>golb</a></ul><div class=right><a href=https://github.com/LonamiWebs><img src=img/github.svg alt=github></a><a href=/blog/atom.xml><img src=/img/rss.svg alt=rss></a></div></nav><main><h1 class=title>Build your own PC</h1><div class=time><p>2020-02-25T02:00:12+00:00<p>last updated 2020-03-18T09:38:46+00:00</div><p><em>…where PC obviously stands for Personal Crawler</em>.<hr><p>This post contains the source code for a very simple crawler written in Java. You can compile and run it on any file or directory, and it will calculate the frequency of all the words it finds.<h2 id=source-code>Source code</h2><p>Paste the following code in a new file called <code>Crawl.java</code>:<pre><code>import java.io.*;
2import java.util.*;
3import java.util.regex.Matcher;
4import java.util.regex.Pattern;
5
/**
 * A very small "personal crawler": walks a file or directory, counts how often each word
 * appears in the text files it finds, caches the result in {@code index.bin} and then lets
 * the user query word frequencies interactively.
 */
class Crawl {
    // Regex used to tokenize the words from a line of text.
    // UNICODE_CHARACTER_CLASS makes \w match accented letters as well; without it a word
    // such as "canción" is split into "canci" and "n" before normalize() can strip the accent.
    private static final Pattern WORDS = Pattern.compile("\\w+", Pattern.UNICODE_CHARACTER_CLASS);

    // The file where we will cache our results
    private static final File INDEX_FILE = new File("index.bin");

    // Lower-case file name suffixes that we consider to be text files
    private static final String[] TEXT_EXTENSIONS = {
        ".txt", ".java", ".c", ".cpp", ".h", ".hpp", ".html", ".css", ".js"
    };

    /**
     * Helper method to determine if a file is a text file or not, judging only by the
     * (case-insensitive) extension of its name.
     */
    private static boolean isTextFile(File file) {
        // Locale.ROOT keeps the comparison independent of the user's locale
        String name = file.getName().toLowerCase(Locale.ROOT);
        for (String extension : TEXT_EXTENSIONS) {
            if (name.endsWith(extension)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Normalizes a string by converting it to lowercase and removing the acute accent
     * from the five vowels. Other characters are left untouched.
     */
    private static String normalize(String string) {
        return string.toLowerCase(Locale.ROOT)
                .replace("á", "a")
                .replace("é", "e")
                .replace("í", "i")
                .replace("ó", "o")
                .replace("ú", "u");
    }

    /**
     * Fills the map with the count of words found on all the text files reachable from
     * {@code root}, which may be a single file or a directory that is traversed breadth-first.
     * Unreadable files and directories are reported on stderr and skipped.
     *
     * @param map  the map {word: count} to update in place
     * @param root the file or directory where the traversal starts
     * @throws IOException if reading one of the text files fails
     */
    static void fillWordMap(Map<String, Integer> map, File root) throws IOException {
        // Our file queue begins with the root
        Queue<File> fileQueue = new ArrayDeque<>();
        fileQueue.add(root);

        // For as long as the queue is not empty...
        File file;
        while ((file = fileQueue.poll()) != null) {
            if (!file.exists() || !file.canRead()) {
                // ...ignore files for which we don't have permission...
                System.err.println("warning: cannot read file: " + file);
            } else if (file.isDirectory()) {
                // ...else if it's a directory, extend our queue with its files...
                File[] files = file.listFiles();
                if (files == null) {
                    // listFiles() returns null (not an empty array) on I/O errors
                    System.err.println("warning: cannot list dir: " + file);
                } else {
                    fileQueue.addAll(Arrays.asList(files));
                }
            } else if (isTextFile(file)) {
                // ...otherwise, count the words in the file.
                countWordsInFile(map, file);
            }
        }
    }

    /**
     * Counts the words in a single file and adds the count to the map. The file is decoded
     * as UTF-8 so that the resulting index does not depend on the platform's default charset.
     *
     * @param map  the map {word: count} to update in place
     * @param file the text file to read
     * @throws IOException if the file cannot be opened or read
     */
    public static void countWordsInFile(Map<String, Integer> map, File file) throws IOException {
        // try-with-resources guarantees the reader is closed even if readLine() throws
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                new FileInputStream(file), java.nio.charset.StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                Matcher matcher = WORDS.matcher(line);
                while (matcher.find()) {
                    String token = normalize(matcher.group());
                    Integer count = map.get(token);
                    map.put(token, count == null ? 1 : count + 1);
                }
            }
        }
    }

    /**
     * Prints the map of word count to the desired output stream, one {@code word<TAB>count}
     * entry per line, sorted by word.
     *
     * @param map    the map {word: count} to print
     * @param writer the stream that receives the listing
     */
    public static void printWordMap(Map<String, Integer> map, PrintStream writer) {
        List<String> keys = new ArrayList<>(map.keySet());
        Collections.sort(keys);
        for (String key : keys) {
            writer.println(key + "\t" + map.get(key));
        }
    }

    /**
     * Loads the cached index from {@link #INDEX_FILE}.
     *
     * NOTE: this uses Java native deserialization, which is only acceptable because the
     * file is a local cache written by this very program; never point it at untrusted data.
     */
    @SuppressWarnings("unchecked") // the index file is only ever written by saveIndex()
    private static Map<String, Integer> loadIndex() throws IOException, ClassNotFoundException {
        try (ObjectInputStream in = new ObjectInputStream(new FileInputStream(INDEX_FILE))) {
            return (Map<String, Integer>) in.readObject();
        }
    }

    /** Caches the index in {@link #INDEX_FILE} to avoid doing the work next time. */
    private static void saveIndex(Map<String, Integer> map) throws IOException {
        try (ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(INDEX_FILE))) {
            out.writeObject(map);
        }
    }

    /**
     * Asks the user in a loop to query for words, until an empty line is entered or the
     * input ends.
     */
    private static void queryLoop(Map<String, Integer> map) {
        // Deliberately not closed: closing the Scanner would also close System.in
        Scanner scanner = new Scanner(System.in);
        while (true) {
            System.out.print("Escriba palabra a consultar (o Enter para salir): ");
            System.out.flush();
            if (!scanner.hasNextLine()) {
                // End of input (piped stdin, Ctrl-D): leave instead of crashing in nextLine()
                break;
            }
            String line = scanner.nextLine().trim();
            if (line.isEmpty()) {
                break;
            }

            line = normalize(line);
            Integer count = map.get(line);
            if (count == null) {
                System.out.println(String.format("La palabra \"%s\" no está presente", line));
            } else if (count == 1) {
                System.out.println(String.format("La palabra \"%s\" está presente 1 vez", line));
            } else {
                System.out.println(String.format("La palabra \"%s\" está presente %d veces", line, count));
            }
        }
    }

    /**
     * Entry point. Usage: {@code java Crawl [input]}, where {@code input} is the file or
     * directory to index (defaults to the current directory). If the index file already
     * exists it is reused as-is and {@code input} is not scanned again.
     *
     * @throws IOException            if indexing, or reading/writing the index file, fails
     * @throws ClassNotFoundException if the index file cannot be deserialized
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException {
        // Validate arguments
        if (args.length == 1 && args[0].equals("--help")) {
            System.err.println("usage: java Crawl [input]");
            return;
        }

        File root = new File(args.length > 0 ? args[0] : ".");

        // Loading or generating the map where we aggregate the data {word: count}
        Map<String, Integer> map;
        if (INDEX_FILE.isFile()) {
            System.err.println("Found existing index file: " + INDEX_FILE);
            map = loadIndex();
        } else {
            System.err.println("Index file not found: " + INDEX_FILE + "; indexing...");
            map = new TreeMap<>();
            fillWordMap(map, root);
            saveIndex(map);
        }

        queryLoop(map);
    }
}
143</code></pre><p>It can be compiled and executed as follows:<pre><code>javac Crawl.java
144java Crawl
145</code></pre><p>Instead of copy-pasting the code, you may also download it as a <code>.zip</code>:<p><em>(contents removed)</em><h2 id=addendum>Addendum</h2><p>The following simple function can be used if one desires to print the contents of a file:<pre><code>public static void printFile(File file) {
146 if (isTextFile(file)) {
147 System.out.println('\n' + file.getName());
148 try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
149 String line;
150 while ((line = reader.readLine()) != null) {
151 System.out.println(line);
152 }
153 } catch (FileNotFoundException ignored) {
154 System.err.println("warning: file disappeared while reading: " + file);
155 } catch (IOException e) {
156 e.printStackTrace();
157 }
158 }
159}
160</code></pre></main><footer><div><p>Share your thoughts, or simply come hang with me <a href=https://t.me/LonamiWebs><img src=/img/telegram.svg alt=Telegram></a> <a href=mailto:totufals@hotmail.com><img src=/img/mail.svg alt=Mail></a></div></footer></article><p class=abyss>Glaze into the abyss… Oh hi there!