all repos — gemini-redirect @ 4440d37092b17f1abdd64c6e4a63fc710aa168a3

content/ribw/build-your-own-pc/post.md (view raw)

  1```meta
  2title: Build your own PC
  3published: 2020-02-25T02:00:12+00:00
  4updated: 2020-03-18T09:38:46+00:00
  5```
  6
  7_…where PC obviously stands for Personal Crawler_.
  8
  9----------
 10
 11This post contains the source code for a very simple crawler written in Java. You can compile and run it on any file or directory, and it will calculate the frequency of all the words it finds.
 12
 13## Source code
 14
 15Paste the following code in a new file called `Crawl.java`:
 16
 17```
 18import java.io.*;
 19import java.util.*;
 20import java.util.regex.Matcher;
 21import java.util.regex.Pattern;
 22
 23class Crawl {
 24	// Regex used to tokenize the words from a line of text
 25	private final static Pattern WORDS = Pattern.compile("\\w+");
 26
 27	// The file where we will cache our results
 28	private final static File INDEX_FILE = new File("index.bin");
 29
 30	// Helper method to determine if a file is a text file or not
 31	private static boolean isTextFile(File file) {
 32		String name = file.getName().toLowerCase();
 33		return name.endsWith(".txt")
 34				|| name.endsWith(".java")
 35				|| name.endsWith(".c")
 36				|| name.endsWith(".cpp")
 37				|| name.endsWith(".h")
 38				|| name.endsWith(".hpp")
 39				|| name.endsWith(".html")
 40				|| name.endsWith(".css")
 41				|| name.endsWith(".js");
 42	}
 43
 44	// Normalizes a string by converting it to lowercase and removing accents
 45	private static String normalize(String string) {
 46		return string.toLowerCase()
 47				.replace("á", "a")
 48				.replace("é", "e")
 49				.replace("í", "i")
 50				.replace("ó", "o")
 51				.replace("ú", "u");
 52	}
 53
 54	// Recursively fills the map with the count of words found on all the text files
 55	static void fillWordMap(Map<String, Integer> map, File root) throws IOException {
 56		// Our file queue begins with the root
 57		Queue<File> fileQueue = new ArrayDeque<>();
 58		fileQueue.add(root);
 59
 60		// For as long as the queue is not empty...
 61		File file;
 62		while ((file = fileQueue.poll()) != null) {
 63			if (!file.exists() || !file.canRead()) {
 64				// ...ignore files for which we don't have permission...
 65				System.err.println("warning: cannot read file: " + file);
 66			} else if (file.isDirectory()) {
 67				// ...else if it's a directory, extend our queue with its files...
 68				File[] files = file.listFiles();
 69				if (files == null) {
 70					System.err.println("warning: cannot list dir: " + file);
 71				} else {
 72					fileQueue.addAll(Arrays.asList(files));
 73				}
 74			} else if (isTextFile(file)) {
 75				// ...otherwise, count the words in the file.
 76				countWordsInFile(map, file);
 77			}
 78		}
 79	}
 80
 81	// Counts the words in a single file and adds the count to the map.
 82	public static void countWordsInFile(Map<String, Integer> map, File file) throws IOException {
 83		BufferedReader reader = new BufferedReader(new FileReader(file));
 84
 85		String line;
 86		while ((line = reader.readLine()) != null) {
 87			Matcher matcher = WORDS.matcher(line);
 88			while (matcher.find()) {
 89				String token = normalize(matcher.group());
 90				Integer count = map.get(token);
 91				if (count == null) {
 92					map.put(token, 1);
 93				} else {
 94					map.put(token, count + 1);
 95				}
 96			}
 97		}
 98
 99		reader.close();
100	}
101
102	// Prints the map of word count to the desired output stream.
103	public static void printWordMap(Map<String, Integer> map, PrintStream writer) {
104		List<String> keys = new ArrayList<>(map.keySet());
105		Collections.sort(keys);
106		for (String key : keys) {
107			writer.println(key + "\t" + map.get(key));
108		}
109	}
110
111	@SuppressWarnings("unchecked")
112	public static void main(String[] args) throws IOException, ClassNotFoundException {
113		// Validate arguments
114		if (args.length == 1 && args[0].equals("--help")) {
115			System.err.println("usage: java Crawl [input]");
116			return;
117		}
118
119		File root = new File(args.length > 0 ? args[0] : ".");
120
121		// Loading or generating the map where we aggregate the data  {word: count}
122		Map<String, Integer> map;
123		if (INDEX_FILE.isFile()) {
124			System.err.println("Found existing index file: " + INDEX_FILE);
125			try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(INDEX_FILE))) {
126				map = (Map<String, Integer>) ois.readObject();
127			}
128		} else {
129			System.err.println("Index file not found: " + INDEX_FILE + "; indexing...");
130			map = new TreeMap<>();
131			fillWordMap(map, root);
132			// Cache the results to avoid doing the work a next time
133			try (ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(INDEX_FILE))) {
134				out.writeObject(map);
135			}
136		}
137
138		// Ask the user in a loop to query for words
139		Scanner scanner = new Scanner(System.in);
140		while (true) {
141			System.out.print("Escriba palabra a consultar (o Enter para salir): ");
142			System.out.flush();
143			String line = scanner.nextLine().trim();
144			if (line.isEmpty()) {
145				break;
146			}
147
148			line = normalize(line);
149			Integer count = map.get(line);
150			if (count == null) {
151				System.out.println(String.format("La palabra \"%s\" no está presente", line));
152			} else if (count == 1) {
153				System.out.println(String.format("La palabra \"%s\" está presente 1 vez", line));
154			} else {
155				System.out.println(String.format("La palabra \"%s\" está presente %d veces", line, count));
156			}
157		}
158	}
159}
160```
161
162It can be compiled and executed as follows:
163
164```
165javac Crawl.java
166java Crawl
167```
168
169Instead of copy-pasting the code, you may also download it as a `.zip`:
170
171*(contents removed)*
172
173## Addendum
174
175The following simple function can be used if one desires to print the contents of a file:
176
177```
178public static void printFile(File file) {
179	if (isTextFile(file)) {
180		System.out.println('\n' + file.getName());
181		try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
182			String line;
183			while ((line = reader.readLine()) != null) {
184				System.out.println(line);
185			}
186		} catch (FileNotFoundException ignored) {
187			System.err.println("warning: file disappeared while reading: " + file);
188		} catch (IOException e) {
189			e.printStackTrace();
190		}
191	}
192}
193```