1// Copyright 2017 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package finder 16 17import ( 18 "bufio" 19 "bytes" 20 "encoding/json" 21 "errors" 22 "fmt" 23 "io" 24 "os" 25 "path/filepath" 26 "runtime" 27 "sort" 28 "strings" 29 "sync" 30 "sync/atomic" 31 "time" 32 33 "android/soong/finder/fs" 34) 35 36// This file provides a Finder struct that can quickly search for files satisfying 37// certain criteria. 38// This Finder gets its speed partially from parallelism and partially from caching. 39// If a Stat call returns the same result as last time, then it means Finder 40// can skip the ReadDir call for that dir. 41 42// The primary data structure used by the finder is the field Finder.nodes , 43// which is a tree of nodes of type *pathMap . 44// Each node represents a directory on disk, along with its stats, subdirectories, 45// and contained files. 46 47// The common use case for the Finder is that the caller creates a Finder and gives 48// it the same query that was given to it in the previous execution. 49// In this situation, the major events that take place are: 50// 1. The Finder begins to load its db 51// 2. The Finder begins to stat the directories mentioned in its db (using multiple threads) 52// Calling Stat on each of these directories is generally a large fraction of the total time 53// 3. The Finder begins to construct a separate tree of nodes in each of its threads 54// 4. 
The Finder merges the individual node trees into the main node tree 55// 5. The Finder may call ReadDir a few times if there are a few directories that are out-of-date 56// These ReadDir calls might prompt additional Stat calls, etc 57// 6. The Finder waits for all loading to complete 58// 7. The Finder searches the cache for files matching the user's query (using multiple threads) 59 60// These are the invariants regarding concurrency: 61// 1. The public methods of Finder are threadsafe. 62// The public methods are only performance-optimized for one caller at a time, however. 63// For the moment, multiple concurrent callers shouldn't expect any better performance than 64// multiple serial callers. 65// 2. While building the node tree, only one thread may ever access the <children> collection of a 66// *pathMap at once. 67// a) The thread that accesses the <children> collection is the thread that discovers the 68// children (by reading them from the cache or by having received a response to ReadDir). 69// 1) Consequently, the thread that discovers the children also spawns requests to stat 70// subdirs. 71// b) Consequently, while building the node tree, no thread may do a lookup of its 72// *pathMap via filepath because another thread may be adding children to the 73// <children> collection of an ancestor node. Additionally, in rare cases, another thread 74// may be removing children from an ancestor node if the children were only discovered to 75// be irrelevant after calling ReadDir (which happens if a prune-file was just added). 76// 3. No query will begin to be serviced until all loading (both reading the db 77// and scanning the filesystem) is complete. 78// Tests indicate that it only takes about 10% as long to search the in-memory cache as to 79// generate it, making this not a huge loss in performance. 80// 4. 
The parsing of the db and the initial setup of the pathMap tree must complete before 81// beginning to call listDirSync (because listDirSync can create new entries in the pathMap) 82 83// see cmd/finder.go or finder_test.go for usage examples 84 85// Update versionString whenever making a backwards-incompatible change to the cache file format 86const versionString = "Android finder version 1" 87 88// a CacheParams specifies which files and directories the user wishes be scanned and 89// potentially added to the cache 90type CacheParams struct { 91 // WorkingDirectory is used as a base for any relative file paths given to the Finder 92 WorkingDirectory string 93 94 // RootDirs are the root directories used to initiate the search 95 RootDirs []string 96 97 // Whether symlinks are followed. If set, symlinks back to their own parent 98 // directory don't work. 99 FollowSymlinks bool 100 101 // ExcludeDirs are directory names that if encountered are removed from the search 102 ExcludeDirs []string 103 104 // PruneFiles are file names that if encountered prune their entire directory 105 // (including siblings) 106 PruneFiles []string 107 108 // IncludeFiles are file names to include as matches 109 IncludeFiles []string 110 111 // IncludeSuffixes are filename suffixes to include as matches. 112 IncludeSuffixes []string 113} 114 115// a cacheConfig stores the inputs that determine what should be included in the cache 116type cacheConfig struct { 117 CacheParams 118 119 // FilesystemView is a unique identifier telling which parts of which file systems 120 // are readable by the Finder. In practice its value is essentially username@hostname. 121 // FilesystemView is set to ensure that a cache file copied to another host or 122 // found by another user doesn't inadvertently get reused. 
123 FilesystemView string 124} 125 126func (p *cacheConfig) Dump() ([]byte, error) { 127 bytes, err := json.Marshal(p) 128 return bytes, err 129} 130 131// a cacheMetadata stores version information about the cache 132type cacheMetadata struct { 133 // The Version enables the Finder to determine whether it can even parse the file 134 // If the version changes, the entire cache file must be regenerated 135 Version string 136 137 // The CacheParams enables the Finder to determine whether the parameters match 138 // If the CacheParams change, the Finder can choose how much of the cache file to reuse 139 // (although in practice, the Finder will probably choose to ignore the entire file anyway) 140 Config cacheConfig 141} 142 143type Logger interface { 144 Output(calldepth int, s string) error 145} 146 147// the Finder is the main struct that callers will want to use 148type Finder struct { 149 // configuration 150 DbPath string 151 numDbLoadingThreads int 152 numSearchingThreads int 153 cacheMetadata cacheMetadata 154 logger Logger 155 filesystem fs.FileSystem 156 157 // temporary state 158 threadPool *threadPool 159 mutex sync.Mutex 160 fsErrs []fsErr 161 errlock sync.Mutex 162 shutdownWaitgroup sync.WaitGroup 163 164 // non-temporary state 165 modifiedFlag int32 166 nodes pathMap 167} 168 169var defaultNumThreads = runtime.NumCPU() * 2 170 171// New creates a new Finder for use 172func New(cacheParams CacheParams, filesystem fs.FileSystem, 173 logger Logger, dbPath string) (f *Finder, err error) { 174 return newImpl(cacheParams, filesystem, logger, dbPath, defaultNumThreads) 175} 176 177// newImpl is like New but accepts more params 178func newImpl(cacheParams CacheParams, filesystem fs.FileSystem, 179 logger Logger, dbPath string, numThreads int) (f *Finder, err error) { 180 numDbLoadingThreads := numThreads 181 numSearchingThreads := numThreads 182 183 metadata := cacheMetadata{ 184 Version: versionString, 185 Config: cacheConfig{ 186 CacheParams: cacheParams, 187 
FilesystemView: filesystem.ViewId(), 188 }, 189 } 190 191 f = &Finder{ 192 numDbLoadingThreads: numDbLoadingThreads, 193 numSearchingThreads: numSearchingThreads, 194 cacheMetadata: metadata, 195 logger: logger, 196 filesystem: filesystem, 197 198 nodes: *newPathMap("/"), 199 DbPath: dbPath, 200 201 shutdownWaitgroup: sync.WaitGroup{}, 202 } 203 204 f.loadFromFilesystem() 205 206 // check for any filesystem errors 207 err = f.getErr() 208 if err != nil { 209 return nil, err 210 } 211 212 // confirm that every path mentioned in the CacheConfig exists 213 for _, path := range cacheParams.RootDirs { 214 if !filepath.IsAbs(path) { 215 path = filepath.Join(f.cacheMetadata.Config.WorkingDirectory, path) 216 } 217 node := f.nodes.GetNode(filepath.Clean(path), false) 218 if node == nil || node.ModTime == 0 { 219 return nil, fmt.Errorf("path %v was specified to be included in the cache but does not exist\n", path) 220 } 221 } 222 223 return f, nil 224} 225 226// FindNamed searches for every cached file 227func (f *Finder) FindAll() []string { 228 return f.FindAt("/") 229} 230 231// FindNamed searches for every cached file under <rootDir> 232func (f *Finder) FindAt(rootDir string) []string { 233 filter := func(entries DirEntries) (dirNames []string, fileNames []string) { 234 return entries.DirNames, entries.FileNames 235 } 236 return f.FindMatching(rootDir, filter) 237} 238 239// FindNamed searches for every cached file named <fileName> 240func (f *Finder) FindNamed(fileName string) []string { 241 return f.FindNamedAt("/", fileName) 242} 243 244// FindNamedAt searches under <rootPath> for every file named <fileName> 245// The reason a caller might use FindNamedAt instead of FindNamed is if they want 246// to limit their search to a subset of the cache 247func (f *Finder) FindNamedAt(rootPath string, fileName string) []string { 248 filter := func(entries DirEntries) (dirNames []string, fileNames []string) { 249 matches := []string{} 250 for _, foundName := range 
entries.FileNames { 251 if foundName == fileName { 252 matches = append(matches, foundName) 253 } 254 } 255 return entries.DirNames, matches 256 257 } 258 return f.FindMatching(rootPath, filter) 259} 260 261// FindFirstNamed searches for every file named <fileName> 262// Whenever it finds a match, it stops search subdirectories 263func (f *Finder) FindFirstNamed(fileName string) []string { 264 return f.FindFirstNamedAt("/", fileName) 265} 266 267// FindFirstNamedAt searches for every file named <fileName> 268// Whenever it finds a match, it stops search subdirectories 269func (f *Finder) FindFirstNamedAt(rootPath string, fileName string) []string { 270 filter := func(entries DirEntries) (dirNames []string, fileNames []string) { 271 matches := []string{} 272 for _, foundName := range entries.FileNames { 273 if foundName == fileName { 274 matches = append(matches, foundName) 275 } 276 } 277 278 if len(matches) > 0 { 279 return []string{}, matches 280 } 281 return entries.DirNames, matches 282 } 283 return f.FindMatching(rootPath, filter) 284} 285 286// FindMatching is the most general exported function for searching for files in the cache 287// The WalkFunc will be invoked repeatedly and is expected to modify the provided DirEntries 288// in place, removing file paths and directories as desired. 289// WalkFunc will be invoked potentially many times in parallel, and must be threadsafe. 
290func (f *Finder) FindMatching(rootPath string, filter WalkFunc) []string { 291 // set up some parameters 292 scanStart := time.Now() 293 var isRel bool 294 workingDir := f.cacheMetadata.Config.WorkingDirectory 295 296 isRel = !filepath.IsAbs(rootPath) 297 if isRel { 298 rootPath = filepath.Join(workingDir, rootPath) 299 } 300 301 rootPath = filepath.Clean(rootPath) 302 303 // ensure nothing else is using the Finder 304 f.verbosef("FindMatching waiting for finder to be idle\n") 305 f.lock() 306 defer f.unlock() 307 308 node := f.nodes.GetNode(rootPath, false) 309 if node == nil { 310 f.verbosef("No data for path %v ; apparently not included in cache params: %v\n", 311 rootPath, f.cacheMetadata.Config.CacheParams) 312 // path is not found; don't do a search 313 return []string{} 314 } 315 316 // search for matching files 317 f.verbosef("Finder finding %v using cache\n", rootPath) 318 results := f.findInCacheMultithreaded(node, filter, f.numSearchingThreads) 319 320 // format and return results 321 if isRel { 322 for i := 0; i < len(results); i++ { 323 results[i] = strings.Replace(results[i], workingDir+"/", "", 1) 324 } 325 } 326 sort.Strings(results) 327 f.verbosef("Found %v files under %v in %v using cache\n", 328 len(results), rootPath, time.Since(scanStart)) 329 return results 330} 331 332// Shutdown declares that the finder is no longer needed and waits for its cleanup to complete 333// Currently, that only entails waiting for the database dump to complete. 334func (f *Finder) Shutdown() { 335 f.WaitForDbDump() 336} 337 338// WaitForDbDump returns once the database has been written to f.DbPath. 
339func (f *Finder) WaitForDbDump() { 340 f.shutdownWaitgroup.Wait() 341} 342 343// End of public api 344 345func (f *Finder) goDumpDb() { 346 if f.wasModified() { 347 f.shutdownWaitgroup.Add(1) 348 go func() { 349 err := f.dumpDb() 350 if err != nil { 351 f.verbosef("%v\n", err) 352 } 353 f.shutdownWaitgroup.Done() 354 }() 355 } else { 356 f.verbosef("Skipping dumping unmodified db\n") 357 } 358} 359 360// joinCleanPaths is like filepath.Join but is faster because 361// joinCleanPaths doesn't have to support paths ending in "/" or containing ".." 362func joinCleanPaths(base string, leaf string) string { 363 if base == "" { 364 return leaf 365 } 366 if base == "/" { 367 return base + leaf 368 } 369 if leaf == "" { 370 return base 371 } 372 return base + "/" + leaf 373} 374 375func (f *Finder) verbosef(format string, args ...interface{}) { 376 f.logger.Output(2, fmt.Sprintf(format, args...)) 377} 378 379// loadFromFilesystem populates the in-memory cache based on the contents of the filesystem 380func (f *Finder) loadFromFilesystem() { 381 f.threadPool = newThreadPool(f.numDbLoadingThreads) 382 383 err := f.startFromExternalCache() 384 if err != nil { 385 f.startWithoutExternalCache() 386 } 387 388 f.goDumpDb() 389 390 f.threadPool = nil 391} 392 393func (f *Finder) startFind(path string) { 394 if !filepath.IsAbs(path) { 395 path = filepath.Join(f.cacheMetadata.Config.WorkingDirectory, path) 396 } 397 node := f.nodes.GetNode(path, true) 398 f.statDirAsync(node) 399} 400 401func (f *Finder) lock() { 402 f.mutex.Lock() 403} 404 405func (f *Finder) unlock() { 406 f.mutex.Unlock() 407} 408 409// a statResponse is the relevant portion of the response from the filesystem to a Stat call 410type statResponse struct { 411 ModTime int64 412 Inode uint64 413 Device uint64 414} 415 416// a pathAndStats stores a path and its stats 417type pathAndStats struct { 418 statResponse 419 420 Path string 421} 422 423// a dirFullInfo stores all of the relevant information we know about a 
directory 424type dirFullInfo struct { 425 pathAndStats 426 427 FileNames []string 428} 429 430// a PersistedDirInfo is the information about a dir that we save to our cache on disk 431type PersistedDirInfo struct { 432 // These field names are short because they are repeated many times in the output json file 433 P string // path 434 T int64 // modification time 435 I uint64 // inode number 436 F []string // relevant filenames contained 437} 438 439// a PersistedDirs is the information that we persist for a group of dirs 440type PersistedDirs struct { 441 // the device on which each directory is stored 442 Device uint64 443 // the common root path to which all contained dirs are relative 444 Root string 445 // the directories themselves 446 Dirs []PersistedDirInfo 447} 448 449// a CacheEntry is the smallest unit that can be read and parsed from the cache (on disk) at a time 450type CacheEntry []PersistedDirs 451 452// a DirEntries lists the files and directories contained directly within a specific directory 453type DirEntries struct { 454 Path string 455 456 // elements of DirNames are just the dir names; they don't include any '/' character 457 DirNames []string 458 // elements of FileNames are just the file names; they don't include '/' character 459 FileNames []string 460} 461 462// a WalkFunc is the type that is passed into various Find functions for determining which 463// directories the caller wishes be walked. The WalkFunc is expected to decide which 464// directories to walk and which files to consider as matches to the original query. 
465type WalkFunc func(DirEntries) (dirs []string, files []string) 466 467// a mapNode stores the relevant stats about a directory to be stored in a pathMap 468type mapNode struct { 469 statResponse 470 FileNames []string 471} 472 473// a pathMap implements the directory tree structure of nodes 474type pathMap struct { 475 mapNode 476 477 path string 478 479 children map[string]*pathMap 480 481 // number of descendent nodes, including self 482 approximateNumDescendents int 483} 484 485func newPathMap(path string) *pathMap { 486 result := &pathMap{path: path, children: make(map[string]*pathMap, 4), 487 approximateNumDescendents: 1} 488 return result 489} 490 491// GetNode returns the node at <path> 492func (m *pathMap) GetNode(path string, createIfNotFound bool) *pathMap { 493 if len(path) > 0 && path[0] == '/' { 494 path = path[1:] 495 } 496 497 node := m 498 for { 499 if path == "" { 500 return node 501 } 502 503 index := strings.Index(path, "/") 504 var firstComponent string 505 if index >= 0 { 506 firstComponent = path[:index] 507 path = path[index+1:] 508 } else { 509 firstComponent = path 510 path = "" 511 } 512 513 child, found := node.children[firstComponent] 514 515 if !found { 516 if createIfNotFound { 517 child = node.newChild(firstComponent) 518 } else { 519 return nil 520 } 521 } 522 523 node = child 524 } 525} 526 527func (m *pathMap) newChild(name string) (child *pathMap) { 528 path := joinCleanPaths(m.path, name) 529 newChild := newPathMap(path) 530 m.children[name] = newChild 531 532 return m.children[name] 533} 534 535func (m *pathMap) UpdateNumDescendents() int { 536 count := 1 537 for _, child := range m.children { 538 count += child.approximateNumDescendents 539 } 540 m.approximateNumDescendents = count 541 return count 542} 543 544func (m *pathMap) UpdateNumDescendentsRecursive() { 545 for _, child := range m.children { 546 child.UpdateNumDescendentsRecursive() 547 } 548 m.UpdateNumDescendents() 549} 550 551func (m *pathMap) MergeIn(other 
*pathMap) { 552 for key, theirs := range other.children { 553 ours, found := m.children[key] 554 if found { 555 ours.MergeIn(theirs) 556 } else { 557 m.children[key] = theirs 558 } 559 } 560 if other.ModTime != 0 { 561 m.mapNode = other.mapNode 562 } 563 m.UpdateNumDescendents() 564} 565 566func (m *pathMap) DumpAll() []dirFullInfo { 567 results := []dirFullInfo{} 568 m.dumpInto("", &results) 569 return results 570} 571 572func (m *pathMap) dumpInto(path string, results *[]dirFullInfo) { 573 *results = append(*results, 574 dirFullInfo{ 575 pathAndStats{statResponse: m.statResponse, Path: path}, 576 m.FileNames}, 577 ) 578 for key, child := range m.children { 579 childPath := joinCleanPaths(path, key) 580 if len(childPath) == 0 || childPath[0] != '/' { 581 childPath = "/" + childPath 582 } 583 child.dumpInto(childPath, results) 584 } 585} 586 587// a semaphore can be locked by up to <capacity> callers at once 588type semaphore struct { 589 pool chan bool 590} 591 592func newSemaphore(capacity int) *semaphore { 593 return &semaphore{pool: make(chan bool, capacity)} 594} 595 596func (l *semaphore) Lock() { 597 l.pool <- true 598} 599 600func (l *semaphore) Unlock() { 601 <-l.pool 602} 603 604// A threadPool runs goroutines and supports throttling and waiting. 605// Without throttling, Go may exhaust the maximum number of various resources, such as 606// threads or file descriptors, and crash the program. 
607type threadPool struct { 608 receivedRequests sync.WaitGroup 609 activeRequests semaphore 610} 611 612func newThreadPool(maxNumConcurrentThreads int) *threadPool { 613 return &threadPool{ 614 receivedRequests: sync.WaitGroup{}, 615 activeRequests: *newSemaphore(maxNumConcurrentThreads), 616 } 617} 618 619// Run requests to run the given function in its own goroutine 620func (p *threadPool) Run(function func()) { 621 p.receivedRequests.Add(1) 622 // If Run() was called from within a goroutine spawned by this threadPool, 623 // then we may need to return from Run() before having capacity to actually 624 // run <function>. 625 // 626 // It's possible that the body of <function> contains a statement (such as a syscall) 627 // that will cause Go to pin it to a thread, or will contain a statement that uses 628 // another resource that is in short supply (such as a file descriptor), so we can't 629 // actually run <function> until we have capacity. 630 // 631 // However, the semaphore used for synchronization is implemented via a channel and 632 // shouldn't require a new thread for each access. 
633 go func() { 634 p.activeRequests.Lock() 635 function() 636 p.activeRequests.Unlock() 637 p.receivedRequests.Done() 638 }() 639} 640 641// Wait waits until all goroutines are done, just like sync.WaitGroup's Wait 642func (p *threadPool) Wait() { 643 p.receivedRequests.Wait() 644} 645 646type fsErr struct { 647 path string 648 err error 649} 650 651func (e fsErr) String() string { 652 return e.path + ": " + e.err.Error() 653} 654 655func (f *Finder) serializeCacheEntry(dirInfos []dirFullInfo) ([]byte, error) { 656 // group each dirFullInfo by its Device, to avoid having to repeat it in the output 657 dirsByDevice := map[uint64][]PersistedDirInfo{} 658 for _, entry := range dirInfos { 659 _, found := dirsByDevice[entry.Device] 660 if !found { 661 dirsByDevice[entry.Device] = []PersistedDirInfo{} 662 } 663 dirsByDevice[entry.Device] = append(dirsByDevice[entry.Device], 664 PersistedDirInfo{P: entry.Path, T: entry.ModTime, I: entry.Inode, F: entry.FileNames}) 665 } 666 667 cacheEntry := CacheEntry{} 668 669 for device, infos := range dirsByDevice { 670 // find common prefix 671 prefix := "" 672 if len(infos) > 0 { 673 prefix = infos[0].P 674 } 675 for _, info := range infos { 676 for !strings.HasPrefix(info.P+"/", prefix+"/") { 677 prefix = filepath.Dir(prefix) 678 if prefix == "/" { 679 break 680 } 681 } 682 } 683 // remove common prefix 684 for i := range infos { 685 suffix := strings.Replace(infos[i].P, prefix, "", 1) 686 if len(suffix) > 0 && suffix[0] == '/' { 687 suffix = suffix[1:] 688 } 689 infos[i].P = suffix 690 } 691 692 // turn the map (keyed by device) into a list of structs with labeled fields 693 // this is to improve readability of the output 694 cacheEntry = append(cacheEntry, PersistedDirs{Device: device, Root: prefix, Dirs: infos}) 695 } 696 697 // convert to json. 
698 // it would save some space to use a different format than json for the db file, 699 // but the space and time savings are small, and json is easy for humans to read 700 bytes, err := json.Marshal(cacheEntry) 701 return bytes, err 702} 703 704func (f *Finder) parseCacheEntry(bytes []byte) ([]dirFullInfo, error) { 705 var cacheEntry CacheEntry 706 err := json.Unmarshal(bytes, &cacheEntry) 707 if err != nil { 708 return nil, err 709 } 710 711 // convert from a CacheEntry to a []dirFullInfo (by copying a few fields) 712 capacity := 0 713 for _, element := range cacheEntry { 714 capacity += len(element.Dirs) 715 } 716 nodes := make([]dirFullInfo, capacity) 717 count := 0 718 for _, element := range cacheEntry { 719 for _, dir := range element.Dirs { 720 path := joinCleanPaths(element.Root, dir.P) 721 722 nodes[count] = dirFullInfo{ 723 pathAndStats: pathAndStats{ 724 statResponse: statResponse{ 725 ModTime: dir.T, Inode: dir.I, Device: element.Device, 726 }, 727 Path: path}, 728 FileNames: dir.F} 729 count++ 730 } 731 } 732 return nodes, nil 733} 734 735// We use the following separator byte to distinguish individually parseable blocks of json 736// because we know this separator won't appear in the json that we're parsing. 737// 738// The newline byte can only appear in a UTF-8 stream if the newline character appears, because: 739// - The newline character is encoded as "0000 1010" in binary ("0a" in hex) 740// - UTF-8 dictates that bytes beginning with a "0" bit are never emitted as part of a multibyte 741// character. 742// 743// We know that the newline character will never appear in our json string, because: 744// - If a newline character appears as part of a data string, then json encoding will 745// emit two characters instead: '\' and 'n'. 746// - The json encoder that we use doesn't emit the optional newlines between any of its 747// other outputs. 
748const lineSeparator = byte('\n') 749 750func (f *Finder) readLine(reader *bufio.Reader) ([]byte, error) { 751 return reader.ReadBytes(lineSeparator) 752} 753 754// validateCacheHeader reads the cache header from cacheReader and tells whether the cache is compatible with this Finder 755func (f *Finder) validateCacheHeader(cacheReader *bufio.Reader) bool { 756 cacheVersionBytes, err := f.readLine(cacheReader) 757 if err != nil { 758 f.verbosef("Failed to read database header; database is invalid\n") 759 return false 760 } 761 if len(cacheVersionBytes) > 0 && cacheVersionBytes[len(cacheVersionBytes)-1] == lineSeparator { 762 cacheVersionBytes = cacheVersionBytes[:len(cacheVersionBytes)-1] 763 } 764 cacheVersionString := string(cacheVersionBytes) 765 currentVersion := f.cacheMetadata.Version 766 if cacheVersionString != currentVersion { 767 f.verbosef("Version changed from %q to %q, database is not applicable\n", cacheVersionString, currentVersion) 768 return false 769 } 770 771 cacheParamBytes, err := f.readLine(cacheReader) 772 if err != nil { 773 f.verbosef("Failed to read database search params; database is invalid\n") 774 return false 775 } 776 777 if len(cacheParamBytes) > 0 && cacheParamBytes[len(cacheParamBytes)-1] == lineSeparator { 778 cacheParamBytes = cacheParamBytes[:len(cacheParamBytes)-1] 779 } 780 781 currentParamBytes, err := f.cacheMetadata.Config.Dump() 782 if err != nil { 783 panic("Finder failed to serialize its parameters") 784 } 785 cacheParamString := string(cacheParamBytes) 786 currentParamString := string(currentParamBytes) 787 if cacheParamString != currentParamString { 788 f.verbosef("Params changed from %q to %q, database is not applicable\n", cacheParamString, currentParamString) 789 return false 790 } 791 return true 792} 793 794// loadBytes compares the cache info in <data> to the state of the filesystem 795// loadBytes returns a map representing <data> and also a slice of dirs that need to be re-walked 796func (f *Finder) 
loadBytes(id int, data []byte) (m *pathMap, dirsToWalk []string, err error) { 797 798 helperStartTime := time.Now() 799 800 cachedNodes, err := f.parseCacheEntry(data) 801 if err != nil { 802 return nil, nil, fmt.Errorf("Failed to parse block %v: %v\n", id, err.Error()) 803 } 804 805 unmarshalDate := time.Now() 806 f.verbosef("Unmarshaled %v objects for %v in %v\n", 807 len(cachedNodes), id, unmarshalDate.Sub(helperStartTime)) 808 809 tempMap := newPathMap("/") 810 stats := make([]statResponse, len(cachedNodes)) 811 812 for i, node := range cachedNodes { 813 // check the file system for an updated timestamp 814 stats[i] = f.statDirSync(node.Path) 815 } 816 817 dirsToWalk = []string{} 818 for i, cachedNode := range cachedNodes { 819 updated := stats[i] 820 // save the cached value 821 container := tempMap.GetNode(cachedNode.Path, true) 822 container.mapNode = mapNode{statResponse: updated} 823 824 // if the metadata changed and the directory still exists, then 825 // make a note to walk it later 826 if !f.isInfoUpToDate(cachedNode.statResponse, updated) && updated.ModTime != 0 { 827 f.setModified() 828 // make a note that the directory needs to be walked 829 dirsToWalk = append(dirsToWalk, cachedNode.Path) 830 } else { 831 container.mapNode.FileNames = cachedNode.FileNames 832 } 833 } 834 // count the number of nodes to improve our understanding of the shape of the tree, 835 // thereby improving parallelism of subsequent searches 836 tempMap.UpdateNumDescendentsRecursive() 837 838 f.verbosef("Statted inodes of block %v in %v\n", id, time.Now().Sub(unmarshalDate)) 839 return tempMap, dirsToWalk, nil 840} 841 842// startFromExternalCache loads the cache database from disk 843// startFromExternalCache waits to return until the load of the cache db is complete, but 844// startFromExternalCache does not wait for all every listDir() or statDir() request to complete 845func (f *Finder) startFromExternalCache() (err error) { 846 startTime := time.Now() 847 dbPath := 
f.DbPath

	// Open the cache file and validate its header before trusting any of its
	// contents; a mismatched header (different version/config) means the db
	// cannot be reused.
	reader, err := f.filesystem.Open(dbPath)
	if err != nil {
		// NOTE(review): Go error strings are conventionally lowercase with no
		// trailing newline; kept as-is in case callers match on this text.
		return errors.New("No data to load from database\n")
	}
	defer reader.Close()
	bufferedReader := bufio.NewReader(reader)
	if !f.validateCacheHeader(bufferedReader) {
		return errors.New("Cache header does not match")
	}
	f.verbosef("Database header matches, will attempt to use database %v\n", f.DbPath)

	// read the file and spawn threads to process it
	nodesToWalk := [][]*pathMap{}
	mainTree := newPathMap("/")

	// Stage 1 of a three-stage pipeline: read raw blocks from the db file and
	// stream them into <blockChannel>.
	type dataBlock struct {
		id   int    // sequence number of the block within the file
		err  error  // non-nil if reading this block failed
		data []byte // raw (still-serialized) block contents
	}
	blockChannel := make(chan dataBlock, f.numDbLoadingThreads)
	readBlocks := func() {
		index := 0
		for {
			// It takes some time to unmarshal the input from json, so we want
			// to unmarshal it in parallel. In order to find valid places to
			// break the input, we scan for the line separators that we inserted
			// (for this purpose) when we dumped the database.
			data, err := f.readLine(bufferedReader)
			var response dataBlock
			done := false
			if err != nil && err != io.EOF {
				response = dataBlock{id: index, err: err, data: nil}
				done = true
			} else {
				// io.EOF still carries a final (possibly empty) data block.
				done = (err == io.EOF)
				response = dataBlock{id: index, err: nil, data: data}
			}
			blockChannel <- response
			index++
			duration := time.Since(startTime)
			f.verbosef("Read block %v after %v\n", index, duration)
			if done {
				f.verbosef("Read %v blocks in %v\n", index, duration)
				// Closing the channel is the completion signal for stage 2.
				close(blockChannel)
				return
			}
		}
	}
	go readBlocks()

	// Stage 2: read from <blockChannel> and stream the responses into <resultChannel>.
	type workResponse struct {
		id          int
		err         error
		tree        *pathMap // partial node tree parsed from one block
		updatedDirs []string // dirs whose cached info may be stale
	}
	resultChannel := make(chan workResponse)
	processBlocks := func() {
		numProcessed := 0
		threadPool := newThreadPool(f.numDbLoadingThreads)
		for {
			// get a block to process
			block, received := <-blockChannel
			if !received {
				break
			}

			if block.err != nil {
				resultChannel <- workResponse{err: block.err}
				break
			}
			numProcessed++
			// wait until there is CPU available to process it
			threadPool.Run(
				func() {
					processStartTime := time.Now()
					f.verbosef("Starting to process block %v after %v\n",
						block.id, processStartTime.Sub(startTime))
					tempMap, updatedDirs, err := f.loadBytes(block.id, block.data)
					var response workResponse
					if err != nil {
						f.verbosef(
							"Block %v failed to parse with error %v\n",
							block.id, err)
						response = workResponse{err: err}
					} else {
						response = workResponse{
							id:          block.id,
							err:         nil,
							tree:        tempMap,
							updatedDirs: updatedDirs,
						}
					}
					f.verbosef("Processed block %v in %v\n",
						block.id, time.Since(processStartTime),
					)
					resultChannel <- response
				},
			)
		}
		threadPool.Wait()
		f.verbosef("Finished processing %v blocks in %v\n",
			numProcessed, time.Since(startTime))
		close(resultChannel)
	}
	go processBlocks()

	// Stage 3: read from <resultChannel> and merge the per-block trees into
	// mainTree. The named return <err> retains the first failure while still
	// draining the channel to completion.
	combineResults := func() (err error) {
		for {
			result, received := <-resultChannel
			if !received {
				break
			}
			if err != nil {
				// In case of an error, wait for work to complete before
				// returning the error. This ensures that any subsequent
				// work doesn't need to compete for resources (and possibly
				// fail due to, for example, a filesystem limit on the number of
				// concurrently open files) with past work.
				continue
			}
			if result.err != nil {
				err = result.err
				continue
			}
			// update main tree
			mainTree.MergeIn(result.tree)
			// record any new directories that we will need to Stat()
			updatedNodes := make([]*pathMap, len(result.updatedDirs))
			for j, dir := range result.updatedDirs {
				node := mainTree.GetNode(dir, false)
				updatedNodes[j] = node
			}
			nodesToWalk = append(nodesToWalk, updatedNodes)
		}
		return err
	}
	err = combineResults()
	if err != nil {
		return err
	}

	f.nodes = *mainTree

	// after having loaded the entire db and therefore created entries for
	// the directories we know of, now it's safe to start calling ReadDir on
	// any updated directories
	for i := range nodesToWalk {
		f.listDirsAsync(nodesToWalk[i])
	}
	f.verbosef("Loaded db and statted known dirs in %v\n", time.Since(startTime))
	f.threadPool.Wait()
	// NOTE(review): time.Since(startTime) is the idiomatic spelling of
	// time.Now().Sub(startTime).
	f.verbosef("Loaded db and statted all dirs in %v\n", time.Now().Sub(startTime))

	return err
}

// startWithoutExternalCache starts scanning the filesystem according to the cache config
// startWithoutExternalCache should be called if startFromExternalCache is not applicable
func (f *Finder) startWithoutExternalCache() {
	startTime := time.Now()
	configDirs := f.cacheMetadata.Config.RootDirs

	// clean paths
	candidates := make([]string, len(configDirs))
	for i, dir := range configDirs {
		candidates[i] = filepath.Clean(dir)
	}
	// Remove duplicates: drop any candidate already covered by a
	// previously-accepted root (the same dir, or a subdir of it).
	// NOTE(review): this assumes a parent appears before its subdirs in the
	// list; a subdir listed first would be kept — confirm RootDirs ordering.
	dirsToScan := make([]string, 0, len(configDirs))
	for _, candidate := range candidates {
		include := true
		for _, included := range dirsToScan {
			if included == "/" || strings.HasPrefix(candidate+"/", included+"/") {
				include = false
				break
			}
		}
		if include {
			dirsToScan = append(dirsToScan, candidate)
		}
	}

	// start searching finally
	for _, path := range dirsToScan {
		f.verbosef("Starting find of %v\n", path)
		f.startFind(path)
	}

	f.threadPool.Wait()

	f.verbosef("Scanned filesystem (not using cache) in %v\n", time.Now().Sub(startTime))
}

// isInfoUpToDate tells whether <new> can confirm that results computed at <old> are still valid,
// by requiring the inode number, modification time, and device number to all be unchanged.
// NOTE(review): the parameter name <new> shadows the builtin new(); harmless
// here, but worth renaming in a future cleanup.
func (f *Finder) isInfoUpToDate(old statResponse, new statResponse) (equal bool) {
	if old.Inode != new.Inode {
		return false
	}
	if old.ModTime != new.ModTime {
		return false
	}
	if old.Device != new.Device {
		return false
	}
	return true
}

// wasModified reports whether any directory was observed to have changed since
// the cache was loaded (flag is set by setModified, read atomically).
func (f *Finder) wasModified() bool {
	return atomic.LoadInt32(&f.modifiedFlag) > 0
}

// setModified atomically records that the in-memory cache differs from what
// was loaded, so it should be re-dumped to disk.
func (f *Finder) setModified() {
	var newVal int32
	newVal = 1
	atomic.StoreInt32(&f.modifiedFlag, newVal)
}

// sortedDirEntries exports directory entries to facilitate dumping them to the external cache.
// Entries with ModTime == 0 (dirs that failed to stat) are omitted, and the
// result is sorted by path for deterministic output.
func (f *Finder) sortedDirEntries() []dirFullInfo {
	startTime := time.Now()
	nodes := make([]dirFullInfo, 0)
	for _, node := range f.nodes.DumpAll() {
		if node.ModTime != 0 {
			nodes = append(nodes, node)
		}
	}
	discoveryDate := time.Now()
	f.verbosef("Generated %v cache entries in %v\n", len(nodes), discoveryDate.Sub(startTime))
	less := func(i int, j int) bool {
		return nodes[i].Path < nodes[j].Path
	}
	sort.Slice(nodes, less)
	sortDate := time.Now()
	f.verbosef("Sorted %v cache entries in %v\n", len(nodes), sortDate.Sub(discoveryDate))

	return nodes
}

// serializeDb converts the cache database into a form to save to disk
func (f *Finder) serializeDb() ([]byte, error) {
	// sort dir entries
	var entryList = f.sortedDirEntries()

	// Generate an output file that can be conveniently loaded using the same number of threads
	// as were used in this execution (because presumably that will be the number of threads
	// used in the next execution too)

	// generate header
	header := []byte{}
	header = append(header, []byte(f.cacheMetadata.Version)...)
	header = append(header, lineSeparator)
	configDump, err := f.cacheMetadata.Config.Dump()
	if err != nil {
		return nil, err
	}
	header = append(header, configDump...)

	// serialize individual blocks in parallel
	numBlocks := f.numDbLoadingThreads
	if numBlocks > len(entryList) {
		numBlocks = len(entryList)
	}
	blocks := make([][]byte, 1+numBlocks)
	blocks[0] = header
	blockMin := 0
	wg := sync.WaitGroup{}
	// errLock guards <err>, which the worker goroutines below may write
	// concurrently; if several fail, the last writer wins, which is fine since
	// any single error aborts the whole dump. <err> is only read again after
	// wg.Wait() establishes a happens-before edge.
	var errLock sync.Mutex

	for i := 1; i <= numBlocks; i++ {
		// identify next block; integer arithmetic splits entryList into
		// numBlocks nearly-equal contiguous slices
		blockMax := len(entryList) * i / numBlocks
		block := entryList[blockMin:blockMax]

		// process block
		wg.Add(1)
		go func(index int, block []dirFullInfo) {
			byteBlock, subErr := f.serializeCacheEntry(block)
			f.verbosef("Serialized block %v into %v bytes\n", index, len(byteBlock))
			if subErr != nil {
				f.verbosef("%v\n", subErr.Error())
				errLock.Lock()
				err = subErr
				errLock.Unlock()
			} else {
				// each goroutine writes a distinct index, so no lock is needed
				blocks[index] = byteBlock
			}
			wg.Done()
		}(i, block)

		blockMin = blockMax
	}

	wg.Wait()

	if err != nil {
		return nil, err
	}

	// join with the same separator that readLine scans for when reloading
	content := bytes.Join(blocks, []byte{lineSeparator})

	return content, nil
}

// dumpDb saves the cache database to disk by writing to a temp file and then
// renaming it into place, so readers never observe a half-written db.
func (f *Finder) dumpDb() error {
	startTime := time.Now()
	f.verbosef("Dumping db\n")

	tempPath := f.DbPath + ".tmp"

	// NOTE(review): the local <bytes> shadows the bytes package for the rest
	// of this function; harmless here, but worth renaming in a future cleanup.
	bytes, err := f.serializeDb()
	if err != nil {
		return err
	}
	serializeDate := time.Now()
	f.verbosef("Serialized db in %v\n", serializeDate.Sub(startTime))
	// dump file and atomically move
	err = f.filesystem.WriteFile(tempPath, bytes, 0777)
	if err != nil {
		return err
	}
	err =
f.filesystem.Rename(tempPath, f.DbPath)
	if err != nil {
		return err
	}

	f.verbosef("Wrote db in %v\n", time.Now().Sub(serializeDate))
	return nil

}

// canIgnoreFsErr checks for certain classes of filesystem errors that are safe to ignore
func (f *Finder) canIgnoreFsErr(err error) bool {
	pathErr, isPathErr := err.(*os.PathError)
	if !isPathErr {
		// Don't recognize this error
		return false
	}
	if os.IsPermission(pathErr) {
		// Permission errors are ignored:
		// https://issuetracker.google.com/37553659
		// https://github.com/google/kati/pull/116
		return true
	}
	// NOTE(review): this equality check only matches the os.ErrNotExist
	// sentinel itself; a raw syscall.ENOENT stored in PathError.Err would not
	// compare equal. os.IsNotExist(pathErr) (or errors.Is) would cover both —
	// confirm which error shapes f.filesystem actually produces.
	if pathErr.Err == os.ErrNotExist {
		// If a directory doesn't exist, that generally means the cache is out-of-date
		return true
	}
	// Don't recognize this error
	return false
}

// onFsError should be called whenever a potentially fatal error is returned from a filesystem call.
// Ignorable errors are dropped; the rest accumulate in f.fsErrs for getErr to report.
func (f *Finder) onFsError(path string, err error) {
	if !f.canIgnoreFsErr(err) {
		// We could send the errors through a channel instead, although that would cause this call
		// to block unless we preallocated a sufficient buffer or spawned a reader thread.
		// Although it wouldn't be too complicated to spawn a reader thread, it's still slightly
		// more convenient to use a lock. Only in an unusual situation should this code be
		// invoked anyway.
		f.errlock.Lock()
		f.fsErrs = append(f.fsErrs, fsErr{path: path, err: err})
		f.errlock.Unlock()
	}
}

// discardErrsForPrunedPaths removes any errors for paths that are no longer included in the cache
func (f *Finder) discardErrsForPrunedPaths() {
	// This function could be somewhat inefficient due to being single-threaded,
	// but the length of f.fsErrs should be approximately 0, so it shouldn't take long anyway.
	relevantErrs := make([]fsErr, 0, len(f.fsErrs))
	for _, fsErr := range f.fsErrs {
		path := fsErr.path
		node := f.nodes.GetNode(path, false)
		if node != nil {
			// The path in question wasn't pruned due to a failure to process a parent directory.
			// So, the failure to process this path is important
			relevantErrs = append(relevantErrs, fsErr)
		}
	}
	f.fsErrs = relevantErrs
}

// getErr returns an error based on previous calls to onFsErr, if any.
// At most the first 10 individual errors are spelled out in the message; the
// rest are elided with "..." to keep the message bounded.
func (f *Finder) getErr() error {
	f.discardErrsForPrunedPaths()

	numErrs := len(f.fsErrs)
	if numErrs < 1 {
		return nil
	}

	maxNumErrsToInclude := 10
	message := ""
	if numErrs > maxNumErrsToInclude {
		message = fmt.Sprintf("finder encountered %v errors: %v...", numErrs, f.fsErrs[:maxNumErrsToInclude])
	} else {
		message = fmt.Sprintf("finder encountered %v errors: %v", numErrs, f.fsErrs)
	}

	return errors.New(message)
}

// statDirAsync re-stats <dir> on the thread pool. If the stats changed, the
// node's cached info is reset, the modified flag is raised, and — when the dir
// still exists (ModTime != 0) — its contents are re-listed asynchronously.
func (f *Finder) statDirAsync(dir *pathMap) {
	node := dir
	path := dir.path
	f.threadPool.Run(
		func() {
			updatedStats := f.statDirSync(path)

			if !f.isInfoUpToDate(node.statResponse, updatedStats) {
				node.mapNode = mapNode{
					statResponse: updatedStats,
					FileNames:    []string{},
				}
				f.setModified()
				if node.statResponse.ModTime != 0 {
					// modification time was updated, so re-scan for
					// child directories
					f.listDirAsync(dir)
				}
			}
		},
	)
}

// statDirSync Lstat()s <path> and packages the inode, device, and latest
// modification time into a statResponse. A failed stat yields the zero value
// (ModTime == 0), which the rest of the code treats as "missing".
func (f *Finder) statDirSync(path string) statResponse {

	fileInfo, err := f.filesystem.Lstat(path)

	var stats statResponse
	if err != nil {
		// possibly record this error
		f.onFsError(path, err)
		// in case of a failure to stat the directory, treat the directory as missing (modTime = 0)
		return stats
	}
	modTime := fileInfo.ModTime()
	stats = statResponse{}
	inode, err :=
f.filesystem.InodeNumber(fileInfo)
	if err != nil {
		// Inability to extract basic metadata from a FileInfo we just obtained
		// is treated as unrecoverable.
		panic(fmt.Sprintf("Could not get inode number of %v: %v\n", path, err.Error()))
	}
	stats.Inode = inode
	device, err := f.filesystem.DeviceNumber(fileInfo)
	if err != nil {
		panic(fmt.Sprintf("Could not get device number of %v: %v\n", path, err.Error()))
	}
	stats.Device = device
	permissionsChangeTime, err := f.filesystem.PermTime(fileInfo)

	if err != nil {
		panic(fmt.Sprintf("Could not get permissions modification time (CTime) of %v: %v\n", path, err.Error()))
	}
	// We're only interested in knowing whether anything about the directory
	// has changed since last check, so we use the latest of the two
	// modification times (content modification (mtime) and
	// permission modification (ctime))
	if permissionsChangeTime.After(modTime) {
		modTime = permissionsChangeTime
	}
	stats.ModTime = modTime.UnixNano()

	return stats
}

// shouldIncludeFile reports whether <fileName> matches the cache config:
// either an exact name in IncludeFiles or a suffix in IncludeSuffixes.
func (f *Finder) shouldIncludeFile(fileName string) bool {
	for _, includedName := range f.cacheMetadata.Config.IncludeFiles {
		if fileName == includedName {
			return true
		}
	}
	for _, includeSuffix := range f.cacheMetadata.Config.IncludeSuffixes {
		if strings.HasSuffix(fileName, includeSuffix) {
			return true
		}
	}
	return false
}

// pruneCacheCandidates removes the items that we don't want to include in our persistent cache.
// If a prune-file is present, the whole directory is emptied; otherwise both
// lists are filtered in place (preserving order) against the cache config.
func (f *Finder) pruneCacheCandidates(items *DirEntries) {

	// a prune-file in this directory means ignore the entire directory
	for _, fileName := range items.FileNames {
		for _, abortedName := range f.cacheMetadata.Config.PruneFiles {
			if fileName == abortedName {
				items.FileNames = []string{}
				items.DirNames = []string{}
				return
			}
		}
	}

	// remove any files that aren't the ones we want to include
	writeIndex := 0
	for _, fileName := range items.FileNames {
		if f.shouldIncludeFile(fileName) {
			items.FileNames[writeIndex] = fileName
			writeIndex++
		}
	}
	// resize
	items.FileNames = items.FileNames[:writeIndex]

	writeIndex = 0
	for _, dirName := range items.DirNames {
		items.DirNames[writeIndex] = dirName
		// ignore other dirs that are known to not be inputs to the build process
		include := true
		for _, excludedName := range f.cacheMetadata.Config.ExcludeDirs {
			if dirName == excludedName {
				// don't include
				include = false
				break
			}
		}
		if include {
			writeIndex++
		}
	}
	// resize
	items.DirNames = items.DirNames[:writeIndex]
}

// listDirsAsync lists each of <nodes> sequentially on a single thread-pool slot.
func (f *Finder) listDirsAsync(nodes []*pathMap) {
	f.threadPool.Run(
		func() {
			for i := range nodes {
				f.listDirSync(nodes[i])
			}
		},
	)
}

// listDirAsync lists a single node on the thread pool.
func (f *Finder) listDirAsync(node *pathMap) {
	f.threadPool.Run(
		func() {
			f.listDirSync(node)
		},
	)
}

// listDirSync ReadDir()s <dir>, classifies the entries into subdirs and files
// (resolving symlinks via Stat), prunes the result against the cache config,
// and installs the surviving children into the node, spawning async Stat
// requests for any newly-discovered subdirectories.
func (f *Finder) listDirSync(dir *pathMap) {
	path := dir.path
	children, err := f.filesystem.ReadDir(path)

	if err != nil {
		// possibly record this error
		f.onFsError(path, err)
		// if listing the contents of the directory fails (presumably due to
		// permission denied), then treat the directory as empty
		children = nil
	}

	var subdirs []string
	var subfiles []string

	for _, child := range children {
		linkBits := child.Mode() & os.ModeSymlink
		isLink := linkBits != 0
		if isLink {
			childPath := filepath.Join(path, child.Name())
			childStat, err := f.filesystem.Stat(childPath)
			if err != nil {
				// If stat fails this is probably a broken or dangling symlink, treat it as a file.
				subfiles = append(subfiles, child.Name())
			} else if childStat.IsDir() {
				// Skip symlink dirs if not requested otherwise. Android has a number
				// of symlinks creating infinite source trees which would otherwise get
				// us in an infinite loop.
				// TODO(b/197349722): Revisit this once symlink loops are banned in the
				// source tree.
				if f.cacheMetadata.Config.FollowSymlinks {
					subdirs = append(subdirs, child.Name())
				}
			} else {
				// We do have to support symlink files because the link name might be
				// different than the target name
				// (for example, Android.bp -> build/soong/root.bp)
				subfiles = append(subfiles, child.Name())
			}
		} else if child.IsDir() {
			subdirs = append(subdirs, child.Name())
		} else {
			subfiles = append(subfiles, child.Name())
		}

	}
	parentNode := dir

	entry := &DirEntries{Path: path, DirNames: subdirs, FileNames: subfiles}
	f.pruneCacheCandidates(entry)

	// create a pathMap node for each relevant subdirectory
	relevantChildren := map[string]*pathMap{}
	for _, subdirName := range entry.DirNames {
		childNode, found := parentNode.children[subdirName]
		// if we already knew of this directory, then we already have a request pending to Stat it
		// if we didn't already know of this directory, then we must Stat it now
		if !found {
			childNode = parentNode.newChild(subdirName)
			f.statDirAsync(childNode)
		}
		relevantChildren[subdirName] = childNode
	}
	// Note that in rare cases, it's possible that we're reducing the set of
	// children via this statement, if these are all true:
	// 1. we previously had a cache that knew about subdirectories of parentNode
	// 2. the user created a prune-file (described in pruneCacheCandidates)
	//    inside <parentNode>, which specifies that the contents of parentNode
	//    are to be ignored.
	// The fact that it's possible to remove children here means that *pathMap structs
	// must not be looked up from f.nodes by filepath (and instead must be accessed by
	// direct pointer) until after every listDirSync completes
	parentNode.FileNames = entry.FileNames
	parentNode.children = relevantChildren

}

// listMatches takes a node and a function that specifies which subdirectories and
// files to include, and listMatches returns the matches.
// Matching file names are joined onto node.path; matching dir names are
// resolved back to their *pathMap children (names returned by the filter that
// aren't actual children are silently dropped).
// NOTE(review): entries.DirNames is built by ranging over a map, so the filter
// sees directory names in nondeterministic order — confirm that no filter
// relies on ordering.
func (f *Finder) listMatches(node *pathMap,
	filter WalkFunc) (subDirs []*pathMap, filePaths []string) {
	entries := DirEntries{
		FileNames: node.FileNames,
	}
	entries.DirNames = make([]string, 0, len(node.children))
	for childName := range node.children {
		entries.DirNames = append(entries.DirNames, childName)
	}

	dirNames, fileNames := filter(entries)

	// NOTE(review): this empty-slice assignment is dead; subDirs is
	// unconditionally reassigned by the make() below.
	subDirs = []*pathMap{}
	filePaths = make([]string, 0, len(fileNames))
	for _, fileName := range fileNames {
		filePaths = append(filePaths, joinCleanPaths(node.path, fileName))
	}
	subDirs = make([]*pathMap, 0, len(dirNames))
	for _, childName := range dirNames {
		child, ok := node.children[childName]
		if ok {
			subDirs = append(subDirs, child)
		}
	}

	return subDirs, filePaths
}

// findInCacheMultithreaded spawns potentially multiple goroutines with which to search the cache.
1505func (f *Finder) findInCacheMultithreaded(node *pathMap, filter WalkFunc, 1506 approxNumThreads int) []string { 1507 1508 if approxNumThreads < 2 { 1509 // Done spawning threads; process remaining directories 1510 return f.findInCacheSinglethreaded(node, filter) 1511 } 1512 1513 totalWork := 0 1514 for _, child := range node.children { 1515 totalWork += child.approximateNumDescendents 1516 } 1517 childrenResults := make(chan []string, len(node.children)) 1518 1519 subDirs, filePaths := f.listMatches(node, filter) 1520 1521 // process child directories 1522 for _, child := range subDirs { 1523 numChildThreads := approxNumThreads * child.approximateNumDescendents / totalWork 1524 childProcessor := func(child *pathMap) { 1525 childResults := f.findInCacheMultithreaded(child, filter, numChildThreads) 1526 childrenResults <- childResults 1527 } 1528 // If we're allowed to use more than 1 thread to process this directory, 1529 // then instead we use 1 thread for each subdirectory. 1530 // It would be strange to spawn threads for only some subdirectories. 1531 go childProcessor(child) 1532 } 1533 1534 // collect results 1535 for i := 0; i < len(subDirs); i++ { 1536 childResults := <-childrenResults 1537 filePaths = append(filePaths, childResults...) 1538 } 1539 close(childrenResults) 1540 1541 return filePaths 1542} 1543 1544// findInCacheSinglethreaded synchronously searches the cache for all matching file paths 1545// note findInCacheSinglethreaded runs 2X to 4X as fast by being iterative rather than recursive 1546func (f *Finder) findInCacheSinglethreaded(node *pathMap, filter WalkFunc) []string { 1547 if node == nil { 1548 return []string{} 1549 } 1550 1551 nodes := []*pathMap{node} 1552 matches := []string{} 1553 1554 for len(nodes) > 0 { 1555 currentNode := nodes[0] 1556 nodes = nodes[1:] 1557 1558 subDirs, filePaths := f.listMatches(currentNode, filter) 1559 1560 nodes = append(nodes, subDirs...) 1561 1562 matches = append(matches, filePaths...) 
1563 } 1564 return matches 1565} 1566