Skip to content

Commit a109719

Browse files
committed
Update page-rank service
1 parent 50bf37f commit a109719

File tree

4 files changed

+86
-129
lines changed

4 files changed

+86
-129
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -214,4 +214,5 @@ iptrack.txt
214214

215215
variables.env
216216

217-
.vscode
217+
.vscode
218+
certbot

README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
# MOOGLE - The Worst Best Search Engine
22

33
## TODO
4-
- [ ] Test the flattened database in the query engine
5-
- [ ] If everything works, rename the database and start the spiders, indexers, and other services to point to this new database
6-
- [ ] Fix the query engine to use the new database
4+
- [ ] Add a favicon
5+
- [x] Add a redis instance to the docker-compose file
6+
- [x] Check fuzzy finding and spell correction
77

88
## For Future
99
- [ ] Use stems for words

services/page-rank/cmd/page-rank/main.go

Lines changed: 79 additions & 123 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,14 @@ func getEnv(key, fallback string) string {
1515
if value, exists := os.LookupEnv(key); exists {
1616
return value
1717
}
18-
1918
return fallback
2019
}
2120

2221
func main() {
2322
mongoHost := getEnv("MONGO_HOST", "localhost")
2423
mongoPassword := getEnv("MONGO_PASSWORD", "")
2524
mongoUsername := getEnv("MONGO_USERNAME", "")
25+
mongoDatabase := getEnv("MONGO_DB", "test")
2626

2727
fmt.Println("Page Rank Service!")
2828

@@ -47,169 +47,125 @@ func main() {
4747
fmt.Println("Successfully connected to MongoDB!")
4848

4949
// Access the configured database (MONGO_DB, defaults to "test")
50-
db := client.Database("test")
50+
db := client.Database(mongoDatabase)
5151

52-
// Access the outlinks collection
53-
coll := db.Collection("outlinks")
52+
// Access the outlinks and backlinks collections
53+
outlinksColl := db.Collection("outlinks")
54+
backlinksColl := db.Collection("backlinks")
5455

55-
// Get the count of documents in the collection
56-
count, err := coll.CountDocuments(context.TODO(), bson.D{})
56+
// Get the count of documents in the outlinks collection
57+
count, err := outlinksColl.CountDocuments(context.TODO(), bson.D{})
5758
if err != nil {
58-
panic(fmt.Sprintf("Could not count documents: %v", err))
59+
panic(fmt.Sprintf("Could not count documents in outlinks: %v", err))
5960
}
6061

61-
fmt.Printf("Number of documents in the collection: %d\n", count)
62-
63-
// Iterate over the documents in the collection
64-
cursor, err := coll.Find(context.TODO(), bson.D{})
62+
backlinks := make(map[string][]string)
63+
cursorBacklinks, err := backlinksColl.Find(context.TODO(), bson.D{})
6564
if err != nil {
66-
panic(fmt.Sprintf("Could not find documents: %v", err))
65+
panic(fmt.Sprintf("Could not fetch backlinks: %v", err))
6766
}
68-
69-
defer cursor.Close(context.TODO())
70-
71-
// Page rank setup
72-
// Create a hash map to hold the page_url and its corresponding page rank
73-
pageRank := make(map[string]float64)
74-
75-
for cursor.Next(context.TODO()) {
76-
var result bson.M
77-
if err := cursor.Decode(&result); err != nil {
78-
panic(fmt.Sprintf("Could not decode document: %v", err))
67+
defer cursorBacklinks.Close(context.TODO())
68+
for cursorBacklinks.Next(context.TODO()) {
69+
var doc struct {
70+
ID string `bson:"_id"`
71+
Links []string `bson:"links"`
7972
}
80-
81-
// Get the _id field, this is the page_url
82-
url, ok := result["_id"].(string)
83-
if !ok {
84-
panic("Could not convert _id to string")
73+
if err := cursorBacklinks.Decode(&doc); err != nil {
74+
panic(fmt.Sprintf("Could not decode backlink document: %v", err))
8575
}
86-
87-
// Assign a starting page rank value
88-
pageRank[url] = 1.0 / float64(count)
76+
backlinks[doc.ID] = doc.Links
8977
}
9078

91-
if err := cursor.Err(); err != nil {
92-
panic(fmt.Sprintf("Cursor error: %v", err))
79+
outlinksCount := make(map[string]int)
80+
cursorOutlinks, err := outlinksColl.Find(context.TODO(), bson.D{})
81+
if err != nil {
82+
panic(fmt.Sprintf("Could not fetch outlinks: %v", err))
83+
}
84+
defer cursorOutlinks.Close(context.TODO())
85+
for cursorOutlinks.Next(context.TODO()) {
86+
var doc struct {
87+
ID string `bson:"_id"`
88+
Links []string `bson:"links"`
89+
}
90+
if err := cursorOutlinks.Decode(&doc); err != nil {
91+
panic(fmt.Sprintf("Could not decode outlink document: %v", err))
92+
}
93+
outlinksCount[doc.ID] = len(doc.Links)
9394
}
9495

95-
// Print the initial page rank values
96-
fmt.Println("Initial Page Rank values:")
97-
for url, rank := range pageRank {
98-
fmt.Printf("Page URL: %s, Page Rank: %f\n", url, rank)
96+
pageRank := make(map[string]float64)
97+
for url := range outlinksCount {
98+
pageRank[url] = 1.0 / float64(count)
9999
}
100100

101-
// Page rank algorithm
102-
// Set the number of iterations
101+
fmt.Printf("Total number of URLs: %d\n", count)
102+
103103
iterations := 10
104-
for range iterations {
105-
// Create a temporary hash map to hold the new page rank values
104+
damping := 0.85
105+
for i := 0; i < iterations; i++ {
106106
newPageRank := make(map[string]float64)
107107

108-
// Calculate the new page rank values
109-
for url, rank := range pageRank {
110-
fmt.Printf("Calculating new page rank for URL: %s | Previous Rank: %v\n", url, rank)
111-
112-
// Get the backlinks for the current URL
113-
var backlinksDoc struct {
114-
Links []string `bson:"links"`
115-
}
116-
117-
// Get the backlinks for the current URL
118-
err := db.Collection("backlinks").FindOne(context.TODO(), bson.D{{Key: "_id", Value: url}}).Decode(&backlinksDoc)
119-
if err != nil {
120-
if err == mongo.ErrNoDocuments {
121-
// No backlinks found for this URL
122-
fmt.Printf("No backlinks found for URL %s\n", url)
123-
} else {
124-
panic(fmt.Sprintf("Could not find backlinks for URL %s: %v", url, err))
125-
}
126-
continue
127-
}
128-
129-
// Get the count of backlinks
130-
backlinksCount := len(backlinksDoc.Links)
131-
fmt.Printf("\tFound %d backlinks for URL: %s\n", backlinksCount, url)
132-
133-
newCumulativeRank := 0.0
134-
135-
// Iterate over the backlinks and calculate the new page rank
136-
for _, backlink := range backlinksDoc.Links {
137-
// Get the outlink document for the specified backlink
138-
var outlinkDoc struct {
139-
Links []string `bson:"links"`
140-
}
141-
142-
// Get the count of outlinks
143-
err := db.Collection("outlinks").FindOne(context.TODO(), bson.D{{Key: "_id", Value: backlink}}).Decode(&outlinkDoc)
144-
if err != nil {
145-
if err == mongo.ErrNoDocuments {
146-
// No outlinks found for this URL
147-
fmt.Printf("No outlinks found for URL %s\n", backlink)
148-
} else {
149-
panic(fmt.Sprintf("Could not find outlinks for URL %s: %v", backlink, err))
108+
for url, _ := range pageRank {
109+
var newCumulativeRank float64
110+
111+
backlinksForUrl, exists := backlinks[url]
112+
if exists {
113+
for _, backlink := range backlinksForUrl {
114+
outlinkCount, ok := outlinksCount[backlink]
115+
if ok {
116+
backlinkRank, ok := pageRank[backlink]
117+
if ok {
118+
newCumulativeRank += backlinkRank / float64(outlinkCount)
119+
}
150120
}
151-
continue
152-
}
153-
outlinksCount := len(outlinkDoc.Links)
154-
// fmt.Printf("\t\tFound %d outlinks for URL: %s\n", outlinksCount, backlink)
155-
156-
// Get the previous page rank value for the backlink
157-
backlinkRank, ok := pageRank[backlink]
158-
if !ok {
159-
// fmt.Printf("No page rank found for backlink %s\n", backlink)
160-
continue
161121
}
162-
// fmt.Printf("\t\t\tBacklink Page Rank: %f\n", backlinkRank)
163-
164-
newCumulativeRank += backlinkRank / float64(outlinksCount)
165122
}
166123

167-
damping := 0.85
168124
newPageRank[url] = (1-damping)/float64(count) + damping*newCumulativeRank
169-
fmt.Println()
170125
}
171126

172-
// Update the page rank values
173127
pageRank = newPageRank
174-
175-
// Print the new page rank values
176-
fmt.Println("New Page Rank values:")
177-
for url, rank := range pageRank {
178-
fmt.Printf("Page URL: %s, Page Rank: %f\n", url, rank)
179-
}
180-
fmt.Println("--------------------------------------------------")
181128
}
182129

183-
// Sort the page rank values by rank
184-
// Create a slice to hold the page rank values
185-
type PageRank struct {
130+
var sortedPageRanks []struct {
186131
URL string
187132
Rank float64
188133
}
189-
var pageRanks []PageRank
190134
for url, rank := range pageRank {
191-
pageRanks = append(pageRanks, PageRank{URL: url, Rank: rank})
135+
sortedPageRanks = append(sortedPageRanks, struct {
136+
URL string
137+
Rank float64
138+
}{url, rank})
192139
}
193-
// Sort the page ranks by rank
194-
sort.Slice(pageRanks, func(i, j int) bool {
195-
return pageRanks[i].Rank > pageRanks[j].Rank
140+
sort.Slice(sortedPageRanks, func(i, j int) bool {
141+
return sortedPageRanks[i].Rank > sortedPageRanks[j].Rank
196142
})
197143

198-
// Print the sorted page rank values
144+
// Print sorted page ranks
199145
fmt.Println("Sorted Page Rank values:")
200-
for _, pageRank := range pageRanks {
146+
for _, pageRank := range sortedPageRanks {
201147
fmt.Printf("Page URL: %s, Page Rank: %f\n", pageRank.URL, pageRank.Rank)
202148
}
203149

204-
// Save the page rank values to the database
205-
for _, pageRank := range pageRanks {
206-
_, err := db.Collection("pagerank").InsertOne(context.TODO(), bson.D{
207-
{Key: "_id", Value: pageRank.URL},
208-
{Key: "rank", Value: pageRank.Rank},
209-
})
150+
var bulkOps []mongo.WriteModel
151+
for _, pageRank := range sortedPageRanks {
152+
bulkOps = append(bulkOps, mongo.NewUpdateOneModel().
153+
SetFilter(bson.D{{Key: "_id", Value: pageRank.URL}}).
154+
SetUpdate(bson.D{
155+
{Key: "$set", Value: bson.D{
156+
{Key: "rank", Value: pageRank.Rank},
157+
}},
158+
}).
159+
SetUpsert(true))
160+
}
161+
162+
// Execute the bulk upsert (one UpdateOne-with-upsert per URL)
163+
if len(bulkOps) > 0 {
164+
_, err := db.Collection("pagerank").BulkWrite(context.TODO(), bulkOps)
210165
if err != nil {
211-
panic(fmt.Sprintf("Could not insert page rank value: %v", err))
166+
panic(fmt.Sprintf("Could not batch insert page rank values: %v", err))
212167
}
213168
}
169+
214170
fmt.Println("Page rank values saved to the database!")
215171
}
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
services:
22
page-rank-service:
3-
# build: .
4-
image: ghcr.io/ionelpopjara/moogle/page-rank:latest
3+
build: .
4+
# image: ghcr.io/ionelpopjara/moogle/page-rank:latest
55
env_file: "variables.env"
66
deploy:
77
replicas: 1

0 commit comments

Comments
 (0)