-
Notifications
You must be signed in to change notification settings - Fork 49
Update profiles parser and related staged changes #156
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,327 @@ | ||
| package parser | ||
|
|
||
import (
	"encoding/json"
	"fmt"
	"log"
	"os"
	"path/filepath"
	"regexp"
	"strings"

	"github.com/UTDNebula/nebula-api/api/schema"
	"go.mongodb.org/mongo-driver/bson/primitive"
)
|
|
||
// profilesRawFileName is the name of the scraped profiles file that
// LoadProfiles looks for inside its input directory.
const profilesRawFileName = "profiles_raw.json"

var (
	// apiPrimaryLocationRegex matches a location written as a building code,
	// whitespace, and a room of the form "<digits>.<3 digits>" with an optional
	// single-letter suffix, e.g. "ECSS 2.123" or "JO 4.614A".
	// Capture groups: 1 = building, 2 = room.
	apiPrimaryLocationRegex = regexp.MustCompile(`^(\w+)\s+(\d+\.\d{3}[A-Za-z]?)$`)

	// apiFallbackLocationRegex matches a location with no separator between the
	// building letters and the room, and with an optional dot inside the room,
	// e.g. "ECSS2.123" or "ECSS2123".
	// Capture groups: 1 = building, 2 = digits before the dot, 3 = the
	// three-digit room number plus optional letter suffix.
	//
	// Both patterns use [A-Za-z] rather than [A-z]: the latter range also covers
	// the ASCII characters between 'Z' and 'a' ('[', '\', ']', '^', '_', '`').
	apiFallbackLocationRegex = regexp.MustCompile(`^([A-Za-z]+)(\d+)\.?(\d{3}[A-Za-z]?)$`)
)
|
|
||
// profileIndexResponse is the top-level JSON object that LoadProfiles decodes
// from the raw profiles file. Only Profile is iterated by the loader; Count is
// decoded but not read in this file.
type profileIndexResponse struct {
	Count   int               `json:"count"`
	Profile []profileIndexRow `json:"profile"`
}
|
|
||
// profileIndexRow is one entry of the "profile" array in the raw profiles file.
//
// How the loader uses it: rows with Public == false are skipped; FirstName and
// LastName are preferred for the professor's name with FullName as a fallback;
// Name is recorded as the first title; URL, then the information URLs, then
// APIURL are tried for the profile URI; ImageURL, then string values found in
// Media, are tried for the image URI. ID, Slug and Areas are decoded but not
// read in this file.
type profileIndexRow struct {
	ID          int                  `json:"id"`
	FullName    string               `json:"full_name"`
	FirstName   string               `json:"first_name"`
	LastName    string               `json:"last_name"`
	Slug        string               `json:"slug"`
	Public      bool                 `json:"public"`
	URL         string               `json:"url"`
	Name        string               `json:"name"`
	ImageURL    string               `json:"image_url"`
	APIURL      string               `json:"api_url"`
	Media       []map[string]any     `json:"media"`
	Information []profileInformation `json:"information"`
	Areas       []profileArea        `json:"areas"`
}
|
|
||
// profileDetailsResponse holds the "information" and "areas" arrays of a
// profile payload.
// NOTE(review): nothing in this file references this type — presumably it is
// meant for a per-profile details endpoint; confirm it is used elsewhere or
// remove it.
type profileDetailsResponse struct {
	Information []profileInformation `json:"information"`
	Areas       []profileArea        `json:"areas"`
}
|
Comment on lines
+43
to
+46
|
||
|
|
||
// profileInformation is one element of a profile's "information" array; the
// useful fields live under its "data" key.
type profileInformation struct {
	Data profileInformationData `json:"data"`
}
|
|
||
// profileInformationData is the "data" payload of a profileInformation entry.
//
// The loader reads Email and Phone for contact details, parses Location into an
// office location, collects the four title fields as professor titles, and
// falls back to the five URL fields for the profile URI. All fifteen fields
// count toward informationScore when choosing the most complete entry.
type profileInformationData struct {
	URL                  string `json:"url"`
	SecondaryURL         string `json:"secondary_url"`
	TertiaryURL          string `json:"tertiary_url"`
	QuaternaryURL        string `json:"quaternary_url"`
	QuinaryURL           string `json:"quinary_url"`
	Email                string `json:"email"`
	Phone                string `json:"phone"`
	Title                string `json:"title"`
	SecondaryTitle       string `json:"secondary_title"`
	TertiaryTitle        string `json:"tertiary_title"`
	DistinguishedTitle   string `json:"distinguished_title"`
	Location             string `json:"location"`
	ProfileSummary       string `json:"profile_summary"`
	AcceptingStudents    string `json:"accepting_students"`
	NotAcceptingStudents string `json:"not_accepting_students"`
}
|
|
||
// profileArea is one element of a profile's "areas" array; its content lives
// under the "data" key. The loader in this file decodes it but does not read it.
type profileArea struct {
	Data profileAreaData `json:"data"`
}
|
|
||
// profileAreaData is the "data" payload of a profileArea entry: a title and a
// free-text description.
type profileAreaData struct {
	Title       string `json:"title"`
	Description string `json:"description"`
}
|
|
||
| // LoadProfiles reads scraped profile API data and populates the package maps. | ||
| func LoadProfiles(inDir string) bool { | ||
| path := fmt.Sprintf("%s/%s", inDir, profilesRawFileName) | ||
| fptr, err := os.Open(path) | ||
| if err != nil { | ||
| return false | ||
| } | ||
| defer fptr.Close() | ||
|
|
||
| var response profileIndexResponse | ||
| if err := json.NewDecoder(fptr).Decode(&response); err != nil { | ||
| log.Printf("Failed to decode profiles JSON: %v", err) | ||
| return false | ||
| } | ||
|
|
||
| loadedCount := 0 | ||
| for _, row := range response.Profile { | ||
| if !row.Public { | ||
| continue | ||
| } | ||
|
|
||
| prof := buildProfessorFromRow(row) | ||
| if prof == nil { | ||
| continue | ||
| } | ||
|
|
||
| professorKey := prof.First_name + prof.Last_name | ||
| if _, exists := Professors[professorKey]; exists { | ||
| continue | ||
| } | ||
| Professors[professorKey] = prof | ||
| ProfessorIDMap[prof.Id] = professorKey | ||
| loadedCount++ | ||
| } | ||
|
|
||
| log.Printf("Loaded %d profiles from %s.", loadedCount, profilesRawFileName) | ||
| return true | ||
| } | ||
|
|
||
| func buildProfessorFromRow(row profileIndexRow) *schema.Professor { | ||
|
|
||
| firstName := strings.TrimSpace(row.FirstName) | ||
| lastName := strings.TrimSpace(row.LastName) | ||
| if firstName == "" || lastName == "" { | ||
| firstName, lastName = splitFullName(row.FullName) | ||
| } | ||
|
|
||
| // Ignore blank names to match the parser's existing professor population behavior. | ||
| if firstName == "" || lastName == "" { | ||
| return nil | ||
| } | ||
|
|
||
| titles := collectTitles(row) | ||
| info := bestInformationData(row.Information) | ||
|
|
||
| prof := &schema.Professor{} | ||
| prof.Id = primitive.NewObjectID() | ||
| prof.First_name = firstName | ||
| prof.Last_name = lastName | ||
| prof.Titles = titles | ||
| prof.Email = strings.TrimSpace(info.Email) | ||
| prof.Phone_number = strings.TrimSpace(info.Phone) | ||
| prof.Office = bestLocation(row.Information) | ||
| prof.Profile_uri = bestProfileURI(row) | ||
| prof.Image_uri = bestImageURI(row) | ||
| prof.Office_hours = []schema.Meeting{} | ||
| prof.Sections = []primitive.ObjectID{} | ||
|
|
||
| return prof | ||
| } | ||
|
|
||
// splitFullName splits a whitespace-separated full name into (first, last).
// The final word is the last name and everything before it, joined by single
// spaces, is the first name. A single word is returned as the first name with
// an empty last name; blank input yields two empty strings.
func splitFullName(fullName string) (string, string) {
	// strings.Fields already discards leading and trailing whitespace.
	words := strings.Fields(fullName)
	switch n := len(words); n {
	case 0:
		return "", ""
	case 1:
		return words[0], ""
	default:
		given := strings.Join(words[:n-1], " ")
		family := words[n-1]
		return given, family
	}
}
|
|
||
| func parseAPILocation(text string) schema.Location { | ||
| normalized := strings.TrimSpace(text) | ||
| if normalized == "" { | ||
| return schema.Location{} | ||
| } | ||
|
|
||
| var building string | ||
| var room string | ||
|
|
||
| submatches := apiPrimaryLocationRegex.FindStringSubmatch(normalized) | ||
| if submatches == nil { | ||
| submatches = apiFallbackLocationRegex.FindStringSubmatch(strings.ReplaceAll(normalized, " ", "")) | ||
| if submatches == nil { | ||
| return schema.Location{} | ||
| } | ||
| building = submatches[1] | ||
| room = fmt.Sprintf("%s.%s", submatches[2], submatches[3]) | ||
| } else { | ||
| building = submatches[1] | ||
| room = submatches[2] | ||
| } | ||
|
|
||
| return schema.Location{ | ||
| Building: building, | ||
| Room: room, | ||
| Map_uri: fmt.Sprintf("https://locator.utdallas.edu/%s_%s", building, room), | ||
| } | ||
| } | ||
|
|
||
| func collectTitles(row profileIndexRow) []string { | ||
| titles := make([]string, 0, 8) | ||
| if row.Name != "" { | ||
| titles = append(titles, strings.TrimSpace(row.Name)) | ||
| } | ||
|
|
||
| for _, info := range row.Information { | ||
| for _, candidate := range []string{info.Data.Title, info.Data.SecondaryTitle, info.Data.TertiaryTitle, info.Data.DistinguishedTitle} { | ||
| trimmed := strings.TrimSpace(candidate) | ||
| if trimmed == "" { | ||
| continue | ||
| } | ||
| if !containsString(titles, trimmed) { | ||
| titles = append(titles, trimmed) | ||
| } | ||
| } | ||
| } | ||
|
|
||
| return titles | ||
| } | ||
|
|
||
| func bestInformationData(items []profileInformation) profileInformationData { | ||
| if len(items) == 0 { | ||
| return profileInformationData{} | ||
| } | ||
|
|
||
| best := items[0].Data | ||
| bestScore := informationScore(best) | ||
|
|
||
| for _, item := range items[1:] { | ||
| score := informationScore(item.Data) | ||
| if score > bestScore { | ||
| best = item.Data | ||
| bestScore = score | ||
| } | ||
| } | ||
|
|
||
| return best | ||
| } | ||
|
|
||
| func informationScore(data profileInformationData) int { | ||
| score := 0 | ||
| for _, value := range []string{ | ||
| data.Email, | ||
| data.Phone, | ||
| data.Location, | ||
| data.URL, | ||
| data.SecondaryURL, | ||
| data.TertiaryURL, | ||
| data.QuaternaryURL, | ||
| data.QuinaryURL, | ||
| data.Title, | ||
| data.SecondaryTitle, | ||
| data.TertiaryTitle, | ||
| data.DistinguishedTitle, | ||
| data.ProfileSummary, | ||
| data.AcceptingStudents, | ||
| data.NotAcceptingStudents, | ||
| } { | ||
| if strings.TrimSpace(value) != "" { | ||
| score++ | ||
| } | ||
| } | ||
|
|
||
| return score | ||
| } | ||
|
|
||
| func bestLocation(items []profileInformation) schema.Location { | ||
| for _, item := range items { | ||
| location := parseAPILocation(item.Data.Location) | ||
| if location.Building != "" || location.Room != "" { | ||
| return location | ||
| } | ||
| } | ||
|
|
||
| return schema.Location{} | ||
| } | ||
|
|
||
| func bestProfileURI(row profileIndexRow) string { | ||
| if trimmed := strings.TrimSpace(row.URL); trimmed != "" { | ||
| return trimmed | ||
| } | ||
|
|
||
| for _, info := range row.Information { | ||
| for _, candidate := range []string{info.Data.URL, info.Data.SecondaryURL, info.Data.TertiaryURL, info.Data.QuaternaryURL, info.Data.QuinaryURL} { | ||
| trimmed := strings.TrimSpace(candidate) | ||
| if trimmed != "" { | ||
| return trimmed | ||
| } | ||
| } | ||
| } | ||
|
|
||
| for _, candidate := range []string{row.APIURL} { | ||
| trimmed := strings.TrimSpace(candidate) | ||
| if trimmed != "" { | ||
| return trimmed | ||
| } | ||
| } | ||
|
|
||
| return "" | ||
| } | ||
|
|
||
| func bestImageURI(row profileIndexRow) string { | ||
| if trimmed := strings.TrimSpace(row.ImageURL); trimmed != "" { | ||
| return trimmed | ||
| } | ||
|
|
||
| for _, media := range row.Media { | ||
| for _, key := range []string{"url", "image_url", "src", "uri"} { | ||
| if raw, exists := media[key]; exists { | ||
| if str, ok := raw.(string); ok { | ||
| trimmed := strings.TrimSpace(str) | ||
| if trimmed != "" { | ||
| return trimmed | ||
| } | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| return "" | ||
| } | ||
|
|
||
| func firstInformationData(items []profileInformation) profileInformationData { | ||
| if len(items) == 0 { | ||
| return profileInformationData{} | ||
| } | ||
| return items[0].Data | ||
| } | ||
|
Comment on lines
+313
to
+318
|
||
|
|
||
// containsString reports whether target is exactly equal to any element of
// values. A nil or empty slice never contains anything.
func containsString(values []string, target string) bool {
	for i := 0; i < len(values); i++ {
		if values[i] != target {
			continue
		}
		return true
	}
	return false
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The location regexes use `[A-z]` / `[A-z]+`, which also match the non-letter ASCII characters that sit between `Z` and `a` (`[`, `\`, `]`, `^`, `_`, and the backtick). This can lead to incorrect matches for building codes and room suffixes. Use `[A-Za-z]` (or a more specific allowed set) instead.