Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,11 @@ Run the tool by changing directory using `cd` to the `api-tools` directory and r
| `./api-tools -scrape -headless` | Runs ChromeDP in headless mode. |
| `./api-tools -o [directory]` | Sets output directory (default: `./data`). |

For profile scraping, you can optionally scope requests by school to reduce API load:
- Set `PROFILE_SCHOOLS` to a comma/semicolon/space-separated list (example: `PROFILE_SCHOOLS=ECS;BBS;AHT`).
- Then run `./api-tools -scrape -profiles` as usual.
- If `PROFILE_SCHOOLS` is not set, the scraper defaults to batched `person` slug requests.

### Parsing Mode:

| Command | Description |
Expand Down
6 changes: 5 additions & 1 deletion parser/profileLoader.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,13 @@ import (
)

func loadProfiles(inDir string) {
if LoadProfiles(inDir) {
return
}

fptr, err := os.Open(fmt.Sprintf("%s/profiles.json", inDir))
if err != nil {
log.Print("Couldn't find/open profiles.json in the input directory. Skipping profile load.")
log.Print("Couldn't find/open profiles_raw.json or profiles.json in the input directory. Skipping profile load.")
return
}

Expand Down
327 changes: 327 additions & 0 deletions parser/profiles.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,327 @@
package parser

import (
"encoding/json"
"fmt"
"log"
"os"
"regexp"
"strings"

"github.com/UTDNebula/nebula-api/api/schema"
"go.mongodb.org/mongo-driver/bson/primitive"
)

// profilesRawFileName is the name of the scraped profile API dump that
// LoadProfiles looks for inside the input directory.
const profilesRawFileName = "profiles_raw.json"

// Patterns used by parseAPILocation to split an office location string into a
// building code and a room number.
//
// The letter classes are spelled [A-Za-z] on purpose: the range [A-z] also
// covers the ASCII punctuation between 'Z' and 'a' ([ \ ] ^ _ `), which would
// let malformed building codes and room suffixes slip through.
var (
	// apiPrimaryLocationRegex matches "<building> <floor>.<3-digit room>" with
	// an optional single trailing letter, e.g. "ECSS 2.123" or "JO 4.614A".
	// Capture 1 is the building, capture 2 is the full room ("2.123").
	apiPrimaryLocationRegex = regexp.MustCompile(`^(\w+)\s+(\d+\.\d{3}[A-Za-z]?)$`)

	// apiFallbackLocationRegex is applied to the location text with its spaces
	// removed and tolerates a missing dot, e.g. "ECSS2.123" or "ECSS2123".
	// Capture 1 is the building, capture 2 the floor digits, capture 3 the
	// three-digit room with its optional letter suffix.
	apiFallbackLocationRegex = regexp.MustCompile(`^([A-Za-z]+)(\d+)\.?([\d]{3}[A-Za-z]?)$`)
)
Comment on lines +18 to +20
Copy link

Copilot AI Apr 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The location regexes use [A-z]/[A-z]+, which also matches non-letter ASCII characters (e.g., [, \, ], ^, _, `). This can lead to incorrect matches for building codes / room suffixes. Use [A-Za-z] (or a more specific allowed set) instead.

Copilot uses AI. Check for mistakes.

// profileIndexResponse is the top-level JSON document that LoadProfiles
// decodes from the raw profiles file.
type profileIndexResponse struct {
// Count is decoded from the "count" key; it is not read by the code in this file.
Count int `json:"count"`
// Profile holds one row per profile; LoadProfiles iterates over it.
Profile []profileIndexRow `json:"profile"`
}

// profileIndexRow is a single profile entry inside profileIndexResponse.
// buildProfessorFromRow converts one row into a schema.Professor.
type profileIndexRow struct {
// ID and Slug are decoded but not read by the code in this file.
ID int `json:"id"`
// FullName is only consulted (via splitFullName) when FirstName or LastName is blank.
FullName string `json:"full_name"`
FirstName string `json:"first_name"`
LastName string `json:"last_name"`
Slug string `json:"slug"`
// Public gates loading: LoadProfiles skips rows where it is false.
Public bool `json:"public"`
// URL is the first choice for the professor's profile URI (see bestProfileURI).
URL string `json:"url"`
// Name is placed first in the professor's title list by collectTitles.
Name string `json:"name"`
// ImageURL is the first choice for the professor's image URI (see bestImageURI).
ImageURL string `json:"image_url"`
// APIURL is the last-resort profile URI when no other URL is available.
APIURL string `json:"api_url"`
// Media is kept loosely typed; bestImageURI probes each entry for a string
// under the keys "url", "image_url", "src" and "uri".
Media []map[string]any `json:"media"`
// Information supplies contact details, titles, URLs and office location.
Information []profileInformation `json:"information"`
// Areas is decoded but not read by the code in this file.
Areas []profileArea `json:"areas"`
}

// profileDetailsResponse carries the "information" and "areas" arrays on their
// own, without the identifying fields of profileIndexRow.
// NOTE(review): nothing visible in this file references this type — confirm
// whether it is still needed.
type profileDetailsResponse struct {
Information []profileInformation `json:"information"`
Areas []profileArea `json:"areas"`
}
Comment on lines +43 to +46
Copy link

Copilot AI Apr 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

profileDetailsResponse is declared but never used in this package. Consider removing it (or using it) to avoid accumulating dead/unused types and reduce maintenance overhead.

Copilot uses AI. Check for mistakes.

// profileInformation is one element of a profile's "information" array; the
// useful fields are nested under its "data" key.
type profileInformation struct {
Data profileInformationData `json:"data"`
}

// profileInformationData is the "data" payload of a profileInformation entry.
// Every field is decoded as a plain string; informationScore counts how many
// of them are non-blank to rank competing entries.
type profileInformationData struct {
// URL through QuinaryURL are tried in this order by bestProfileURI when the
// row itself has no URL.
URL string `json:"url"`
SecondaryURL string `json:"secondary_url"`
TertiaryURL string `json:"tertiary_url"`
QuaternaryURL string `json:"quaternary_url"`
QuinaryURL string `json:"quinary_url"`
// Email and Phone are copied (trimmed) onto the professor record.
Email string `json:"email"`
Phone string `json:"phone"`
// Title, SecondaryTitle, TertiaryTitle and DistinguishedTitle are gathered
// by collectTitles.
Title string `json:"title"`
SecondaryTitle string `json:"secondary_title"`
TertiaryTitle string `json:"tertiary_title"`
DistinguishedTitle string `json:"distinguished_title"`
// Location is the office location text handed to parseAPILocation.
Location string `json:"location"`
// The remaining fields are only used for scoring in this file.
ProfileSummary string `json:"profile_summary"`
AcceptingStudents string `json:"accepting_students"`
NotAcceptingStudents string `json:"not_accepting_students"`
}

// profileArea is one element of a profile's "areas" array; its content is
// nested under the "data" key.
type profileArea struct {
Data profileAreaData `json:"data"`
}

// profileAreaData is the "data" payload of a profileArea entry: a title and a
// description, both decoded as plain strings.
type profileAreaData struct {
Title string `json:"title"`
Description string `json:"description"`
}

// LoadProfiles ingests the scraped profile API dump (profiles_raw.json) found
// in inDir and registers every usable professor in the package-level
// Professors and ProfessorIDMap maps.
//
// It returns false when the file cannot be opened or its JSON cannot be
// decoded (the decode failure is logged), and true otherwise, even if no
// profile ends up being added. Rows that are not public, rows for which
// buildProfessorFromRow yields nil, and rows whose first+last name key is
// already present in Professors are skipped.
func LoadProfiles(inDir string) bool {
	rawFile, openErr := os.Open(fmt.Sprintf("%s/%s", inDir, profilesRawFileName))
	if openErr != nil {
		return false
	}
	defer rawFile.Close()

	var index profileIndexResponse
	decodeErr := json.NewDecoder(rawFile).Decode(&index)
	if decodeErr != nil {
		log.Printf("Failed to decode profiles JSON: %v", decodeErr)
		return false
	}

	added := 0
	for i := range index.Profile {
		entry := index.Profile[i]
		if !entry.Public {
			continue
		}

		professor := buildProfessorFromRow(entry)
		if professor == nil {
			continue
		}

		// First occurrence wins: an existing entry under the same name key is
		// never overwritten.
		key := professor.First_name + professor.Last_name
		if _, seen := Professors[key]; !seen {
			Professors[key] = professor
			ProfessorIDMap[professor.Id] = key
			added++
		}
	}

	log.Printf("Loaded %d profiles from %s.", added, profilesRawFileName)
	return true
}

// buildProfessorFromRow converts one profile row into a freshly allocated
// schema.Professor with a new ObjectID.
//
// The name comes from the row's first/last name fields; if either is blank
// after trimming, both are re-derived from the full name. A nil pointer is
// returned when a first or last name still cannot be determined. Office hours
// and sections are initialised to empty (non-nil) slices.
func buildProfessorFromRow(row profileIndexRow) *schema.Professor {
	first, last := strings.TrimSpace(row.FirstName), strings.TrimSpace(row.LastName)
	if first == "" || last == "" {
		first, last = splitFullName(row.FullName)
	}

	// Rows without a complete name are dropped, mirroring how the parser
	// populates professors elsewhere.
	if first == "" || last == "" {
		return nil
	}

	titleList := collectTitles(row)
	contact := bestInformationData(row.Information)

	professor := new(schema.Professor)
	professor.Id = primitive.NewObjectID()

	// Identity.
	professor.First_name = first
	professor.Last_name = last
	professor.Titles = titleList

	// Contact details come from the most complete information entry.
	professor.Email = strings.TrimSpace(contact.Email)
	professor.Phone_number = strings.TrimSpace(contact.Phone)
	professor.Office = bestLocation(row.Information)

	// Links.
	professor.Profile_uri = bestProfileURI(row)
	professor.Image_uri = bestImageURI(row)

	// Filled in later by other parts of the parser; start out empty.
	professor.Office_hours = []schema.Meeting{}
	professor.Sections = []primitive.ObjectID{}

	return professor
}

// splitFullName breaks a whitespace-separated full name into (first, last).
// The final word becomes the last name and every preceding word, joined by
// single spaces, becomes the first name. A one-word name is returned as the
// first name with an empty last name; blank input yields two empty strings.
func splitFullName(fullName string) (string, string) {
	words := strings.Fields(strings.TrimSpace(fullName))

	switch n := len(words); n {
	case 0:
		return "", ""
	case 1:
		return words[0], ""
	default:
		lastIdx := n - 1
		return strings.Join(words[:lastIdx], " "), words[lastIdx]
	}
}

// parseAPILocation turns an office location string from the profile API into
// a schema.Location.
//
// The trimmed text is first matched against apiPrimaryLocationRegex; if that
// fails, all spaces are removed and apiFallbackLocationRegex is tried, with
// the room rebuilt as "<floor>.<room>". Blank or unrecognised input yields the
// zero Location. On success the map URI is the locator URL for
// "<building>_<room>".
func parseAPILocation(text string) schema.Location {
	trimmed := strings.TrimSpace(text)
	if trimmed == "" {
		return schema.Location{}
	}

	var bldg, rm string
	if primary := apiPrimaryLocationRegex.FindStringSubmatch(trimmed); primary != nil {
		bldg, rm = primary[1], primary[2]
	} else {
		compact := strings.ReplaceAll(trimmed, " ", "")
		fallback := apiFallbackLocationRegex.FindStringSubmatch(compact)
		if fallback == nil {
			return schema.Location{}
		}
		bldg = fallback[1]
		rm = fmt.Sprintf("%s.%s", fallback[2], fallback[3])
	}

	mapURI := fmt.Sprintf("https://locator.utdallas.edu/%s_%s", bldg, rm)
	return schema.Location{Building: bldg, Room: rm, Map_uri: mapURI}
}

// collectTitles gathers the distinct, non-blank titles for a profile row.
//
// The row's Name comes first (when non-blank), followed by each information
// entry's Title, SecondaryTitle, TertiaryTitle and DistinguishedTitle in that
// order. Every value is trimmed; blanks and values already collected are
// skipped. The result is never nil.
func collectTitles(row profileIndexRow) []string {
	titles := make([]string, 0, 8)

	// Trim before testing for emptiness so a whitespace-only name does not
	// end up as an empty title.
	if name := strings.TrimSpace(row.Name); name != "" {
		titles = append(titles, name)
	}

	for _, info := range row.Information {
		for _, candidate := range []string{info.Data.Title, info.Data.SecondaryTitle, info.Data.TertiaryTitle, info.Data.DistinguishedTitle} {
			trimmed := strings.TrimSpace(candidate)
			if trimmed == "" || containsString(titles, trimmed) {
				continue
			}
			titles = append(titles, trimmed)
		}
	}

	return titles
}

// bestInformationData returns the data payload of the information entry with
// the highest informationScore. Ties go to the earliest entry, and an empty
// slice yields the zero value.
func bestInformationData(items []profileInformation) profileInformationData {
	var winner profileInformationData

	// Scores are never negative, so -1 guarantees the first entry is taken and
	// later entries only replace it when strictly better.
	topScore := -1
	for i := range items {
		if current := informationScore(items[i].Data); current > topScore {
			winner, topScore = items[i].Data, current
		}
	}

	return winner
}

// informationScore reports how many of the fifteen string fields of data
// contain something other than whitespace. bestInformationData uses it to
// rank information entries by completeness.
func informationScore(data profileInformationData) int {
	fields := [...]string{
		data.Email,
		data.Phone,
		data.Location,
		data.URL,
		data.SecondaryURL,
		data.TertiaryURL,
		data.QuaternaryURL,
		data.QuinaryURL,
		data.Title,
		data.SecondaryTitle,
		data.TertiaryTitle,
		data.DistinguishedTitle,
		data.ProfileSummary,
		data.AcceptingStudents,
		data.NotAcceptingStudents,
	}

	filled := 0
	for i := 0; i < len(fields); i++ {
		if len(strings.TrimSpace(fields[i])) > 0 {
			filled++
		}
	}
	return filled
}

// bestLocation returns the first office location, in entry order, that
// parseAPILocation can parse into a building or a room. If no entry parses,
// the zero Location is returned.
func bestLocation(items []profileInformation) schema.Location {
	for i := range items {
		parsed := parseAPILocation(items[i].Data.Location)
		if parsed.Building == "" && parsed.Room == "" {
			continue
		}
		return parsed
	}

	return schema.Location{}
}

// bestProfileURI picks the profile link for a row: the first non-blank value,
// after trimming, among the row's URL, then each information entry's URL,
// SecondaryURL, TertiaryURL, QuaternaryURL and QuinaryURL (in entry order),
// and finally the row's APIURL. It returns "" when all of them are blank.
func bestProfileURI(row profileIndexRow) string {
	// Lay the candidates out in priority order, then take the first usable one.
	candidates := make([]string, 0, 2+5*len(row.Information))
	candidates = append(candidates, row.URL)
	for i := range row.Information {
		d := row.Information[i].Data
		candidates = append(candidates, d.URL, d.SecondaryURL, d.TertiaryURL, d.QuaternaryURL, d.QuinaryURL)
	}
	candidates = append(candidates, row.APIURL)

	for _, candidate := range candidates {
		if uri := strings.TrimSpace(candidate); uri != "" {
			return uri
		}
	}
	return ""
}

// bestImageURI picks the image link for a row. The row's ImageURL wins when it
// is non-blank; otherwise each media entry is probed, in order, for a
// non-blank string under the keys "url", "image_url", "src" and "uri". The
// result is trimmed, and "" is returned when nothing usable is found.
func bestImageURI(row profileIndexRow) string {
	if direct := strings.TrimSpace(row.ImageURL); direct != "" {
		return direct
	}

	keys := [...]string{"url", "image_url", "src", "uri"}
	for _, entry := range row.Media {
		for _, key := range keys {
			// A missing key yields nil, which fails the string assertion just
			// like a non-string value does.
			text, isString := entry[key].(string)
			if !isString {
				continue
			}
			if text = strings.TrimSpace(text); text != "" {
				return text
			}
		}
	}

	return ""
}

// firstInformationData returns the data payload of the first information
// entry, or the zero value when the slice is empty.
func firstInformationData(items []profileInformation) profileInformationData {
	var first profileInformationData
	if len(items) > 0 {
		first = items[0].Data
	}
	return first
}
Comment on lines +313 to +318
Copy link

Copilot AI Apr 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

firstInformationData is never called. If it’s not needed, removing it will keep the parser surface area smaller and make future refactors less error-prone.

Copilot uses AI. Check for mistakes.

// containsString reports whether target occurs in values, using exact
// (case-sensitive) string comparison. A nil or empty slice contains nothing.
func containsString(values []string, target string) bool {
	for i := 0; i < len(values); i++ {
		if values[i] == target {
			return true
		}
	}
	return false
}
Loading
Loading